conflict resolution in the comparator test class

2024-11-18 14:59:30 +01:00 · 2024-11-18 14:59:30 +01:00 · c97facf5e6
parent 6c17993d16 cf7d9a32ab
commit c97facf5e6
91 changed files with 1954 additions and 804 deletions
--- a/.gitignore
+++ b/.gitignore
@ -28,3 +28,4 @@ spark-warehouse
 /**/.scalafmt.conf
 /.java-version
 /dhp-shade-package/dependency-reduced-pom.xml
+/**/job.properties
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;

 import java.util.Arrays;
 import java.util.Iterator;
@ -61,7 +61,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
 	private Relation getRelation(String orcid1, String orcid2) {
 		String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
 		String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
-		return OafMapperUtils
+		Relation relation = OafMapperUtils
 			.getRelation(
 				source, target, ModelConstants.PERSON_PERSON_RELTYPE,
 				ModelConstants.PERSON_PERSON_SUBRELTYPE,
@ -76,5 +76,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
 								ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 						"0.91"),
 				null);
+		relation.setValidated(true);
+		return relation;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java
@ -1,12 +1,9 @@

-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;

 import java.io.Serializable;
-import java.util.ArrayList;
 import java.util.List;

-import eu.dnetlib.dhp.schema.oaf.Relation;
-
 public class Coauthors implements Serializable {
 	private List<String> coauthors;

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
@ -2,8 +2,7 @@
 package eu.dnetlib.dhp.oa.merge;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.when;
+import static org.apache.spark.sql.functions.*;

 import java.util.Map;
 import java.util.Optional;
@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob {
 					.applyCoarVocabularies(entity, vocs),
 				OAFENTITY_KRYO_ENC)
 			.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
-			.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
+			.mapGroups(
+				(MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
+				OAFENTITY_KRYO_ENC)
 			.map(
 				(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
 					t.getClass().getName(), t),
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@ -2,7 +2,6 @@
 package eu.dnetlib.dhp.schema.oaf.utils;

 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;

 import java.net.MalformedURLException;
@ -363,6 +362,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 				// nothing to clean here
 			} else if (value instanceof Project) {
 				// nothing to clean here
+			} else if (value instanceof Person) {
+				// nothing to clean here
 			} else if (value instanceof Organization) {
 				Organization o = (Organization) value;
 				if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
@ -694,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 						}
 					}

+					// set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
 					for (Author a : r.getAuthor()) {
 						if (Objects.isNull(a.getPid())) {
 							a.setPid(Lists.newArrayList());
@ -750,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 										.collect(Collectors.toList()));
 						}
 					}
+
+					// Identify clashing ORCIDs, i.e. the same ORCID associated with multiple authors in this result
+					Map<String, Integer> clashing_orcid = new HashMap<>();
+
+					for (Author a : r.getAuthor()) {
+						a
+							.getPid()
+							.stream()
+							.filter(
+								p -> StringUtils
+									.contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
+							.map(StructuredProperty::getValue)
+							.distinct()
+							.forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
+					}
+
+					Set<String> clashing = clashing_orcid
+						.entrySet()
+						.stream()
+						.filter(ee -> ee.getValue() > 1)
+						.map(Map.Entry::getKey)
+						.collect(Collectors.toSet());
+
+					// filter out clashing orcids
+					for (Author a : r.getAuthor()) {
+						a
+							.setPid(
+								a
+									.getPid()
+									.stream()
+									.filter(p -> !clashing.contains(p.getValue()))
+									.collect(Collectors.toList()));
+					}
+
 				}
 				if (value instanceof Publication) {

@ -808,7 +844,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 		return author;
 	}

-	private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
+	public static Optional<String> cleanDateField(Field<String> dateofacceptance) {
 		return Optional
 			.ofNullable(dateofacceptance)
 			.map(Field::getValue)
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
@ -204,6 +204,7 @@ public class IdentifierFactory implements Serializable {
 			.map(
 				pp -> pp
 					.stream()
+					.filter(p -> StringUtils.isNotBlank(p.getValue()))
 					// filter away PIDs provided by a DS that is not considered an authority for the
 					// given PID Type
 					.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@ -23,24 +23,30 @@ import org.apache.commons.lang3.tuple.Pair;
 import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.base.Joiner;

+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.common.AccessRightComparator;
+import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;

 public class MergeUtils {

-	public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
-		return mergeGroup(s, oafEntityIterator, true);
+	public static <T extends Oaf> T mergeById(Iterator<T> oafEntityIterator, VocabularyGroup vocs) {
+		return mergeGroup(oafEntityIterator, true, vocs);
 	}

-	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
-		return mergeGroup(s, oafEntityIterator, false);
+	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator) {
+		return mergeGroup(oafEntityIterator, false);
 	}

-	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
-		boolean checkDelegateAuthority) {
+	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator, boolean checkDelegateAuthority) {
+		return mergeGroup(oafEntityIterator, checkDelegateAuthority, null);
+	}
+
+	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
+		boolean checkDelegateAuthority, VocabularyGroup vocs) {

 		ArrayList<T> sortedEntities = new ArrayList<>();
 		oafEntityIterator.forEachRemaining(sortedEntities::add);
@ -49,13 +55,55 @@ public class MergeUtils {
 		Iterator<T> it = sortedEntities.iterator();
 		T merged = it.next();

-		while (it.hasNext()) {
-			merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
+		if (!it.hasNext() && merged instanceof Result && vocs != null) {
+			return enforceResultType(vocs, (Result) merged);
+		} else {
+			while (it.hasNext()) {
+				merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
+			}
 		}
-
 		return merged;
 	}

+	private static <T extends Oaf> T enforceResultType(VocabularyGroup vocs, Result mergedResult) {
+		if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) {
+			return (T) mergedResult;
+		} else {
+			final Instance i = mergedResult.getInstance().get(0);
+
+			if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
+				return (T) mergedResult;
+			} else {
+				final String expectedResultType = Optional
+					.ofNullable(
+						vocs
+							.lookupTermBySynonym(
+								ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()))
+					.orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE)
+					.getClassid();
+
+				// there is a clash among the result types
+				if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) {
+
+					Result result = (Result) Optional
+						.ofNullable(ModelSupport.oafTypes.get(expectedResultType))
+						.map(r -> {
+							try {
+								return r.newInstance();
+							} catch (InstantiationException | IllegalAccessException e) {
+								throw new IllegalStateException(e);
+							}
+						})
+						.orElse(new OtherResearchProduct());
+					result.setId(mergedResult.getId());
+					return (T) mergeResultFields(result, mergedResult);
+				} else {
+					return (T) mergedResult;
+				}
+			}
+		}
+	}
+
 	public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
 		return (T) merge(left, right, checkDelegateAuthority);
 	}
@ -106,7 +154,7 @@ public class MergeUtils {
 				return mergeSoftware((Software) left, (Software) right);
 			}

-			return mergeResultFields((Result) left, (Result) right);
+			return left;
 		} else if (sameClass(left, right, Datasource.class)) {
 			// TODO
 			final int trust = compareTrust(left, right);
@ -654,16 +702,9 @@ public class MergeUtils {
 	}

 	private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
-		if (d1 == null || StringUtils.isBlank(d1.getValue())) {
+		if (!GraphCleaningFunctions.cleanDateField(d1).isPresent()) {
 			return d2;
-		} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
-			return d1;
-		}
-
-		if (StringUtils.contains(d1.getValue(), "null")) {
-			return d2;
-		}
-		if (StringUtils.contains(d2.getValue(), "null")) {
+		} else if (!GraphCleaningFunctions.cleanDateField(d2).isPresent()) {
 			return d1;
 		}

@ -715,7 +756,11 @@ public class MergeUtils {
 	private static String spKeyExtractor(StructuredProperty sp) {
 		return Optional
 			.ofNullable(sp)
-			.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
+			.map(
+				s -> Joiner
+					.on("||")
+					.useForNull("")
+					.join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
 			.orElse(null);
 	}

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java
@ -1,6 +1,12 @@

 package eu.dnetlib.dhp.schema.oaf.utils;

+import java.util.Map;
+
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
 public class ModelHardLimits {

 	private ModelHardLimits() {
@ -12,6 +18,7 @@ public class ModelHardLimits {

 	public static final int MAX_EXTERNAL_ENTITIES = 50;
 	public static final int MAX_AUTHORS = 200;
+	public static final int MAX_RELATED_AUTHORS = 20;
 	public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
 	public static final int MAX_TITLE_LENGTH = 5000;
 	public static final int MAX_TITLES = 10;
@ -19,6 +26,12 @@ public class ModelHardLimits {
 	public static final int MAX_ABSTRACT_LENGTH = 150000;
 	public static final int MAX_RELATED_ABSTRACT_LENGTH = 500;
 	public static final int MAX_INSTANCES = 10;
+	public static final Map<String, Long> MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap();
+
+	static {
+		MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500L);
+		MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500L);
+	}

 	public static String getCollectionName(String format) {
 		return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java
@ -26,7 +26,7 @@ public class PidCleaner {
 		String value = Optional
 			.ofNullable(pidValue)
 			.map(String::trim)
-			.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
+			.orElseThrow(() -> new IllegalArgumentException("PID (" + pidType + ") value cannot be empty"));

 		switch (pidType) {

--- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
@ -179,7 +179,7 @@ class OafMapperUtilsTest {
 		assertEquals(
 			ModelConstants.DATASET_RESULTTYPE_CLASSID,
 			((Result) MergeUtils
-				.merge(p2, d1))
+				.merge(p2, d1, true))
 					.getResulttype()
 					.getClassid());
 	}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction

 	@Override
 	protected Collection<String> doApply(Config conf, String s) {
-		return suffixPrefixChain(cleanup(s), param("mod"));
+		return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
 	}

 	private Collection<String> suffixPrefixChain(String s, int mod) {
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@ -90,7 +90,7 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
 		inferFrom = normalize(inferFrom);
 		inferFrom = filterAllStopWords(inferFrom);
 		Set<String> cities = getCities(inferFrom, 4);
-		return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
+		return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
 	}

 	public static String cityInference(String original) {
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@ -54,6 +54,22 @@ public class FieldDef implements Serializable {
 	public FieldDef() {
 	}

+	public FieldDef clone() {
+		FieldDef fieldDef = new FieldDef();
+		fieldDef.setName(this.name);
+		fieldDef.setPath(this.path);
+		fieldDef.setType(this.type);
+		fieldDef.setOverrideMatch(this.overrideMatch);
+		fieldDef.setSize(this.size);
+		fieldDef.setLength(this.length);
+		fieldDef.setFilter(this.filter);
+		fieldDef.setSorted(this.sorted);
+		fieldDef.setClean(this.clean);
+		fieldDef.setInfer(this.infer);
+		fieldDef.setInferenceFrom(this.inferenceFrom);
+		return fieldDef;
+	}
+
 	public String getInferenceFrom() {
 		return inferenceFrom;
 	}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
  val model: SparkModel = SparkModel(conf)

  val dedup: (Dataset[Row] => Dataset[Row]) = df => {
-    df.transform(filterAndCleanup)
-      .transform(generateClustersWithCollect)
+    df.transform(generateClustersWithCollect)
      .transform(processBlocks)
  }

-
-  val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
-    val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
-      if (conf.blacklists.containsKey(fdef.getName)) {
-        res.withColumn(
-          fdef.getName + "_filtered",
-          filterColumnUDF(fdef).apply(new Column(fdef.getName))
-        )
-      } else {
-        res
-      }
-    })
-
-    df_with_filters
-  }
-
-  def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
-    val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
-
-    if (blacklist == null) {
-      throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
-    } else {
-      fdef.getType match {
-        case Type.List | Type.JSON =>
-          udf[Array[String], Array[String]](values => {
-            values.filter((v: String) => !blacklist.test(v))
-          })
-
-        case _ =>
-          udf[String, String](v => {
-            if (blacklist.test(v)) ""
-            else v
-          })
-      }
-    }
-  }
-
  val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
    var df_with_clustering_keys: Dataset[Row] = null

--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@ -5,12 +5,12 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
 import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
 import org.apache.spark.sql.{Dataset, Row}

 import java.util.Locale
+import java.util.function.Predicate
 import java.util.regex.Pattern
 import scala.collection.JavaConverters._

@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
    identifier.setName(identifierFieldName)
    identifier.setType(Type.String)

+    // create fields for blacklist
+    val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
+      if (conf.blacklists().containsKey(fdef.getName)) {
+        val fdef_filtered = fdef.clone()
+        fdef_filtered.setName(fdef.getName + "_filtered")
+        Seq(fdef, fdef_filtered)
+      }
+      else {
+        Seq(fdef)
+      }
+    })
+
    // Construct a Spark StructType representing the schema of the model
-    (Seq(identifier) ++ conf.getPace.getModel.asScala)
+    (Seq(identifier) ++ filtered)
      .foldLeft(
        new StructType()
      )((resType, fieldDef) => {
@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
        })
      })

-
  }

  val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
@ -52,7 +63,8 @@ case class SparkModel(conf: DedupConfig) {
  val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

  val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
+    df
+      .map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
  }

  def rowFromJson(json: String): Row = {
@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {

    schema.fieldNames.zipWithIndex.foldLeft(values) {
      case ((res, (fname, index))) =>
-        val fdef = conf.getPace.getModelMap.get(fname)
+
+        val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))

        if (fdef != null) {
-          res(index) = fdef.getType match {
-            case Type.String | Type.Int =>
-              MapDocumentUtil.truncateValue(
-                MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
-                fdef.getLength
-              )
+          if (!fname.contains("_filtered")) { //process fields with no blacklist
+            res(index) = fdef.getType match {
+              case Type.String | Type.Int =>
+                MapDocumentUtil.truncateValue(
+                  MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+                  fdef.getLength
+                )

-            case Type.URL =>
-              var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
-              if (!URL_REGEX.matcher(uv).matches)
-                uv = ""
-              uv
+              case Type.URL =>
+                var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
+                if (!URL_REGEX.matcher(uv).matches)
+                  uv = ""
+                uv

-            case Type.List | Type.JSON =>
-              MapDocumentUtil.truncateList(
-                MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
-                fdef.getSize
-              ).asScala
+              case Type.List | Type.JSON =>
+                MapDocumentUtil.truncateList(
+                  MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+                  fdef.getSize
+                ).asScala

-            case Type.StringConcat =>
-              val jpaths = CONCAT_REGEX.split(fdef.getPath)
+              case Type.StringConcat =>
+                val jpaths = CONCAT_REGEX.split(fdef.getPath)

-              MapDocumentUtil.truncateValue(
-                jpaths
-                  .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
-                  .mkString(" "),
-                fdef.getLength
-              )
+                MapDocumentUtil.truncateValue(
+                  jpaths
+                    .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
+                    .mkString(" "),
+                  fdef.getLength
+                )

-            case Type.DoubleArray =>
-              MapDocumentUtil.getJPathArray(fdef.getPath, json)
+              case Type.DoubleArray =>
+                MapDocumentUtil.getJPathArray(fdef.getPath, json)
+            }
          }
+          else { //process fields with blacklist
+            val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
+
+            res(index) = fdef.getType match {
+              case Type.List | Type.JSON =>
+                MapDocumentUtil.truncateList(
+                  MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+                  fdef.getSize
+                ).asScala.filter((v: String) => !blacklist.test(v))
+
+              case _ =>
+                val value: String = MapDocumentUtil.truncateValue(
+                  MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+                  fdef.getLength
+                )
+                if (blacklist.test(value)) "" else value
+            }
+          }
+

          val filter = fdef.getFilter

@ -125,13 +159,12 @@ case class SparkModel(conf: DedupConfig) {
          }

          if (StringUtils.isNotBlank(fdef.getInfer)) {
-            val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
+            val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
            res(index) = res(index) match {
              case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
              case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
            }
          }
-
        }

        res
@ -139,6 +172,7 @@ case class SparkModel(conf: DedupConfig) {
    }

    new GenericRowWithSchema(values, schema)
+
  }

  def clean(value: String, cleantype: String) : String = {
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@ -227,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println(cf.apply(conf, Lists.newArrayList(s)));
 	}

+	@Test
+	public void testNumAuthorsTitleSuffixPrefixChain() {
+
+		final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
+		params.put("mod", 10);
+
+		final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
+		final String num_authors = "10";
+		System.out.println("title = " + title);
+		System.out.println("num_authors = " + num_authors);
+		System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
+	}
+
 }
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
@ -1,8 +1,7 @@

 package eu.dnetlib.pace.common;

-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;

 import org.junit.jupiter.api.*;

@ -54,8 +53,17 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
 		System.out.println("Fixed aliases  : " + fixAliases(TEST_STRING));
 	}

+	@Test()
+	public void countryInferenceTest_NPE() {
+		assertThrows(
+			NullPointerException.class,
+			() -> countryInference("UNKNOWN", null),
+			"Expected countryInference() to throw an NPE");
+	}
+
 	@Test
 	public void countryInferenceTest() {
+		assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
 		assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
 		assertEquals("UK", countryInference("UK", "Università di Bologna"));
 		assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@ -367,7 +367,18 @@ public class ComparatorTest extends AbstractPaceTest {

 		result = dateRange.distance("invalid date", "2021-05-02", conf);
 		assertEquals(-1.0, result);
+	}

+	@Test
+	public void titleVersionMatchTest() {
+
+		TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
+
+		double result = titleVersionMatch
+			.compare(
+				"parp 2 regulates sirt 1 expression and whole body energy expenditure",
+				"parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
+		assertEquals(1.0, result);
 	}

 }
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
-import jdk.nashorn.internal.ir.annotations.Ignore;

 public class UtilTest {

--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
@ -151,12 +151,17 @@ public class PromoteActionPayloadForGraphTableJob {
 		SparkSession spark, String path, Class<G> rowClazz) {
 		logger.info("Reading graph table from path: {}", path);

-		return spark
-			.read()
-			.textFile(path)
-			.map(
-				(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
-				Encoders.bean(rowClazz));
+		if (HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) {
+			return spark
+				.read()
+				.textFile(path)
+				.map(
+					(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
+					Encoders.bean(rowClazz));
+		} else {
+			logger.info("Found empty graph table from path: {}", path);
+			return spark.emptyDataset(Encoders.bean(rowClazz));
+		}
 	}

 	private static <A extends Oaf> Dataset<A> readActionPayload(
@ -223,7 +228,7 @@ public class PromoteActionPayloadForGraphTableJob {
 				rowClazz,
 				actionPayloadClazz);

-		if (shouldGroupById) {
+		if (Boolean.TRUE.equals(shouldGroupById)) {
 			return PromoteActionPayloadFunctions
 				.groupGraphTableByIdAndMerge(
 					joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
@ -250,6 +255,8 @@ public class PromoteActionPayloadForGraphTableJob {
 				return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation());
 			case "eu.dnetlib.dhp.schema.oaf.Software":
 				return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software());
+			case "eu.dnetlib.dhp.schema.oaf.Person":
+				return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Person());
 			default:
 				throw new RuntimeException("unknown class: " + clazz.getCanonicalName());
 		}
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
@ -50,7 +50,7 @@ public class PromoteActionPayloadFunctions {
 		PromoteAction.Strategy promoteActionStrategy,
 		Class<G> rowClazz,
 		Class<A> actionPayloadClazz) {
-		if (!isSubClass(rowClazz, actionPayloadClazz)) {
+		if (Boolean.FALSE.equals(isSubClass(rowClazz, actionPayloadClazz))) {
 			throw new RuntimeException(
 				"action payload type must be the same or be a super type of table row type");
 		}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt
@ -7,3 +7,4 @@ promote_action_payload_for_project_table classpath eu/dnetlib/dhp/actionmanager/
 promote_action_payload_for_publication_table classpath eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app
 promote_action_payload_for_relation_table classpath eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app
 promote_action_payload_for_software_table classpath eu/dnetlib/dhp/actionmanager/wf/software/oozie_app
+promote_action_payload_for_person_table classpath eu/dnetlib/dhp/actionmanager/wf/person/oozie_app
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
@ -148,6 +148,7 @@
        <path start="PromoteActionPayloadForPublicationTable"/>
        <path start="PromoteActionPayloadForRelationTable"/>
        <path start="PromoteActionPayloadForSoftwareTable"/>
+        <path start="PromoteActionPayloadForPersonTable"/>
    </fork>

    <action name="PromoteActionPayloadForDatasetTable">
@ -270,6 +271,21 @@
        <error to="Kill"/>
    </action>

+    <action name="PromoteActionPayloadForPersonTable">
+        <sub-workflow>
+            <app-path>${wf:appPath()}/promote_action_payload_for_person_table</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>inputActionPayloadRootPath</name>
+                    <value>${workingDir}/action_payload_by_type</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="JoinPromote"/>
+        <error to="Kill"/>
+    </action>
+
    <join name="JoinPromote" to="End"/>

    <end name="End"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml
@ -0,0 +1,129 @@
+<workflow-app name="promote_action_payload_for_person_table" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>activePromotePersonActionPayload</name>
+            <description>when true will promote actions with eu.dnetlib.dhp.schema.oaf.Person payload</description>
+        </property>
+        <property>
+            <name>inputGraphRootPath</name>
+            <description>root location of input materialized graph</description>
+        </property>
+        <property>
+            <name>inputActionPayloadRootPath</name>
+            <description>root location of action payloads to promote</description>
+        </property>
+        <property>
+            <name>outputGraphRootPath</name>
+            <description>root location for output materialized graph</description>
+        </property>
+        <property>
+            <name>mergeAndGetStrategy</name>
+            <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="DecisionPromotePersonActionPayload"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <decision name="DecisionPromotePersonActionPayload">
+        <switch>
+            <case to="PromotePersonActionPayloadForPersonTable">
+                ${(activePromotePersonActionPayload eq "true") and
+                (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputActionPayloadRootPath')),'/'),'clazz=eu.dnetlib.dhp.schema.oaf.Person')) eq "true")}
+            </case>
+            <default to="SkipPromotePersonActionPayloadForPersonTable"/>
+        </switch>
+    </decision>
+
+    <action name="PromotePersonActionPayloadForPersonTable">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>PromotePersonActionPayloadForPersonTable</name>
+            <class>eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob</class>
+            <jar>dhp-actionmanager-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/person</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--inputActionPayloadPath</arg><arg>${inputActionPayloadRootPath}/clazz=eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/person</arg>
+            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="SkipPromotePersonActionPayloadForPersonTable">
+        <distcp xmlns="uri:oozie:distcp-action:0.2">
+            <prepare>
+                <delete path="${outputGraphRootPath}/person"/>
+            </prepare>
+            <arg>-pb</arg>
+            <arg>${inputGraphRootPath}/person</arg>
+            <arg>${outputGraphRootPath}/person</arg>
+        </distcp>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@ -34,7 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;

 /**
- * Creates action sets for Crossref affiliation relations inferred by BIP!
+ * Creates action sets for affiliation relations inferred by OpenAIRE (Crossref, PubMed, OpenAPC, DataCite, web crawl, publishers)
 */
 public class PrepareAffiliationRelations implements Serializable {

@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
 			.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);

 		JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
-			spark, crossrefInputPath, collectedfromOpenAIRE);
+			spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");

 		JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
-			spark, pubmedInputPath, collectedfromOpenAIRE);
+			spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");

 		JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
-			spark, openapcInputPath, collectedfromOpenAIRE);
+			spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");

-		JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
-			spark, dataciteInputPath, collectedfromOpenAIRE);
+		JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
+			spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");

-		JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
-			spark, webcrawlInputPath, collectedfromOpenAIRE);
+		JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
+			spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");

-		JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
-			spark, publisherlInputPath, collectedfromOpenAIRE);
+		JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
+			spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");

 		crossrefRelations
 			.union(pubmedRelations)
@ -133,7 +133,8 @@ public class PrepareAffiliationRelations implements Serializable {

 	private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
 		String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom,
+		String dataprovenance) {

 		Dataset<Row> df = spark
 			.read()
@ -142,12 +143,13 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");

-		return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+		return getTextTextJavaPairRDDNew(
+			collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);

 	}

 	private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom, String dataprovenance) {

 		Dataset<Row> df = spark
 			.read()
@ -155,13 +157,14 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");

-		return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+		return getTextTextJavaPairRDD(
+			collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);

 	}

 	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
 		String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom, String dataprovenance) {

 		// load and parse affiliation relations from HDFS
 		Dataset<Row> df = spark
@ -170,12 +173,12 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");

-		return getTextTextJavaPairRDD(collectedfrom, df);
+		return getTextTextJavaPairRDD(collectedfrom, df, dataprovenance);
 	}

 	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
 		String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom, String dataprovenance) {
 		// load and parse affiliation relations from HDFS
 		Dataset<Row> df = spark
 			.read()
@ -184,10 +187,11 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");

-		return getTextTextJavaPairRDDNew(collectedfrom, df);
+		return getTextTextJavaPairRDDNew(collectedfrom, df, dataprovenance);
 	}

-	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
+	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df,
+		String dataprovenance) {
 		// unroll nested arrays
 		df = df
 			.withColumn("matching", functions.explode(new Column("Matchings")))
@ -219,7 +223,7 @@ public class PrepareAffiliationRelations implements Serializable {
 				DataInfo dataInfo = OafMapperUtils
 					.dataInfo(
 						false,
-						BIP_INFERENCE_PROVENANCE,
+						dataprovenance,
 						true,
 						false,
 						qualifier,
@ -235,7 +239,8 @@ public class PrepareAffiliationRelations implements Serializable {
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))));
 	}

-	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
+	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df,
+		String dataprovenance) {
 		// unroll nested arrays
 		df = df
 			.withColumn("matching", functions.explode(new Column("Matchings")))
@ -276,7 +281,7 @@ public class PrepareAffiliationRelations implements Serializable {
 				DataInfo dataInfo = OafMapperUtils
 					.dataInfo(
 						false,
-						BIP_INFERENCE_PROVENANCE,
+						dataprovenance,
 						true,
 						false,
 						qualifier,
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@ -2,21 +2,31 @@
 package eu.dnetlib.dhp.actionmanager.personentity;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.*;

+import java.io.BufferedWriter;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;

 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.*;
 import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -28,13 +38,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.collection.orcid.model.Author;
 import eu.dnetlib.dhp.collection.orcid.model.Employment;
 import eu.dnetlib.dhp.collection.orcid.model.Work;
+import eu.dnetlib.dhp.common.DbClient;
 import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
+import eu.dnetlib.dhp.common.person.Coauthors;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Person;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
@ -44,7 +55,7 @@ import scala.Tuple2;

 public class ExtractPerson implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
-
+	private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'";
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final String OPENAIRE_PREFIX = "openaire____";
 	private static final String SEPARATOR = "::";
@ -58,9 +69,48 @@ public class ExtractPerson implements Serializable {

 	private static final String PMCID_PREFIX = "50|pmcid_______::";
 	private static final String ROR_PREFIX = "20|ror_________::";
-	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
+	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class)
+		+ IdentifierFactory.ID_PREFIX_SEPARATOR + ModelConstants.ORCID + "_______";
+	private static final String PROJECT_ID_PREFIX = ModelSupport.getIdPrefix(Project.class)
+		+ IdentifierFactory.ID_PREFIX_SEPARATOR;
+
 	public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
 	public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
+	public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase";
+	public static final String FUNDER_AUTHORS_CLASSNAME = "Imported from Funder Database";
+	public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
+	public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
+
+	public static List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
+		.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
+
+	public static final DataInfo ORCIDDATAINFO = OafMapperUtils
+		.dataInfo(
+			false,
+			null,
+			false,
+			false,
+			OafMapperUtils
+				.qualifier(
+					ORCID_AUTHORS_CLASSID,
+					ORCID_AUTHORS_CLASSNAME,
+					ModelConstants.DNET_PROVENANCE_ACTIONS,
+					ModelConstants.DNET_PROVENANCE_ACTIONS),
+			"0.91");
+
+	public static final DataInfo FUNDERDATAINFO = OafMapperUtils
+		.dataInfo(
+			false,
+			null,
+			false,
+			false,
+			OafMapperUtils
+				.qualifier(
+					FUNDER_AUTHORS_CLASSID,
+					FUNDER_AUTHORS_CLASSNAME,
+					ModelConstants.DNET_PROVENANCE_ACTIONS,
+					ModelConstants.DNET_PROVENANCE_ACTIONS),
+			"0.91");

 	public static void main(final String[] args) throws IOException, ParseException {

@ -91,19 +141,130 @@ public class ExtractPerson implements Serializable {
 		final String workingDir = parser.get("workingDir");
 		log.info("workingDir {}", workingDir);

+		final String dbUrl = parser.get("postgresUrl");
+		final String dbUser = parser.get("postgresUser");
+		final String dbPassword = parser.get("postgresPassword");
+
+		final String hdfsNameNode = parser.get("hdfsNameNode");
+
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
-				createActionSet(spark, inputPath, outputPath, workingDir);
+				extractInfoForActionSetFromORCID(spark, inputPath, workingDir);
+				extractInfoForActionSetFromProjects(
+					spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode);
+				createActionSet(spark, outputPath, workingDir);
 			});

 	}

-	private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
+	private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir,
+		String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException {

+		Configuration conf = new Configuration();
+		conf.set("fs.defaultFS", hdfsNameNode);
+
+		FileSystem fileSystem = FileSystem.get(conf);
+		Path hdfsWritePath = new Path(hdfsPath);
+		FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
+		try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
+			try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
+				dbClient.processResults(QUERY, rs -> writeRelation(getRelationWithProject(rs), writer));
+			}
+
+		} catch (IOException e) {
+			throw new RuntimeException(e);
+		}
+
+	}
+
+	public static Relation getRelationWithProject(ResultSet rs) {
+		try {
+			return getProjectRelation(
+				rs.getString("project"), rs.getString("pid"),
+				rs.getString("role"));
+		} catch (final SQLException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	private static Relation getProjectRelation(String project, String orcid, String role) {
+
+		String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
+		String target = PROJECT_ID_PREFIX + StringUtils.substringBefore(project, "::") + "::"
+			+ IdentifierFactory.md5(StringUtils.substringAfter(project, "::"));
+		List<KeyValue> properties = new ArrayList<>();
+
+		Relation relation = OafMapperUtils
+			.getRelation(
+				source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE,
+				ModelConstants.PROJECT_PERSON_PARTICIPATES,
+				collectedfromOpenAIRE,
+				FUNDERDATAINFO,
+				null);
+		relation.setValidated(true);
+
+		if (StringUtil.isNotBlank(role)) {
+			KeyValue kv = new KeyValue();
+			kv.setKey("role");
+			kv.setValue(role);
+			properties.add(kv);
+		}
+
+		if (!properties.isEmpty())
+			relation.setProperties(properties);
+		return relation;
+
+	}
+
+	protected static void writeRelation(final Relation relation, BufferedWriter writer) {
+		try {
+			writer.write(OBJECT_MAPPER.writeValueAsString(relation));
+			writer.newLine();
+		} catch (final IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	private static void createActionSet(SparkSession spark, String outputPath, String workingDir) {
+
+		Dataset<Person> people;
+		people = spark
+			.read()
+			.textFile(workingDir + "/people")
+			.map(
+				(MapFunction<String, Person>) value -> OBJECT_MAPPER
+					.readValue(value, Person.class),
+				Encoders.bean(Person.class));
+
+		people
+			.toJavaRDD()
+			.map(p -> new AtomicAction(p.getClass(), p))
+			.union(
+				getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
+			.union(
+				getRelations(spark, workingDir + "/coauthorship")
+					.toJavaRDD()
+					.map(r -> new AtomicAction(r.getClass(), r)))
+			.union(
+				getRelations(spark, workingDir + "/affiliation")
+					.toJavaRDD()
+					.map(r -> new AtomicAction(r.getClass(), r)))
+			.union(
+				getRelations(spark, workingDir + "/project")
+					.toJavaRDD()
+					.map(r -> new AtomicAction(r.getClass(), r)))
+			.mapToPair(
+				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
+			.saveAsHadoopFile(
+				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
+	}
+
+	private static void extractInfoForActionSetFromORCID(SparkSession spark, String inputPath, String workingDir) {
 		Dataset<Author> authors = spark
 			.read()
 			.parquet(inputPath + "Authors")
@ -129,18 +290,13 @@ public class ExtractPerson implements Serializable {
 			.parquet(inputPath + "Employments")
 			.as(Encoders.bean(Employment.class));

-		Dataset<Author> peopleToMap = authors
-			.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
-			.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
-			.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
-			.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));
-
 		Dataset<Employment> employment = employmentDataset
-			.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
+			.joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid")))
 			.map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));

-		Dataset<Person> people;
-		peopleToMap.map((MapFunction<Author, Person>) op -> {
+		// Mapping all the orcid profiles even if the profile has no visible works
+
+		authors.map((MapFunction<Author, Person>) op -> {
 			Person person = new Person();
 			person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
 			person
@ -190,9 +346,19 @@ public class ExtractPerson implements Serializable {
 					OafMapperUtils
 						.structuredProperty(
 							op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
-							ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
+							ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
+								OafMapperUtils.dataInfo(false,
+										null,
+										false,
+										false,
+										OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+												ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+												ModelConstants.DNET_PID_TYPES,
+												ModelConstants.DNET_PID_TYPES),
+								"0.91")));
 			person.setDateofcollection(op.getLastModifiedDate());
 			person.setOriginalId(Arrays.asList(op.getOrcid()));
+			person.setDataInfo(ORCIDDATAINFO);
 			return person;
 		}, Encoders.bean(Person.class))
 			.write()
@ -246,34 +412,6 @@ public class ExtractPerson implements Serializable {
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(workingDir + "/affiliation");
-
-		people = spark
-			.read()
-			.textFile(workingDir + "/people")
-			.map(
-				(MapFunction<String, Person>) value -> OBJECT_MAPPER
-					.readValue(value, Person.class),
-				Encoders.bean(Person.class));
-
-		people.show(false);
-		people
-			.toJavaRDD()
-			.map(p -> new AtomicAction(p.getClass(), p))
-			.union(
-				getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
-			.union(
-				getRelations(spark, workingDir + "/coauthorship")
-					.toJavaRDD()
-					.map(r -> new AtomicAction(r.getClass(), r)))
-			.union(
-				getRelations(spark, workingDir + "/affiliation")
-					.toJavaRDD()
-					.map(r -> new AtomicAction(r.getClass(), r)))
-			.mapToPair(
-				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
-					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
-			.saveAsHadoopFile(
-				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 	}

 	private static Dataset<Relation> getRelations(SparkSession spark, String path) {
@ -307,15 +445,9 @@ public class ExtractPerson implements Serializable {
 				source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
 				ModelConstants.ORG_PERSON_PARTICIPATES,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-				OafMapperUtils
-					.dataInfo(
-						false, null, false, false,
-						OafMapperUtils
-							.qualifier(
-								ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
-								ModelConstants.DNET_PROVENANCE_ACTIONS),
-						"0.91"),
+				ORCIDDATAINFO,
 				null);
+		relation.setValidated(true);

 		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
 			KeyValue kv = new KeyValue();
@ -336,45 +468,6 @@ public class ExtractPerson implements Serializable {

 	}

-	private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
-		String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
-		String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
-
-		return Arrays
-			.asList(
-				OafMapperUtils
-					.getRelation(
-						source, target, ModelConstants.PERSON_PERSON_RELTYPE,
-						ModelConstants.PERSON_PERSON_SUBRELTYPE,
-						ModelConstants.PERSON_PERSON_HASCOAUTHORED,
-						Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-						OafMapperUtils
-							.dataInfo(
-								false, null, false, false,
-								OafMapperUtils
-									.qualifier(
-										ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
-										ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
-								"0.91"),
-						null),
-				OafMapperUtils
-					.getRelation(
-						target, source, ModelConstants.PERSON_PERSON_RELTYPE,
-						ModelConstants.PERSON_PERSON_SUBRELTYPE,
-						ModelConstants.PERSON_PERSON_HASCOAUTHORED,
-						Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-						OafMapperUtils
-							.dataInfo(
-								false, null, false, false,
-								OafMapperUtils
-									.qualifier(
-										ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
-										ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
-								"0.91"),
-						null));
-
-	}
-
 	private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {

 		if (Optional.ofNullable(w.getPids()).isPresent())
@ -417,21 +510,15 @@ public class ExtractPerson implements Serializable {
 			default:
 				return null;
 		}
-
-		return OafMapperUtils
+		Relation relation = OafMapperUtils
 			.getRelation(
 				source, target, ModelConstants.RESULT_PERSON_RELTYPE,
 				ModelConstants.RESULT_PERSON_SUBRELTYPE,
 				ModelConstants.RESULT_PERSON_HASAUTHORED,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-				OafMapperUtils
-					.dataInfo(
-						false, null, false, false,
-						OafMapperUtils
-							.qualifier(
-								ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
-								ModelConstants.DNET_PROVENANCE_ACTIONS),
-						"0.91"),
+				ORCIDDATAINFO,
 				null);
+		relation.setValidated(true);
+		return relation;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
@ -31,9 +31,11 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
 # The following is needed as a property of a workflow
 oozie.wf.application.path=${oozieTopWfApplicationPath}

-crossrefInputPath=/data/bip-affiliations/crossref-data.json
-pubmedInputPath=/data/bip-affiliations/pubmed-data.json
-openapcInputPath=/data/bip-affiliations/openapc-data.json
-dataciteInputPath=/data/bip-affiliations/datacite-data.json
+crossrefInputPath=/data/openaire-affiliations/crossref-data.json
+pubmedInputPath=/data/openaire-affiliations/pubmed-data-v4.json
+openapcInputPath=/data/openaire-affiliations/openapc-data.json
+dataciteInputPath=/data/openaire-affiliations/datacite-data.json
+webCrawlInputPath=/data/openaire-affiliations/webCrawl
+publisherInputPath=/data/openaire-affiliations/publishers

-outputPath=/tmp/crossref-affiliations-output-v5
+outputPath=/tmp/affRoAS
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="BipAffiliations" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="OpenAIREAffiliations" xmlns="uri:oozie:workflow:0.5">
    <parameters>

        <property>
@ -21,6 +21,10 @@
            <name>webCrawlInputPath</name>
            <description>the path where to find the inferred affiliation relations from webCrawl</description>
        </property>
+        <property>
+            <name>publisherInputPath</name>
+            <description>the path where to find the inferred affiliation relations from publisher websites</description>
+        </property>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
@ -99,7 +103,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>Produces the atomic action with the inferred by BIP! affiliation relations (from Crossref and Pubmed)</name>
+            <name>Produces the atomic action with the inferred by OpenAIRE affiliation relations</name>
            <class>eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
@ -117,6 +121,7 @@
            <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
            <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
            <arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
+            <arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
@ -21,5 +21,30 @@
  "paramLongName": "workingDir",
  "paramDescription": "the hdfs name node",
  "paramRequired": false
+},
+  {
+    "paramName": "pu",
+    "paramLongName": "postgresUrl",
+    "paramDescription": "the JDBC url of the postgres database",
+    "paramRequired": false
+  },
+
+  {
+    "paramName": "ps",
+    "paramLongName": "postgresUser",
+    "paramDescription": "the user to access the postgres database",
+    "paramRequired": false
+  },
+  {
+  "paramName": "pp",
+  "paramLongName": "postgresPassword",
+  "paramDescription": "the password to access the postgres database",
+  "paramRequired": false
+},{
+  "paramName": "nn",
+  "paramLongName": "hdfsNameNode",
+  "paramDescription": "the hdfs name node",
+  "paramRequired": false
 }
+
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
@ -1,2 +1,5 @@
 inputPath=/data/orcid_2023/tables/
-outputPath=/user/miriam.baglioni/peopleAS
+outputPath=/user/miriam.baglioni/peopleAS
+postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
+postgresUser=dnet
+postgresPassword=dnetPwd
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
@ -9,6 +9,18 @@
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
+        <property>
+            <name>postgresUrl</name>
+            <description>the JDBC url of the postgres database</description>
+        </property>
+        <property>
+            <name>postgresUser</name>
+            <description>the user to access the postgres database</description>
+        </property>
+        <property>
+            <name>postgresPassword</name>
+            <description>the password to access the postgres database</description>
+        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -102,6 +114,10 @@
            <arg>--inputPath</arg><arg>${inputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
+            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+            <arg>--postgresUrl</arg><arg>${postgresUrl}</arg>
+            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
+            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml
@ -24,7 +24,7 @@

    <decision name="resume_from">
        <switch>
-            <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
+            <case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
            <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
        </switch>
    </decision>
@ -33,6 +33,14 @@
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

+    <action name="reset_workingDir"> <!-- entered when resumeFrom is 'DownloadDump': empties the working directory before the download -->
+        <fs>
+            <delete path="${workingDir}"/> <!-- drop whatever is currently stored under workingDir -->
+            <mkdir path="${workingDir}"/> <!-- recreate it empty -->
+        </fs>
+        <ok to="download"/>
+        <error to="Kill"/>
+    </action>
    <action name="download">
        <shell xmlns="uri:oozie:shell-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{
  PidType
 }
 import eu.dnetlib.dhp.utils.DHPUtils
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.Row
 import org.json4s
 import org.json4s.DefaultFormats
@ -673,11 +673,12 @@ case object Crossref2Oaf {
    val doi = input.getString(0)
    val rorId = input.getString(1)

-    val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
+
+    val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true)
    val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)

    val r: Relation = new Relation
-    DoiCleaningRule.clean(doi)
+
    r.setSource(pubId)
    r.setTarget(affId)
    r.setRelType(ModelConstants.RESULT_ORGANIZATION)
@ -978,7 +979,26 @@ case object Crossref2Oaf {
            case "10.13039/501100010790" =>
              generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
            case _ => logger.debug("no match for " + funder.DOI.get)
-
+            //Add for Danish funders
+            //Independent Research Fund Denmark (IRFD)
+            case "10.13039/501100004836" =>
+              generateSimpleRelationFromAward(funder, "irfd________", a => a)
+              val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //Carlsberg Foundation (CF)
+            case "10.13039/501100002808" =>
+              generateSimpleRelationFromAward(funder, "cf__________", a => a)
+              val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //Novo Nordisk Foundation (NNF)
+            case "10.13039/501100009708" =>
+              generateSimpleRelationFromAward(funder, "nnf___________", a => a)
+              val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            case _ => logger.debug("no match for " + funder.DOI.get)
          }

        } else {
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
@ -98,9 +98,9 @@ public class PrepareAffiliationRelationsTest {
 					"-crossrefInputPath", crossrefAffiliationRelationPathNew,
 					"-pubmedInputPath", crossrefAffiliationRelationPath,
 					"-openapcInputPath", crossrefAffiliationRelationPathNew,
-					"-dataciteInputPath", crossrefAffiliationRelationPath,
-					"-webCrawlInputPath", crossrefAffiliationRelationPath,
-					"-publisherInputPath", publisherAffiliationRelationOldPath,
+					"-dataciteInputPath", crossrefAffiliationRelationPathNew,
+					"-webCrawlInputPath", crossrefAffiliationRelationPathNew,
+					"-publisherInputPath", publisherAffiliationRelationPath,
 					"-outputPath", outputPath
 				});

@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest {
 			.map(aa -> ((Relation) aa.getPayload()));

 		// count the number of relations
-		assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 =
+		assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 =

 		Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
 		dataset.createOrReplaceTempView("result");
@ -123,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
 		// verify that we have equal number of bi-directional relations
 		Assertions
 			.assertEquals(
-				75, execVerification
+				81, execVerification
 					.filter(
 						"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
 					.collectAsList()
@ -131,7 +131,7 @@ public class PrepareAffiliationRelationsTest {

 		Assertions
 			.assertEquals(
-				75, execVerification
+				81, execVerification
 					.filter(
 						"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
 					.collectAsList()
@ -158,7 +158,7 @@ public class PrepareAffiliationRelationsTest {

 		Assertions
 			.assertEquals(
-				2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
+				4, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());

 		Assertions
 			.assertEquals(
@ -173,7 +173,7 @@ public class PrepareAffiliationRelationsTest {

 		Assertions
 			.assertEquals(
-				3, execVerification
+				1, execVerification
 					.filter(
 						"source = '" + ID_PREFIX
 							+ IdentifierFactory
--- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
@ -63,6 +63,7 @@
        <path start="copy_software"/>
        <path start="copy_datasource"/>
        <path start="copy_project"/>
+        <path start="copy_person"/>
        <path start="copy_organization"/>
    </fork>

@ -120,6 +121,15 @@
        <error to="Kill"/>
    </action>

+    <action name="copy_person"> <!-- copies the person folder from sourcePath to outputPath via distcp -->
+        <distcp xmlns="uri:oozie:distcp-action:0.2">
+            <arg>${nameNode}/${sourcePath}/person</arg>
+            <arg>${nameNode}/${outputPath}/person</arg>
+        </distcp>
+        <ok to="wait"/>
+        <error to="Kill"/>
+    </action>
+
    <action name="copy_datasource">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${sourcePath}/datasource</arg>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -2,14 +2,13 @@
 package eu.dnetlib.dhp.oa.dedup;

 import java.util.*;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.commons.beanutils.BeanUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.FlatMapGroupsFunction;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;

 import eu.dnetlib.dhp.oa.dedup.model.Identifier;
@ -107,6 +106,8 @@ public class DedupRecordFactory {

 					final HashSet<String> acceptanceDate = new HashSet<>();

+					boolean isVisible = false;
+
 					while (it.hasNext()) {
 						Tuple3<String, String, OafEntity> t = it.next();
 						OafEntity entity = t._3();
@ -114,6 +115,7 @@ public class DedupRecordFactory {
 						if (entity == null) {
 							aliases.add(t._2());
 						} else {
+							isVisible = isVisible || !entity.getDataInfo().getInvisible();
 							cliques.add(entity);

 							if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {
@ -129,13 +131,20 @@ public class DedupRecordFactory {

 					}

-					if (acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
+					if (!isVisible || acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
 						return Collections.emptyIterator();
 					}

-					OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
+					OafEntity mergedEntity = MergeUtils.mergeGroup(cliques.iterator());
 					// dedup records do not have date of transformation attribute
 					mergedEntity.setDateoftransformation(null);
+					mergedEntity
+						.setMergedIds(
+							Stream
+								.concat(cliques.stream().map(OafEntity::getId), aliases.stream())
+								.distinct()
+								.sorted()
+								.collect(Collectors.toList()));

 					return Stream
 						.concat(
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
@ -91,7 +91,6 @@ public class SparkBlockStats extends AbstractSparkAction {
 				.read()
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 				.transform(deduper.model().parseJsonDataset())
-				.transform(deduper.filterAndCleanup())
 				.transform(deduper.generateClustersWithCollect())
 				.filter(functions.size(new Column("block")).geq(1));

--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java
@ -5,11 +5,11 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTION
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP;

 import java.io.IOException;
+import java.util.Arrays;

 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.*;
 import org.dom4j.DocumentException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -17,6 +17,7 @@ import org.xml.sax.SAXException;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
@ -25,6 +26,8 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
+import scala.collection.JavaConversions;
+import scala.collection.JavaConverters;

 public class SparkCreateDedupRecord extends AbstractSparkAction {

@ -85,6 +88,36 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
 				.mode(SaveMode.Overwrite)
 				.option("compression", "gzip")
 				.json(outputPath);
+
+			log.info("Updating mergerels for: '{}'", subEntity);
+			final Dataset<Row> dedupIds = spark
+				.read()
+				.schema("`id` STRING, `mergedIds` ARRAY<STRING>")
+				.json(outputPath)
+				.selectExpr("id as source", "explode(mergedIds) as target");
+			spark
+				.read()
+				.load(mergeRelPath)
+				.where("relClass == 'merges'")
+				.join(dedupIds, JavaConversions.asScalaBuffer(Arrays.asList("source", "target")), "left_semi")
+				.write()
+				.mode(SaveMode.Overwrite)
+				.option("compression", "gzip")
+				.save(workingPath + "/mergerel_filtered");
+
+			final Dataset<Row> validRels = spark.read().load(workingPath + "/mergerel_filtered");
+
+			final Dataset<Row> filteredMergeRels = validRels
+				.union(
+					validRels
+						.withColumnRenamed("source", "source_tmp")
+						.withColumnRenamed("target", "target_tmp")
+						.withColumn("relClass", functions.lit(ModelConstants.IS_MERGED_IN))
+						.withColumnRenamed("target_tmp", "source")
+						.withColumnRenamed("source_tmp", "target"));
+
+			saveParquet(filteredMergeRels, mergeRelPath, SaveMode.Overwrite);
+			removeOutputDir(spark, workingPath + "/mergerel_filtered");
 		}
 	}

--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -69,6 +69,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {

 		Dataset<Relation> mergeRels = spark
 			.read()
+			.schema(REL_BEAN_ENC.schema())
 			.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
 			.as(REL_BEAN_ENC);

--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
@ -46,8 +46,8 @@ class DatasetMergerTest implements Serializable {
 	}

 	@Test
-	void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
-		Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+	void datasetMergerTest() {
+		Dataset pub_merged = MergeUtils.mergeGroup(datasets.stream().map(Tuple2::_2).iterator());

 		// verify id
 		assertEquals(dedupId, pub_merged.getId());
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
@ -96,7 +96,7 @@
        "aggregation": "MAX",
        "positive": "layer4",
        "negative": "NO_MATCH",
-        "undefined": "MATCH",
+        "undefined": "layer4",
        "ignoreUndefined": "true"
      },
      "layer4": {
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -7,7 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactor
 import eu.dnetlib.dhp.utils.DHPUtils
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil._
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST._
@ -560,9 +560,32 @@ case object Crossref2Oaf {
                "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
                "10.13039/501100013589" | "10.13039/501100000271" =>
              generateSimpleRelationFromAward(funder, "ukri________", a => a)
-
+            //DFG
+            case "10.13039/501100001659" =>
+              val targetId = getProjectId("dfgf________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //Add for Danish funders
+            //Independent Research Fund Denmark (IRFD)
+            case "10.13039/501100004836" =>
+              generateSimpleRelationFromAward(funder, "irfd________", a => a)
+              val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //Carlsberg Foundation (CF)
+            case "10.13039/501100002808" =>
+              generateSimpleRelationFromAward(funder, "cf__________", a => a)
+              val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //Novo Nordisk Foundation (NNF)
+            case "10.13039/501100009708" =>
+              generateSimpleRelationFromAward(funder, "nnf_________", a => a)
+              val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //the default case must stay last: any case placed after it can never be matched
             case _ => logger.debug("no match for " + funder.DOI.get)
-
          }

        } else {
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST._
--- a/dhp-workflows/dhp-enrichment/pom.xml
+++ b/dhp-workflows/dhp-enrichment/pom.xml
@ -48,12 +48,7 @@
            <groupId>io.github.classgraph</groupId>
            <artifactId>classgraph</artifactId>
        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-aggregation</artifactId>
-            <version>1.2.5-SNAPSHOT</version>
-            <scope>compile</scope>
-        </dependency>
+


    </dependencies>
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
@ -6,11 +6,11 @@ import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;

+import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.amazonaws.util.StringUtils;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Maps;

@ -81,7 +81,7 @@ public class Utils implements Serializable {
 		Community c = new Community();
 		c.setId(cm.getId());
 		c.setZenodoCommunities(cm.getOtherZenodoCommunities());
-		if (!StringUtils.isNullOrEmpty(cm.getZenodoCommunity()))
+		if (StringUtils.isNotBlank(cm.getZenodoCommunity()))
 			c.getZenodoCommunities().add(cm.getZenodoCommunity());
 		c.setSubjects(cm.getSubjects());
 		c.getSubjects().addAll(cm.getFos());
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java
@ -13,13 +13,13 @@ public class CommunityContentprovider {
 	private String openaireId;
 	private SelectionConstraints selectioncriteria;

-	private String enabled;
+	private Boolean enabled;

-	public String getEnabled() {
+	public Boolean getEnabled() {
 		return enabled;
 	}

-	public void setEnabled(String enabled) {
+	public void setEnabled(Boolean enabled) {
 		this.enabled = enabled;
 	}

--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
@ -4,7 +4,7 @@ package eu.dnetlib.dhp.bulktag.community;
 import java.io.Serializable;
 import java.lang.reflect.InvocationTargetException;

-import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonIgnore;

 import eu.dnetlib.dhp.bulktag.criteria.Selection;
 import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java
@ -0,0 +1,302 @@
+
+package eu.dnetlib.dhp.person;
+
+import static com.ibm.icu.text.PluralRules.Operand.w;
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.MapGroupsFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
+import eu.dnetlib.dhp.common.person.Coauthors;
+import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import scala.Tuple2;
+
+public class SparkExtractPersonRelations {
+
+	// logger bound to this class, so that log lines are attributed to the person extraction job
+	private static final Logger log = LoggerFactory.getLogger(SparkExtractPersonRelations.class);
+
+	// first part of the person identifiers built by getRelation: person id prefix followed by "|orcid_______"
+	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
+
+	// data info shared by every authorship relation created by getRelation
+	// NOTE(review): the positional arguments presumably are deletedbyinference, inferenceprovenance, inferred,
+	// invisible, provenanceaction and trust - confirm against OafMapperUtils.dataInfo
+	public static final DataInfo DATAINFO = OafMapperUtils
+		.dataInfo(
+			false,
+			"openaire",
+			true,
+			false,
+			OafMapperUtils
+				.qualifier(
+					ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
+					ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
+					ModelConstants.DNET_PROVENANCE_ACTIONS,
+					ModelConstants.DNET_PROVENANCE_ACTIONS),
+			"0.85");
+
+	/**
+	 * Job entry point: parses the arguments described by input_personpropagation_parameters.json and, within a
+	 * Spark session, first extracts the new person relations and then removes the persons without relations.
+	 *
+	 * @param args the command line arguments; sourcePath and outputPath are read from them
+	 * @throws Exception if the parameters cannot be parsed or the Spark job fails
+	 */
+	public static void main(String[] args) throws Exception {
+
+		// the parameter file is resolved through this class instead of an unrelated job class
+		String jsonConfiguration = IOUtils
+			.toString(
+				SparkExtractPersonRelations.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+		parser.parseArgument(args);
+
+		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		String sourcePath = parser.get("sourcePath");
+		log.info("sourcePath: {}", sourcePath);
+
+		// the "outputPath" argument is used as working area for the intermediate results
+		final String workingPath = parser.get("outputPath");
+		log.info("workingPath: {}", workingPath);
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+
+				// the relations must be extracted first: removeIsolatedPerson reads the relations again
+				extractRelations(
+					spark,
+					sourcePath,
+					workingPath);
+				removeIsolatedPerson(spark, sourcePath, workingPath);
+			});
+	}
+
+	private static void removeIsolatedPerson(SparkSession spark, String sourcePath, String workingPath) {
+		Dataset<Person> personDataset = spark
+			.read()
+			.schema(Encoders.bean(Person.class).schema())
+			.json(sourcePath + "person")
+			.as(Encoders.bean(Person.class));
+
+		Dataset<Relation> relationDataset = spark
+			.read()
+			.schema(Encoders.bean(Relation.class).schema())
+			.json(sourcePath + "relation")
+			.as(Encoders.bean(Relation.class));
+
+		personDataset
+			.join(relationDataset, personDataset.col("id").equalTo(relationDataset.col("source")), "left_semi")
+			.write()
+			.option("compression", "gzip")
+			.mode(SaveMode.Overwrite)
+			.json(workingPath + "person");
+
+		spark
+			.read()
+			.schema(Encoders.bean(Person.class).schema())
+			.json(workingPath + "person")
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(sourcePath + "person");
+	}
+
+	/**
+	 * For every result type, builds the authorship relations (person with orcid_pending to result) and the
+	 * co-authorship relations (pairs of authors with orcid/orcid_pending) and stores under workingPath those
+	 * that are not already among the relations found under sourcePath. At the end everything stored under
+	 * workingPath is appended to the relation folder under sourcePath.
+	 *
+	 * @param spark the active Spark session
+	 * @param sourcePath the path the entity names and "relation" are appended to
+	 * @param workingPath the path where the new relations are accumulated
+	 */
+	private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) {
+
+		// relations already available, keyed by source + relClass + target
+		Dataset<Tuple2<String, Relation>> relationDataset = keyBySourceRelClassTarget(
+			spark
+				.read()
+				.schema(Encoders.bean(Relation.class).schema())
+				.json(sourcePath + "relation")
+				.as(Encoders.bean(Relation.class)));
+
+		ModelSupport.entityTypes
+			.keySet()
+			.stream()
+			.filter(ModelSupport::isResult)
+			.forEach(
+				e -> {
+					// 1. search for results having orcid_pending and orcid in the set of pids for the authors
+					Dataset<Result> resultWithOrcids = spark
+						.read()
+						.schema(Encoders.bean(Result.class).schema())
+						.json(sourcePath + e.name())
+						.as(Encoders.bean(Result.class))
+						.filter(
+							(FilterFunction<Result>) r -> !r.getDataInfo().getDeletedbyinference() &&
+								!r.getDataInfo().getInvisible() &&
+								Optional
+									.ofNullable(r.getAuthor())
+									.isPresent())
+						.filter(
+							(FilterFunction<Result>) r -> r
+								.getAuthor()
+								.stream()
+								.anyMatch(
+									a -> Optional
+										.ofNullable(
+											a
+												.getPid())
+										.isPresent() &&
+										a
+											.getPid()
+											.stream()
+											.anyMatch(
+												p -> Arrays
+													.asList("orcid", "orcid_pending")
+													.contains(p.getQualifier().getClassid().toLowerCase()))));
+					// 2. create authorship relations between the result identifier and the person entity with
+					// orcid_pending.
+					Dataset<Tuple2<String, Relation>> authorships = keyBySourceRelClassTarget(
+						resultWithOrcids
+							.flatMap(
+								(FlatMapFunction<Result, Relation>) r -> getAuthorshipRelations(r),
+								Encoders.bean(Relation.class)));
+					saveNewRelations(authorships, relationDataset, workingPath);
+
+					// 2.1 store in a separate location the relation between the person and the pids for the result?
+
+					// 3. create co_authorship relations between the pairs of authors with orcid/orcid_pending pids
+					Dataset<Tuple2<String, Relation>> coAuthorships = keyBySourceRelClassTarget(
+						resultWithOrcids
+							.map(
+								(MapFunction<Result, Coauthors>) r -> getAuthorsPidList(r),
+								Encoders.bean(Coauthors.class))
+							.flatMap(
+								(FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
+								Encoders.bean(Relation.class))
+							// one relation for each (source, target) pair
+							.groupByKey(
+								(MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
+							.mapGroups(
+								(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(),
+								Encoders.bean(Relation.class)));
+					saveNewRelations(coAuthorships, relationDataset, workingPath);
+
+				});
+		// add the relations accumulated under workingPath to the ones under sourcePath
+		spark
+			.read()
+			.schema(Encoders.bean(Relation.class).schema())
+			.json(workingPath)
+			.write()
+			.mode(SaveMode.Append)
+			.option("compression", "gzip")
+			.json(sourcePath + "relation");
+
+	}
+
+	// pairs every relation with the key source + relClass + target
+	private static Dataset<Tuple2<String, Relation>> keyBySourceRelClassTarget(Dataset<Relation> relations) {
+		return relations
+			.map(
+				(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
+					r.getSource() + r.getRelClass() + r.getTarget(), r),
+				Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
+	}
+
+	// appends to workingPath the candidate relations whose key is not found among the existing relations
+	private static void saveNewRelations(Dataset<Tuple2<String, Relation>> candidates,
+		Dataset<Tuple2<String, Relation>> existing, String workingPath) {
+		candidates
+			.joinWith(existing, candidates.col("_1").equalTo(existing.col("_1")), "left")
+			// unmatched candidates are selected before the map, so that no null record reaches the bean encoder
+			.filter(
+				(FilterFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>>) t2 -> t2._2() == null)
+			.map(
+				(MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>, Relation>) t2 -> t2
+					._1()
+					._2(),
+				Encoders.bean(Relation.class))
+			.write()
+			.mode(SaveMode.Append)
+			.option("compression", "gzip")
+			.json(workingPath);
+	}
+
+	/**
+	 * Collects one ORCID value per author of the result: the value of the first pid of type orcid or, when the
+	 * author has none, of the first pid of type orcid_pending. Pid types are compared ignoring the case.
+	 * Authors without pids, without such pids, or with a null pid value are skipped.
+	 *
+	 * @param r the result whose authors are inspected
+	 * @return the Coauthors holding the collected values
+	 */
+	private static Coauthors getAuthorsPidList(Result r) {
+		List<String> orcids = r
+			.getAuthor()
+			.stream()
+			// an author may come without pids at all
+			.filter(a -> a.getPid() != null)
+			.map(a -> {
+				// orcid is preferred over orcid_pending
+				Optional<StructuredProperty> pid = findPid(a, "orcid");
+				if (!pid.isPresent())
+					pid = findPid(a, "orcid_pending");
+				return pid.map(StructuredProperty::getValue).orElse(null);
+			})
+			.filter(Objects::nonNull)
+			.collect(Collectors.toList());
+
+		Coauthors coauth = new Coauthors();
+		coauth.setCoauthors(orcids);
+		return coauth;
+
+	}
+
+	// first pid of the author whose qualifier classid matches the given type, ignoring the case
+	private static Optional<StructuredProperty> findPid(Author a, String pidType) {
+		return a
+			.getPid()
+			.stream()
+			.filter(p -> pidType.equalsIgnoreCase(p.getQualifier().getClassid()))
+			.findFirst();
+	}
+
+	/**
+	 * Creates one authorship relation for each pid of type orcid_pending (case insensitive) found among the
+	 * authors of the result. Authors without pids are skipped.
+	 *
+	 * @param r the result whose authors are inspected
+	 * @return an iterator over the created relations, empty when no author has an orcid_pending pid
+	 */
+	private static Iterator<Relation> getAuthorshipRelations(Result r) {
+		List<Relation> relationList = new ArrayList<>();
+		for (Author a : r.getAuthor()) {
+			// an author may come without pids at all
+			if (a.getPid() == null) {
+				continue;
+			}
+			a
+				.getPid()
+				.stream()
+				.filter(p -> "orcid_pending".equalsIgnoreCase(p.getQualifier().getClassid()))
+				.map(p -> getRelation(p.getValue(), r.getId()))
+				.forEach(relationList::add);
+		}
+
+		return relationList.iterator();
+	}
+
+	private static Relation getRelation(String orcid, String resultId) {
+
+		String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
+
+		Relation relation = OafMapperUtils
+			.getRelation(
+				source, resultId, ModelConstants.RESULT_PERSON_RELTYPE,
+				ModelConstants.RESULT_PERSON_SUBRELTYPE,
+				ModelConstants.RESULT_PERSON_HASAUTHORED,
+				null, // collectedfrom = null
+				DATAINFO,
+				null);
+
+		return relation;
+	}
+
+}
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
@ -7,4 +7,5 @@ community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunit
 result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app
 community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app
 community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
-country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
+country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
+person_propagation classpath eu/dnetlib/dhp/wf/subworkflows/person/oozie_app
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
@ -122,6 +122,7 @@
            <case to="community_project">${wf:conf('resumeFrom') eq 'CommunityProject'}</case>
            <case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
            <case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
+            <case to="person_propagation">${wf:conf('resumeFrom') eq 'PersonPropagation'}</case>
            <default to="orcid_propagation"/>
        </switch>
    </decision>
@ -291,10 +292,24 @@
                </property>
            </configuration>
        </sub-workflow>
+        <ok to="person_propagation" />
+        <error to="Kill" />
+    </action>
+    <action name="person_propagation">
+        <sub-workflow>
+            <app-path>${wf:appPath()}/person_propagation
+            </app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>sourcePath</name>
+                    <value>${outputPath}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
        <ok to="country_propagation" />
        <error to="Kill" />
    </action>
-
    <action name="country_propagation">
        <sub-workflow>
            <app-path>${wf:appPath()}/country_propagation
@ -319,6 +334,8 @@
        <error to="Kill" />
    </action>

+
+
    <end name="End"/>

 </workflow-app>
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
@ -34,6 +34,7 @@
        <path start="copy_organization"/>
        <path start="copy_projects"/>
        <path start="copy_datasources"/>
+        <path start="copy_persons"/>
    </fork>

    <action name="copy_relation">
@ -80,6 +81,17 @@
        <error to="Kill"/>
    </action>

+    <action name="copy_persons">
+        <distcp xmlns="uri:oozie:distcp-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <arg>${nameNode}/${sourcePath}/person</arg>
+            <arg>${nameNode}/${outputPath}/person</arg>
+        </distcp>
+        <ok to="copy_wait"/>
+        <error to="Kill"/>
+    </action>
+
    <join name="copy_wait" to="fork_prepare_assoc_step1"/>

    <fork name="fork_prepare_assoc_step1">
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json
@ -0,0 +1,21 @@
+[
+  {
+    "paramName":"s",
+    "paramLongName":"sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "out",
+    "paramLongName": "outputPath",
+    "paramDescription": "the path used to store temporary output files",
+    "paramRequired": true
+  },
+
+  {
+    "paramName": "ssm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "true if the spark session is managed, false otherwise",
+    "paramRequired": false
+  }
+]
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties
@ -0,0 +1 @@
+sourcePath=/tmp/miriam/13_graph_copy
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml
@ -0,0 +1,58 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+    </property>
+    <property>
+        <name>sparkExecutorNumber</name>
+        <value>4</value>
+    </property>
+    <property>
+        <name>sparkDriverMemory</name>
+        <value>15G</value>
+    </property>
+    <property>
+        <name>sparkExecutorMemory</name>
+        <value>5G</value>
+    </property>
+    <property>
+        <name>sparkExecutorCores</name>
+        <value>4</value>
+    </property>
+    <property>
+        <name>spark2MaxExecutors</name>
+        <value>50</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml
@ -0,0 +1,68 @@
+<workflow-app name="person_propagation" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="reset_outputpath"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="reset_outputpath">
+        <fs>
+            <delete path="${workingDir}"/>
+            <mkdir path="${workingDir}"/>
+        </fs>
+        <ok to="extract_person_relation_from_graph"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="extract_person_relation_from_graph">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>personPropagation</name>
+            <class>eu.dnetlib.dhp.person.SparkExtractPersonRelations</class>
+            <jar>dhp-enrichment-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.speculation=false
+                --conf spark.hadoop.mapreduce.map.speculative=false
+                --conf spark.hadoop.mapreduce.reduce.speculative=false
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}/</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+
+
+    <end name="End"/>
+
+</workflow-app>
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
@ -0,0 +1,93 @@
+
+package eu.dnetlib.dhp.person;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
+import eu.dnetlib.dhp.schema.oaf.*;
+import scala.Tuple2;
+
+/** Runs {@link SparkExtractPersonRelations} on a local Spark session against the test graph resources. */
+public class PersonPropagationJobTest {
+
+	private static final Logger log = LoggerFactory.getLogger(PersonPropagationJobTest.class);
+
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private static SparkSession spark;
+
+	private static Path workingDir;
+
+	/** Creates the temporary working directory and the local Spark session shared by the tests. */
+	@BeforeAll
+	public static void beforeAll() throws IOException {
+		workingDir = Files.createTempDirectory(PersonPropagationJobTest.class.getSimpleName());
+		log.info("using work dir {}", workingDir);
+		SparkConf conf = new SparkConf();
+		conf.setAppName(PersonPropagationJobTest.class.getSimpleName());
+		conf.setMaster("local[*]");
+		conf.set("spark.driver.host", "localhost");
+		conf.set("hive.metastore.local", "true");
+		conf.set("spark.ui.enabled", "false");
+		conf.set("spark.sql.warehouse.dir", workingDir.toString());
+		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(PersonPropagationJobTest.class.getSimpleName())
+			.config(conf)
+			.getOrCreate();
+	}
+
+	/** Removes the temporary working directory and stops the Spark session. */
+	@AfterAll
+	public static void afterAll() throws IOException {
+		FileUtils.deleteDirectory(workingDir.toFile());
+		spark.stop();
+	}
+
+	/** Runs the job on the graph under {@code /eu/dnetlib/dhp/person/graph}; assertions are still a TODO. */
+	@Test
+	void testPersonPropagation() throws Exception {
+		final String sourcePath = getClass()
+			.getResource("/eu/dnetlib/dhp/person/graph")
+			.getPath();
+
+		SparkExtractPersonRelations
+			.main(
+				new String[] {
+					"--isSparkSessionManaged", Boolean.FALSE.toString(),
+					"--sourcePath", sourcePath,
+					"--outputPath", workingDir.toString()
+				});
+
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+		JavaRDD<Relation> tmp = sc
+			.textFile(workingDir.toString() + "/relation")
+			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+		// TODO write assertions and find relevant information for the resource files
+	}
+}
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob {
 		final Encoder<T> clazzEncoder = Encoders.bean(clazz);

 		Dataset<Row> dataset = spark
-				.read()
-				.schema(clazzEncoder.schema())
-				.json(inputPath);
+			.read()
+			.schema(clazzEncoder.schema())
+			.json(inputPath);

 		if (numPartitions > 0) {
 			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@ -153,34 +153,40 @@ public abstract class AbstractMdRecordToOafMapper {
 			final DataInfo entityInfo = prepareDataInfo(doc, this.invisible);
 			final long lastUpdateTimestamp = new Date().getTime();

-			final List<Instance> instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);
+			final Instance instance = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);

-			final String type = getResultType(doc, instances);
+			if (!Optional
+				.ofNullable(instance.getInstancetype())
+				.map(Qualifier::getClassid)
+				.filter(StringUtils::isNotBlank)
+				.isPresent()) {
+				return Lists.newArrayList();
+			}

-			return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp);
+			final String type = getResultType(instance);
+
+			return createOafs(doc, type, instance, collectedFrom, entityInfo, lastUpdateTimestamp);
 		} catch (final DocumentException e) {
 			log.error("Error with record:\n" + xml);
 			return Lists.newArrayList();
 		}
 	}

-	protected String getResultType(final Document doc, final List<Instance> instances) {
-		final String type = doc.valueOf("//dr:CobjCategory/@type");
-
-		if (StringUtils.isBlank(type) && this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
-			final String instanceType = instances
-				.stream()
-				.map(i -> i.getInstancetype().getClassid())
-				.findFirst()
-				.filter(s -> !UNKNOWN.equalsIgnoreCase(s))
-				.orElse("0000"); // Unknown
+	protected String getResultType(final Instance instance) {
+		if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 			return Optional
-				.ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
+				.ofNullable(instance.getInstancetype())
 				.map(Qualifier::getClassid)
+				.map(
+					instanceType -> Optional
+						.ofNullable(
+							this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
+						.map(Qualifier::getClassid)
+						.orElse("0000"))
 				.orElse("0000");
+		} else {
+			throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES);
 		}
-
-		return type;
 	}

 	private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
@ -197,12 +203,12 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected List<Oaf> createOafs(
 		final Document doc,
 		final String type,
-		final List<Instance> instances,
+		final Instance instance,
 		final KeyValue collectedFrom,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {

-		final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
+		final OafEntity entity = createEntity(doc, type, instance, collectedFrom, info, lastUpdateTimestamp);

 		final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
 		originalId.add(entity.getId());
@ -235,19 +241,19 @@ public abstract class AbstractMdRecordToOafMapper {

 	private OafEntity createEntity(final Document doc,
 		final String type,
-		final List<Instance> instances,
+		final Instance instance,
 		final KeyValue collectedFrom,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
 		switch (type.toLowerCase()) {
 			case "publication":
 				final Publication p = new Publication();
-				populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(p, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				p.setJournal(prepareJournal(doc, info));
 				return p;
 			case "dataset":
 				final Dataset d = new Dataset();
-				populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(d, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				d.setStoragedate(prepareDatasetStorageDate(doc, info));
 				d.setDevice(prepareDatasetDevice(doc, info));
 				d.setSize(prepareDatasetSize(doc, info));
@ -258,7 +264,7 @@ public abstract class AbstractMdRecordToOafMapper {
 				return d;
 			case "software":
 				final Software s = new Software();
-				populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(s, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
 				s.setLicense(prepareSoftwareLicenses(doc, info));
 				s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
@ -268,7 +274,7 @@ public abstract class AbstractMdRecordToOafMapper {
 			case "otherresearchproducts":
 			default:
 				final OtherResearchProduct o = new OtherResearchProduct();
-				populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(o, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
 				o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
 				o.setTool(prepareOtherResearchProductTools(doc, info));
@ -415,7 +421,7 @@ public abstract class AbstractMdRecordToOafMapper {
 	private void populateResultFields(
 		final Result r,
 		final Document doc,
-		final List<Instance> instances,
+		final Instance instance,
 		final KeyValue collectedFrom,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
@ -449,8 +455,8 @@ public abstract class AbstractMdRecordToOafMapper {
 		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
 		r.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
 		r.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
-		r.setInstance(instances);
-		r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
+		r.setInstance(Arrays.asList(instance));
+		r.setBestaccessright(OafMapperUtils.createBestAccessRights(Arrays.asList(instance)));
 		r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
 	}

@ -509,7 +515,7 @@ public abstract class AbstractMdRecordToOafMapper {

 	protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);

-	protected abstract List<Instance> prepareInstances(
+	protected abstract Instance prepareInstances(
 		Document doc,
 		DataInfo info,
 		KeyValue collectedfrom,
@ -657,13 +663,21 @@ public abstract class AbstractMdRecordToOafMapper {
 			final Node n = (Node) o;
 			final String classId = n.valueOf(xpathClassId).trim();
 			if (this.vocs.termExists(schemeId, classId)) {
-				res
-					.add(
-						HashableStructuredProperty
-							.newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
+				final String value = n.getText();
+				if (StringUtils.isNotBlank(value)) {
+					res
+						.add(
+							HashableStructuredProperty
+								.newInstance(value, this.vocs.getTermAsQualifier(schemeId, classId), info));
+				}
 			}
 		}
-		return Lists.newArrayList(res);
+		return res
+			.stream()
+			.filter(Objects::nonNull)
+			.filter(p -> StringUtils.isNotBlank(p.getValue()))
+			.filter(p -> StringUtils.isNotBlank(p.getValue().trim()))
+			.collect(Collectors.toList());
 	}

 	protected List<StructuredProperty> prepareListStructProps(
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@ -133,7 +133,7 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication {
 					inputRdd
 						.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
 						.groupByKey()
-						.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
+						.map(t -> MergeUtils.mergeGroup(t._2.iterator())),
 					// .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
 					// .reduceByKey(MergeUtils::merge)
 					// .map(Tuple2::_2),
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -519,6 +519,28 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 						r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
 						r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
 						break;
+					case "resultOrganization_affiliation_isAuthorInstitutionOf":
+						if (!"organization".equals(sourceType)) {
+							throw new IllegalStateException(
+								String
+									.format(
+										"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
+										semantics));
+						}
+						r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
+						r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
+						break;
+					case "resultOrganization_affiliation_hasAuthorInstitution":
+						if (!"organization".equals(targetType)) {
+							throw new IllegalStateException(
+								String
+									.format(
+										"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
+										semantics));
+						}
+						r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
+						r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
+						break;
 					default:
 						throw new IllegalArgumentException("claim semantics not managed: " + semantics);
 				}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
@ -135,7 +135,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 	}

 	@Override
-	protected List<Instance> prepareInstances(
+	protected Instance prepareInstances(
 		final Document doc,
 		final DataInfo info,
 		final KeyValue collectedfrom,
@ -197,7 +197,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 			instance.getUrl().addAll(validUrl);
 		}

-		return Lists.newArrayList(instance);
+		return instance;
 	}

 	/**
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@ -126,7 +126,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 	}

 	@Override
-	protected List<Instance> prepareInstances(
+	protected Instance prepareInstances(
 		final Document doc,
 		final DataInfo info,
 		final KeyValue collectedfrom,
@ -210,7 +210,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 			instance.setUrl(new ArrayList<>());
 			instance.getUrl().addAll(validUrl);
 		}
-		return Arrays.asList(instance);
+		return instance;
 	}

 	protected String trimAndDecodeUrl(String url) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java
@ -80,9 +80,6 @@ public class PatchRelationsApplication {
 		final Dataset<Relation> rels = readPath(spark, relationPath, Relation.class);
 		final Dataset<RelationIdMapping> idMapping = readPath(spark, idMappingPath, RelationIdMapping.class);

-		log.info("relations: {}", rels.count());
-		log.info("idMapping: {}", idMapping.count());
-
 		final Dataset<Relation> bySource = rels
 			.joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
 			.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
@ -22,5 +22,11 @@
    "paramLongName": "targetPath",
    "paramDescription": "the output path of the graph enriched",
    "paramRequired": true
+  },
+  {
+    "paramName": "wp",
+    "paramLongName": "workingDir",
+    "paramDescription": "the working dir",
+    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml
@ -51,6 +51,7 @@
            <arg>--orcidPath</arg><arg>${orcidPath}</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--graphPath</arg><arg>${graphPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="reset_outputpath"/>
@ -89,6 +90,14 @@
            <arg>${nameNode}/${graphPath}/project</arg>
            <arg>${nameNode}/${targetPath}/project</arg>
        </distcp>
+        <ok to="copy_person"/>
+        <error to="Kill"/>
+    </action>
+    <action name="copy_person">
+        <distcp xmlns="uri:oozie:distcp-action:0.2">
+            <arg>${nameNode}/${graphPath}/person</arg>
+            <arg>${nameNode}/${targetPath}/person</arg>
+        </distcp>
        <ok to="copy_relation"/>
        <error to="Kill"/>
    </action>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -142,6 +142,7 @@
        <path start="clean_datasource"/>
        <path start="clean_organization"/>
        <path start="clean_project"/>
+        <path start="clean_person"/>
        <path start="clean_relation"/>
    </fork>

@ -161,6 +162,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
@ -196,6 +198,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
@ -231,6 +234,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
@ -266,6 +270,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=2000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
@ -301,6 +306,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
@ -336,6 +342,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
@ -371,6 +378,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=2000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
@ -390,6 +398,42 @@
        <error to="Kill"/>
    </action>

+    <action name="clean_person">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean person</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
+                --conf spark.sql.shuffle.partitions=2000
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphInputPath}/person</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/person</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+            <arg>--country</arg><arg>${country}</arg>
+            <arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
+            <arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
+            <arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
+            <arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
+            <arg>--deepClean</arg><arg>${shouldClean}</arg>
+        </spark>
+        <ok to="wait_clean"/>
+        <error to="Kill"/>
+    </action>
+
    <action name="clean_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -406,6 +450,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=20000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -102,6 +102,7 @@
        <path start="import_datasource"/>
        <path start="import_organization"/>
        <path start="import_project"/>
+        <path start="import_person"/>
        <path start="import_relation"/>
    </fork>

@ -308,6 +309,35 @@
        <error to="Kill"/>
    </action>

+    <action name="import_person">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Import table person</name>
+            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=1000
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${inputPath}/person</arg>
+            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
+            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--numPartitions</arg><arg>1000</arg>
+        </spark>
+        <ok to="join_import"/>
+        <error to="Kill"/>
+    </action>
+
    <action name="import_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
@ -68,6 +68,7 @@
        <path start="merge_datasource"/>
        <path start="merge_organization"/>
        <path start="merge_project"/>
+        <path start="merge_person"/>
        <path start="merge_relation"/>
    </fork>

@ -260,6 +261,33 @@
        <error to="Kill"/>
    </action>

+    <action name="merge_person">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Merge person</name>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/person</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/person</arg>
+            <arg>--outputPath</arg><arg>${graphOutputPath}/person</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--priority</arg><arg>${priority}</arg>
+        </spark>
+        <ok to="wait_merge"/>
+        <error to="Kill"/>
+    </action>
+
    <action name="merge_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
@ -649,6 +649,7 @@
        <path start="merge_claims_datasource"/>
        <path start="merge_claims_organization"/>
        <path start="merge_claims_project"/>
+        <path start="merge_claims_person"/>
        <path start="merge_claims_relation"/>
    </fork>

@ -860,6 +861,32 @@
        <error to="Kill"/>
    </action>

+    <action name="merge_claims_person">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>MergeClaims_person</name>
+            <class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=200
+            </spark-opts>
+            <arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
+            <arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
+            <arg>--outputRawGaphPath</arg><arg>${graphOutputPath}</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+        </spark>
+        <ok to="wait_merge"/>
+        <error to="Kill"/>
+    </action>
+
    <join name="wait_merge" to="decisionPatchRelations"/>

    <decision name="decisionPatchRelations">
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -47,13 +47,15 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
    log.info(s"orcidPath is '$orcidPath'")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath is '$targetPath'")
+    val workingDir = parser.get("workingDir")
+    log.info(s"workingDir is '$workingDir'")

-    createTemporaryData(graphPath, orcidPath, targetPath)
-    analisys(targetPath)
-    generateGraph(graphPath, targetPath)
+    createTemporaryData(graphPath, orcidPath, workingDir)
+    analisys(workingDir)
+    generateGraph(graphPath, workingDir, targetPath)
  }

-  private def generateGraph(graphPath: String, targetPath: String): Unit = {
+  private def generateGraph(graphPath: String, workingDir: String, targetPath: String): Unit = {

    ModelSupport.entityTypes.asScala
      .filter(e => ModelSupport.isResult(e._1))
@ -63,7 +65,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]

        val matched = spark.read
          .schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema)
-          .parquet(s"${targetPath}/${resultType}_matched")
+          .parquet(s"${workingDir}/${resultType}_matched")
          .selectExpr("id", "enriched_author")

        spark.read
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
@ -133,7 +133,7 @@ object SparkCreateInputGraph {
    val ds: Dataset[T] = spark.read.load(sourcePath).as[T]

    ds.groupByKey(_.getId)
-      .mapGroups { (id, it) => MergeUtils.mergeGroup(id, it.asJava).asInstanceOf[T] }
+      .mapGroups { (id, it) => MergeUtils.mergeGroup(it.asJava).asInstanceOf[T] }
 //      .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] }
 //      .map(_)
      .write
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
@ -30,6 +30,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

@ -365,6 +367,40 @@ class MigrateDbEntitiesApplicationTest {
 		assertValidId(r2.getCollectedfrom().get(0).getKey());
 	}

+	@Test
+	void testProcessClaims_affiliation() throws Exception {
+		final List<TypedField> fields = prepareMocks("claimsrel_resultset_affiliation.json");
+
+		final List<Oaf> list = app.processClaims(rs);
+
+		assertEquals(2, list.size());
+		verifyMocks(fields);
+
+		assertTrue(list.get(0) instanceof Relation);
+		assertTrue(list.get(1) instanceof Relation);
+
+		final Relation r1 = (Relation) list.get(0);
+		final Relation r2 = (Relation) list.get(1);
+
+		assertValidId(r1.getSource());
+		assertValidId(r1.getTarget());
+		assertValidId(r2.getSource());
+		assertValidId(r2.getTarget());
+		assertNotNull(r1.getDataInfo());
+		assertNotNull(r2.getDataInfo());
+		assertNotNull(r1.getDataInfo().getTrust());
+		assertNotNull(r2.getDataInfo().getTrust());
+		assertEquals(r1.getSource(), r2.getTarget());
+		assertEquals(r2.getSource(), r1.getTarget());
+		assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
+		assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
+		assertTrue(StringUtils.isNotBlank(r1.getRelType()));
+		assertTrue(StringUtils.isNotBlank(r2.getRelType()));
+
+		assertValidId(r1.getCollectedfrom().get(0).getKey());
+		assertValidId(r2.getCollectedfrom().get(0).getKey());
+	}
+
 	private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
 		final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
 		final ObjectMapper mapper = new ObjectMapper();
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json
@ -0,0 +1,27 @@
+[
+	{
+		"field": "source_type",
+		"type": "string",
+		"value": "organization"
+	},
+	{
+		"field": "source_id",
+		"type": "string",
+		"value": "openorgs____::b5ca9d4340e26454e367e2908ef3872f"
+	},
+	{
+		"field": "target_type",
+		"type": "string",
+		"value": "software"
+	},
+	{
+		"field": "target_id",
+		"type": "string",
+		"value": "userclaim___::bde53826d07c8cf47c99222a375cd2e8"
+	},
+	{
+		"field": "semantics",
+		"type": "string",
+		"value": "resultOrganization_affiliation_isAuthorInstitutionOf"
+	}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
    assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
    // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
  }
+
  @Test def testDocumentationNames(): Unit = {
    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
  }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

+import java.util.Comparator;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
@ -167,8 +168,9 @@ public class CreateRelatedEntitiesJob_phase1 {
 					result
 						.getDescription()
 						.stream()
-						.findFirst()
+						.filter(d -> Objects.nonNull(d.getValue()))
 						.map(Field::getValue)
+						.max(Comparator.comparingInt(String::length))
 						.ifPresent(
 							d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)));
 				}
@ -231,6 +233,14 @@ public class CreateRelatedEntitiesJob_phase1 {
 				if (!f.isEmpty()) {
 					re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList()));
 				}
+				break;
+			case person:
+				final Person person = (Person) entity;
+
+				re.setGivenName(person.getGivenName());
+				re.setFamilyName(person.getFamilyName());
+				re.setAlternativeNames(person.getAlternativeNames());
+
 				break;
 		}
 		return re;
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
@ -2,10 +2,12 @@
 package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits.MAX_RELATIONS_BY_RELCLASS;
 import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;

 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;

 import org.apache.commons.io.IOUtils;
@ -15,11 +17,13 @@ import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.util.LongAccumulator;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.annotation.JsonInclude;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -27,11 +31,13 @@ import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
 import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
+import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
 import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
 import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
 import eu.dnetlib.dhp.schema.solr.SolrRecord;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -124,6 +130,9 @@ public class PayloadConverterJob {
 					.map(Oaf::getDataInfo)
 					.map(DataInfo::getDeletedbyinference)
 					.orElse(false))
+			.map(
+				(MapFunction<JoinedEntity, JoinedEntity>) PayloadConverterJob::pruneRelatedEntities,
+				Encoders.kryo(JoinedEntity.class))
 			.map(
 				(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
 					recordFactory.build(je, validateXML),
@ -139,6 +148,32 @@ public class PayloadConverterJob {
 			.json(outputPath);
 	}

+	/**
+	 * Retains, in their original order, at most as many links per relClass as allowed by
+	 * eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits#MAX_RELATIONS_BY_RELCLASS; relClasses without an entry are unbounded.
+	 * @param je the JoinedEntity whose links are pruned in place (a null link list is left untouched)
+	 * @return the same JoinedEntity instance received as input
+	 */
+	private static JoinedEntity pruneRelatedEntities(JoinedEntity je) {
+		if (je.getLinks() == null) {
+			return je;
+		}
+		final Map<String, Long> freqs = Maps.newHashMap();
+		final List<RelatedEntityWrapper> rew = Lists.newArrayList();
+		for (RelatedEntityWrapper link : je.getLinks()) {
+			final String relClass = link.getRelation().getRelClass();
+			final long count = freqs.getOrDefault(relClass, 0L);
+			final long max = MAX_RELATIONS_BY_RELCLASS.getOrDefault(relClass, Long.MAX_VALUE);
+			// strict comparison: "count <= max" let one link beyond the limit through for every relClass
+			if (count < max) {
+				rew.add(link);
+				freqs.put(relClass, count + 1);
+			}
+		}
+		je.setLinks(rew);
+		return je;
+	}
+
 	private static void removeOutputDir(final SparkSession spark, final String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
@ -23,6 +23,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
 import eu.dnetlib.dhp.schema.solr.*;
 import eu.dnetlib.dhp.schema.solr.AccessRight;
 import eu.dnetlib.dhp.schema.solr.Author;
@ -37,6 +38,8 @@ import eu.dnetlib.dhp.schema.solr.Measure;
 import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
 import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
 import eu.dnetlib.dhp.schema.solr.Organization;
+import eu.dnetlib.dhp.schema.solr.Person;
+import eu.dnetlib.dhp.schema.solr.PersonTopic;
 import eu.dnetlib.dhp.schema.solr.Pid;
 import eu.dnetlib.dhp.schema.solr.Project;
 import eu.dnetlib.dhp.schema.solr.Result;
@ -89,6 +92,8 @@ public class ProvisionModelSupport {
 			r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
 		} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
 			r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
+		} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Person) {
+			r.setPerson(mapPerson((eu.dnetlib.dhp.schema.oaf.Person) e));
 		}
 		r
 			.setLinks(
@ -108,7 +113,7 @@ public class ProvisionModelSupport {
 		RelatedRecord rr = new RelatedRecord();

 		final RelatedEntity re = rew.getTarget();
-		final RecordType relatedRecordType = RecordType.valueOf(re.getType());
+		final RecordType relatedRecordType = RecordType.fromString(re.getType());
 		final Relation relation = rew.getRelation();
 		final String relationProvenance = Optional
 			.ofNullable(relation.getDataInfo())
@ -150,6 +155,17 @@ public class ProvisionModelSupport {
 		rr.setPublisher(re.getPublisher());
 		rr.setResulttype(mapQualifier(re.getResulttype()));
 		rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));
+		rr.setDescription(StringUtils.left(re.getDescription(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH));
+		rr
+			.setAuthor(
+				Optional
+					.ofNullable(re.getAuthor())
+					.map(
+						aa -> aa
+							.stream()
+							.limit(ModelHardLimits.MAX_RELATED_AUTHORS)
+							.collect(Collectors.toList()))
+					.orElse(null));

 		if (relation.getValidated() == null) {
 			relation.setValidated(false);
@ -185,6 +201,18 @@ public class ProvisionModelSupport {
 		return ps;
 	}

+	private static Person mapPerson(eu.dnetlib.dhp.schema.oaf.Person p) {
+		// copies names, biography and consent onto a new Solr-model Person; subjects are not mapped yet
+		final Person solrPerson = new Person();
+		solrPerson.setFamilyName(p.getFamilyName());
+		solrPerson.setGivenName(p.getGivenName());
+		solrPerson.setAlternativeNames(p.getAlternativeNames());
+		solrPerson.setBiography(p.getBiography());
+		solrPerson.setConsent(p.getConsent());
+		// solrPerson.setSubject(...));
+		return solrPerson;
+	}
+
 	private static Funding mapFunding(List<String> fundingtree, VocabularyGroup vocs) {
 		SAXReader reader = new SAXReader();
 		return Optional
@ -378,6 +406,7 @@ public class ProvisionModelSupport {
 		rs.setPubliclyFunded(r.getPubliclyFunded());
 		rs.setTransformativeAgreement(r.getTransformativeAgreement());
 		rs.setExternalReference(mapExternalReference(r.getExternalReference()));
+		rs.setBestinstancetype(mapQualifier(r.getBestInstancetype()));
 		rs.setInstance(mapInstances(r.getInstance()));

 		if (r instanceof Publication) {
@ -667,14 +696,23 @@ public class ProvisionModelSupport {
 	}

 	private static List<Author> asAuthor(List<eu.dnetlib.dhp.schema.oaf.Author> authorList) {
+		return asAuthor(authorList, ModelHardLimits.MAX_AUTHORS);
+	}
+
+	private static List<Author> asAuthor(List<eu.dnetlib.dhp.schema.oaf.Author> authorList, int maxAuthors) {
 		return Optional
 			.ofNullable(authorList)
 			.map(
 				authors -> authors
 					.stream()
+					.limit(maxAuthors)
 					.map(
 						a -> Author
-							.newInstance(a.getFullname(), a.getName(), a.getSurname(), a.getRank(), asPid(a.getPid())))
+							.newInstance(
+								StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH),
+								a.getName(),
+								a.getSurname(),
+								a.getRank(), asPid(a.getPid())))
 					.collect(Collectors.toList()))
 			.orElse(null);
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java
@ -51,6 +51,11 @@ public class RelatedEntity implements Serializable {
 	private Qualifier contracttype;
 	private List<String> fundingtree;

+	// person
+	private String givenName;
+	private String familyName;
+	private List<String> alternativeNames;
+
 	public String getId() {
 		return id;
 	}
@ -251,6 +256,30 @@ public class RelatedEntity implements Serializable {
 		this.fundingtree = fundingtree;
 	}

+	public String getGivenName() {
+		return givenName;
+	}
+
+	public void setGivenName(String givenName) {
+		this.givenName = givenName;
+	}
+
+	public String getFamilyName() {
+		return familyName;
+	}
+
+	public void setFamilyName(String familyName) {
+		this.familyName = familyName;
+	}
+
+	public List<String> getAlternativeNames() {
+		return alternativeNames;
+	}
+
+	public void setAlternativeNames(List<String> alternativeNames) {
+		this.alternativeNames = alternativeNames;
+	}
+
 	@Override
 	public boolean equals(Object o) {
 		if (this == o)
@ -280,7 +309,10 @@ public class RelatedEntity implements Serializable {
 			&& Objects.equal(code, that.code)
 			&& Objects.equal(acronym, that.acronym)
 			&& Objects.equal(contracttype, that.contracttype)
-			&& Objects.equal(fundingtree, that.fundingtree);
+			&& Objects.equal(fundingtree, that.fundingtree)
+			&& Objects.equal(givenName, that.givenName)
+			&& Objects.equal(familyName, that.familyName)
+			&& Objects.equal(alternativeNames, that.alternativeNames);
 	}

 	@Override
@ -309,6 +341,9 @@ public class RelatedEntity implements Serializable {
 				code,
 				acronym,
 				contracttype,
-				fundingtree);
+				fundingtree,
+				familyName,
+				givenName,
+				alternativeNames);
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
@ -1035,6 +1035,48 @@ public class XmlRecordFactory implements Serializable {
 								.collect(Collectors.toList()));
 				}

+				break;
+			case person:
+				final Person person = (Person) entity;
+
+				if (person.getGivenName() != null) {
+					metadata.add(XmlSerializationUtils.asXmlElement("givenname", person.getGivenName()));
+				}
+				if (person.getFamilyName() != null) {
+					metadata.add(XmlSerializationUtils.asXmlElement("familyname", person.getFamilyName()));
+				}
+				if (person.getAlternativeNames() != null) {
+					metadata
+						.addAll(
+							person
+								.getAlternativeNames()
+								.stream()
+								.map(altName -> XmlSerializationUtils.asXmlElement("alternativename", altName))
+								.collect(Collectors.toList()));
+				}
+				if (person.getBiography() != null) {
+					metadata.add(XmlSerializationUtils.asXmlElement("biography", person.getBiography()));
+				}
+				if (person.getSubject() != null) {
+					metadata
+						.addAll(
+							person
+								.getSubject()
+								.stream()
+								.map(pt -> {
+									List<Tuple2<String, String>> attrs = Lists.newArrayList();
+									attrs.add(new Tuple2<>("schema", pt.getSchema()));
+									attrs.add(new Tuple2<>("value", pt.getValue()));
+									attrs.add(new Tuple2<>("fromYear", String.valueOf(pt.getFromYear())));
+									attrs.add(new Tuple2<>("toYear", String.valueOf(pt.getToYear())));
+									return XmlSerializationUtils.asXmlElement("subject", attrs);
+								})
+								.collect(Collectors.toList()));
+				}
+				if (person.getConsent() != null) {
+					metadata.add(XmlSerializationUtils.asXmlElement("consent", String.valueOf(person.getConsent())));
+				}
+
 				break;
 			default:
 				throw new IllegalArgumentException("invalid entity type: " + type);
@ -1240,6 +1282,25 @@ public class XmlRecordFactory implements Serializable {
 								.collect(Collectors.toList()));
 				}
 				break;
+
+			case person:
+
+				if (isNotBlank(re.getGivenName())) {
+					metadata.add(XmlSerializationUtils.asXmlElement("givenname", re.getGivenName()));
+				}
+				if (isNotBlank(re.getFamilyName())) {
+					metadata.add(XmlSerializationUtils.asXmlElement("familyname", re.getFamilyName()));
+				}
+				if (re.getAlternativeNames() != null && !re.getAlternativeNames().isEmpty()) {
+					metadata
+						.addAll(
+							re
+								.getAlternativeNames()
+								.stream()
+								.map(name -> XmlSerializationUtils.asXmlElement("alternativename", name))
+								.collect(Collectors.toList()));
+				}
+				break;
 			default:
 				throw new IllegalArgumentException("invalid target type: " + targetType);
 		}
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -180,6 +180,7 @@
        <path start="join_relation_datasource"/>
        <path start="join_relation_organization"/>
        <path start="join_relation_project"/>
+        <path start="join_relation_person"/>
    </fork>

    <action name="join_relation_publication">
@ -378,6 +379,34 @@
        <error to="Kill"/>
    </action>

+    <action name="join_relation_person">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[relation.target = person.id]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=5000
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_partial/person</arg>
+        </spark>
+        <ok to="wait_joins"/>
+        <error to="Kill"/>
+    </action>
+
    <join name="wait_joins" to="fork_join_all_entities"/>

    <fork name="fork_join_all_entities">
@ -388,6 +417,7 @@
        <path start="join_datasource_relations"/>
        <path start="join_organization_relations"/>
        <path start="join_project_relations"/>
+        <path start="join_person_relations"/>
    </fork>

    <action name="join_publication_relations">
@ -593,6 +623,35 @@
        <error to="Kill"/>
    </action>

+    <action name="join_person_relations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Join[person.id = relatedEntity.source]</name>
+            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCoresForJoining}
+                --executor-memory=${sparkExecutorMemoryForJoining}
+                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=5000
+                --conf spark.network.timeout=${sparkNetworkTimeout}
+            </spark-opts>
+            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/join_entities/person</arg>
+            <arg>--numPartitions</arg><arg>10000</arg>
+        </spark>
+        <ok to="wait_join_phase2"/>
+        <error to="Kill"/>
+    </action>
+
    <join name="wait_join_phase2" to="create_payloads"/>

    <action name="create_payloads">
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh
@ -1,63 +0,0 @@
-#/usr/bin/bash
-
-# Read log files from ranking scripts and create a two-line file  
-# with score limits for the various measures. To be used by Kleanthis
-
-attrank_file=$(ls *attrank*.log);
-pr_file=$(ls *pagerank*.log)
-ram_file=$(ls *ram*.log);
-cc_file=$(ls *cc*.log);
-impulse_file=$(ls *impulse*.log);
-
-echo
-echo "-----------------------------"
-echo "Attrank file:${attrank_file}";
-echo "PageRank file:${pr_file}";
-echo "RAM file:${ram_file}";
-echo "CC file:${cc_file}";
-echo "Impulse file:${impulse_file}";
-echo "-----------------------------"
-echo
-echo
-
-# output file will be called score_limits.csv
-echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv
-# ---------------------------------------------------- #
-# Get respective score limits (we don't need RAM)
-inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2);
-inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2);
-inf_1=$(grep "^1%" ${pr_file} | cut -f 2);
-inf_10=$(grep "^10%" ${pr_file} | cut -f 2);
-echo "Influnence limits:"
-echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}";
-# ---------------------------------------------------- #
-pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2);
-pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2);
-pop_1=$(grep "^1%" ${attrank_file} | cut -f 2);
-pop_10=$(grep "^10%" ${attrank_file} | cut -f 2);
-echo "Popularity limits:";
-echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}";
-# ---------------------------------------------------- #
-imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2);
-imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2);
-imp_1=$(grep "^1%" ${impulse_file} | cut -f 2);
-imp_10=$(grep "^10%" ${impulse_file} | cut -f 2);
-echo "Popularity limits:";
-echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}";
-# ---------------------------------------------------- #
-cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2);
-cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2);
-cc_1=$(grep "^1%" ${cc_file} | cut -f 2);
-cc_10=$(grep "^10%" ${cc_file} | cut -f 2);
-echo "Popularity limits:";
-echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}";
-# ---------------------------------------------------- #
-
-echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv
-
-echo
-echo "score_limits.csv contents:"
-cat score_limits.csv
-
-echo;
-echo;
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
@ -1,60 +0,0 @@
-import json
-import sys
-from pyspark.sql import SparkSession
-from pyspark import SparkConf, SparkContext
-
-if len(sys.argv) != 3:
-    print("Usage: map_openaire_ids_to_dois.py <hdfs_src_dir> <hdfs_output_dir>")
-    sys.exit(-1)
-
-conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs')
-sc = SparkContext(conf = conf)
-spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate()
-sc.setLogLevel('OFF')
-
-src_dir = sys.argv[1]
-output = sys.argv[2]
-
-# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/"
-# output = '/tmp/openaireid_to_dois/'
-
-def transform(doc):
-    
-    # get publication year from 'doc.dateofacceptance.value'
-    dateofacceptance = doc.get('dateofacceptance', {}).get('value')
-
-    year = 0 
-    
-    if (dateofacceptance is not None):
-        year = dateofacceptance.split('-')[0]
-
-    # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi'
-    dois = [ pid['value'] for pid in doc.get('pid', [])  if (pid.get('qualifier', {}).get('classid') == 'doi' and pid['value'] is not None)]
-
-    num_dois = len(dois)
-    
-    # exlcude openaire ids that do not correspond to DOIs
-    if (num_dois == 0): 
-        return None
-        
-    fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ]
-    
-    return '\t'.join([ v.encode('utf-8') for v in fields ])
-    
-docs = None
-
-for result_type in ["publication", "dataset", "software", "otherresearchproduct"]:
-    
-    tmp = sc.textFile(src_dir + result_type).map(json.loads)
-    
-    if (docs is None):
-        docs = tmp
-    else:
-        # append all result types in one RDD
-        docs = docs.union(tmp)
-
-docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False)
-
-docs = docs.map(transform).filter(lambda d: d is not None)
-
-docs.saveAsTextFile(output)
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
@ -1,168 +0,0 @@
-#!/usr/bin/python
-# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
-# and uses this mapping to create doi-based score files in the format required by BiP! DB.
-# This is done by reading each openaire-id based ranking file and joining the openaire based
-# score and classes to all the corresponding dois.
-#################################################################################################
-# Imports
-import sys
-
-# Sparksession lib to communicate with cluster via session object
-from pyspark.sql import SparkSession
-
-# Import sql types to define schemas
-from pyspark.sql.types import *
-
-# Import sql functions with shorthand alias
-import pyspark.sql.functions as F
-
-from pyspark.sql.functions import max
-# from pyspark.sql.functions import udf
-#################################################################################################
-#################################################################################################
-# Clean up directory name - no longer needed in final workflow version
-'''
-def clean_directory_name(dir_name):
-    # We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_* 
-    # and we need to keep the parts in *	
-
-    
-    dir_name_parts = dir_name.split('_')
-    dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
-    
-    dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
-    clean_name = dir_name + ".txt.gz"
-
-    # clean_name = '_'.join(dir_name_parts)
-
-    # if '_ids' not in clean_name:
-    #     clean_name = clean_name.replace('id_', 'ids_')
-        	
-    # clean_name = clean_name.replace('.txt', '')
-    # clean_name = clean_name.replace('.gz', '')
-
-    # if 'openaire_ids_' in clean_name:
-    #     clean_name = clean_name.replace('openaire_ids_', '')
-        # clean_name = clean_name + '.txt.gz'
-    # else:
-        # clean_name = clean_name + '.txt.gz'
-	
-    return clean_name
-'''
-#################################################################################################
-if len(sys.argv) < 3:
-    print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
-    sys.exit(-1)
-
-# Read arguments
-synonyms_folder = sys.argv[1]
-num_partitions = int(sys.argv[2])
-input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
-# input_file_list = [clean_directory_name(item) for item in input_file_list]
-
-# Prepare output specific variables
-output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
-output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]
-
-# --- INFO MESSAGES --- #
-print ("\n\n----------------------------")
-print ("Mpping openaire ids to DOIs")
-print ("Reading input from: " + synonyms_folder)
-print ("Num partitions: " + str(num_partitions))
-print ("Input files:" + " -- ".join(input_file_list))
-print ("Output files: " + " -- ".join(output_file_list))
-print ("----------------------------\n\n")
-#######################################################################################
-# We weill define the following schemas:
-# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non printable character)
-# --> a schema for floating point ranking scores [string - float - string]  (the latter string is the class)
-# --> a schema for integer ranking scores [string - int - string]  (the latter string is the class)
-
-float_schema = StructType([
-	StructField('id', StringType(), False),
-	StructField('score', FloatType(), False),
-	StructField('class', StringType(), False)
-	])
-	
-int_schema = StructType([
-	StructField('id', StringType(), False),
-	StructField('score', IntegerType(), False),
-	StructField('class', StringType(), False)
-	])
-	
-# This schema concerns the output of the file
-# containing the number of references of each doi
-synonyms_schema = StructType([
-	StructField('id', StringType(), False),
-	StructField('num_synonyms', IntegerType(), False),
-    StructField('doi_list', StringType(), False),
-	])
-#######################################################################################
-# Start spark session
-spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate()
-# Set Log Level for spark session
-spark.sparkContext.setLogLevel('WARN')
-#######################################################################################
-# MAIN Program
-
-# Read and repartition the synonym folder - also cache it since we will need to perform multiple joins
-synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder)
-synonym_df = synonym_df.select('id',  F.split(F.col('doi_list'), chr(0x02)).alias('doi_list'))
-synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache()
-
-# TESTING
-# print ("Synonyms: " + str(synonym_df.count()))
-# print ("DF looks like this:" )
-# synonym_df.show(1000, False)
-
-print ("\n\n-----------------------------")
-# Now we need to join the score files on the openaire-id with the synonyms and then keep
-# only doi - score - class and write this to the output
-for offset, input_file in enumerate(input_file_list):
-
-    print ("Mapping scores from " + input_file)
-
-    # Select correct schema
-    schema = int_schema
-    if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower():
-        schema = float_schema
-    
-    # Load file to dataframe
-    ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id')
-
-    # Get max score
-    max_score = ranking_df.select(max('score').alias('max')).collect()[0]['max']
-    print ("Max Score for " + str(input_file) + " is " + str(max_score))
-   
-    # TESTING
-    # print ("Loaded df sample:")
-    # ranking_df.show(1000, False)
-
-    # Join scores to synonyms and keep required fields
-    doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi').cache()
-    # Write output
-    output_file = output_file_list[offset]
-    print ("Writing to: " + output_file)
-    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
-    
-    # Creata another file for the bip update process
-    ranking_df = ranking_df.select('id', 'score', F.lit(F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup'))
-    doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi').cache()
-    output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz")
-    print ("Writing bip update to: " + output_file)
-    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
- 
-    
-    # Free memory?
-    ranking_df.unpersist(True)
-
-print ("-----------------------------")
-print ("\n\nFinished!\n\n")
-
-
-
-
-
-
-
-
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@ -17,10 +17,6 @@
 				<name>openaireGraphInputPath</name>
 				<value>${nameNode}/${workingDir}/openaire_id_graph</value>
 			</property>
-			<property>
-				<name>synonymFolder</name>
-				<value>${nameNode}/${workingDir}/openaireid_to_dois/</value>
-			</property>
 			<property>
 				<name>checkpointDir</name>
 				<value>${nameNode}/${workingDir}/check/</value>
@ -32,29 +28,34 @@
 		</configuration>
 	</global>

-	<!-- start using a decision node, so as to determine from which point onwards a job will continue -->
+	<!-- Start using a decision node, to determine from which point onwards a job will continue -->
 	<start to="entry-point-decision" />

 	<decision name="entry-point-decision">
 		<switch>
-			<!-- The default will be set as the normal start, a.k.a. get-doi-synonyms -->
-			<!-- If any different condition is set, go to the corresponding start -->
+
+			<!-- Start from creating the citation network (i.e., normal execution should start from here) -->
+			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
+
+			<!-- Different citation-based impact indicators are computed -->
 			<case to="spark-cc">${wf:conf('resume') eq "cc"}</case>
 			<case to="spark-ram">${wf:conf('resume') eq "ram"}</case>
 			<case to="spark-impulse">${wf:conf('resume') eq "impulse"}</case>
 			<case to="spark-pagerank">${wf:conf('resume') eq "pagerank"}</case>
 			<case to="spark-attrank">${wf:conf('resume') eq "attrank"}</case>
-			<!-- <case to="iterative-rankings">${wf:conf('resume') eq "rankings-iterative"}</case> -->
-			<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
-			<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
-			<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
-			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>

-			<!-- Aggregation of impact scores on the project level		-->
+			<!-- Format the results appropriately before transforming them to action sets -->
+			<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
+
+			<!-- Aggregation of impact scores on the project level -->
 			<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
+
+			<!-- Create action sets -->
 			<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>

+			<!-- The default will be set as the normal start, a.k.a. create-openaire-ranking-graph -->
 			<default to="create-openaire-ranking-graph" />
+
 		</switch>
 	</decision>

@ -295,18 +296,11 @@
 			<capture-output/>
 		</shell>

-		<ok to="format-result-files" />
+		<ok to="format-json-files" />
 		<error to="filename-getting-error" />

 	</action>

-	<!-- Now we will run in parallel the formatting of ranking files for BiP! DB and openaire (json files) -->
-	<fork name="format-result-files">
-		<path start="format-bip-files"/>
-		<path start="format-json-files"/>
-	</fork>
-
-
 	<!-- Format json files -->
 	<!-- Two parts: a) format files b) make the file endings .json.gz -->
 	<action name="format-json-files">
@ -345,139 +339,8 @@
 			<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
 		</spark>

-		<ok to="join-file-formatting" />
-		<error to="json-formatting-fail" />
-	</action>
-
-	<!-- This is the second line of parallel workflow execution where we create the BiP! DB files -->
-	<action name="format-bip-files">
-		<!-- This is required as a tag for spark jobs, regardless of programming language -->
-		<spark xmlns="uri:oozie:spark-action:0.2">
-
-			<!-- using configs from an example on openaire -->
-			<master>yarn-cluster</master>
-			<mode>cluster</mode>
-
-			<!-- This is the name of our job -->
-			<name>Format Ranking Results BiP! DB</name>
-			<!-- Script name goes here -->
-			<jar>format_ranking_results.py</jar>
-			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
-
-			<spark-opts>
-				--executor-memory=${sparkNormalExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkNormalDriverMemory}
-				--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
-				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
-				--conf spark.extraListeners=${spark2ExtraListeners}
-				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-			</spark-opts>
-
-			<!-- Script arguments here -->
-			<arg>zenodo</arg>
-			<!-- Input files must be identified dynamically -->
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
-			<!-- Num partitions -->
-			<arg>${sparkShufflePartitions}</arg>
-			<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
-			<arg>openaire</arg>
-			<!-- This needs to point to the file on the hdfs i think -->
-			<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
-		</spark>
-
-		<ok to="join-file-formatting" />
-		<error to="bip-formatting-fail" />
-	</action>
-
-	<!-- Finish formatting jobs -->
-	<join name="join-file-formatting" to="map-openaire-to-doi"/>
-
-	<!-- maps openaire ids to DOIs -->
-	<action name="map-openaire-to-doi">
-		<spark xmlns="uri:oozie:spark-action:0.2">
-
-			<!-- Delete previously created doi synonym folder -->
-			<prepare>
-				<delete path="${synonymFolder}"/>
-			</prepare>
-
-			<master>yarn-cluster</master>
-			<mode>cluster</mode>
-			<name>Openaire-DOI synonym collection</name>
-			<jar>map_openaire_ids_to_dois.py</jar>
-
-			<spark-opts>
-				--executor-memory=${sparkHighExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkHighDriverMemory}
-				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
-				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
-				--conf spark.extraListeners=${spark2ExtraListeners}
-				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-			</spark-opts>
-
-			<!-- Script arguments here -->
-			<arg>${openaireDataInput}/</arg>
-			<!-- number of partitions to be used on joins -->
-			<arg>${synonymFolder}</arg>
-
-			<file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
-		</spark>
-
-		<ok to="map-scores-to-dois" />
-		<error to="synonym-collection-fail" />
-
-	</action>
-
-	<!-- mapping openaire scores to DOIs -->
-	<action name="map-scores-to-dois">
-		<!-- This is required as a tag for spark jobs, regardless of programming language -->
-		<spark xmlns="uri:oozie:spark-action:0.2">
-
-			<!-- using configs from an example on openaire -->
-			<master>yarn-cluster</master>
-			<mode>cluster</mode>
-			<name>Mapping Openaire Scores to DOIs</name>
-			<jar>map_scores_to_dois.py</jar>
-
-			<spark-opts>
-				--executor-memory=${sparkHighExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkHighDriverMemory}
-				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
-				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
-				--conf spark.extraListeners=${spark2ExtraListeners}
-				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-			</spark-opts>
-
-			<!-- Script arguments here -->
-			<arg>${synonymFolder}</arg>
-			<!-- Number of partitions -->
-			<arg>${sparkShufflePartitions}</arg>
-			<!-- The remaining input are the ranking files fproduced for bip db-->
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
-
-			<file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
-		</spark>
-
 		<ok to="project-impact-indicators" />
-		<error to="map-scores-fail" />
-
+		<error to="json-formatting-fail" />
 	</action>

 	<action name="project-impact-indicators">
@ -594,18 +457,6 @@
 		<message>Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>

-	<kill name="bip-formatting-fail">
-		<message>Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-	</kill>
-
-	<kill name="synonym-collection-fail">
-		<message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-	</kill>
-
-	<kill name="map-scores-fail">
-		<message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-	</kill>
-
 	<kill name="actionset-delete-fail">
 		<message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
@ -3,8 +3,8 @@ package eu.dnetlib.dhp.swh.models;

 import java.io.Serializable;

-import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;

@JsonIgnoreProperties(ignoreUnknown = true)
 public class LastVisitData implements Serializable {
--- a/pom.xml
+++ b/pom.xml
@ -937,7 +937,7 @@
        <commons.logging.version>1.1.3</commons.logging.version>
        <commons-validator.version>1.7</commons-validator.version>
        <dateparser.version>1.0.7</dateparser.version>
-        <dhp-schemas.version>[8.0.1]</dhp-schemas.version>
+        <dhp-schemas.version>[9.0.0]</dhp-schemas.version>
        <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
        <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
        <dhp.guava.version>11.0.2</dhp.guava.version>