2024-10-25 10:20:41 +02:00
21 changed files with 771 additions and 87 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java
@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;
 import java.util.Arrays;
 import java.util.Iterator;
@ -61,7 +61,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
 	private Relation getRelation(String orcid1, String orcid2) {
 		String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
 		String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
-		return OafMapperUtils
+		Relation relation = OafMapperUtils
 			.getRelation(
 				source, target, ModelConstants.PERSON_PERSON_RELTYPE,
 				ModelConstants.PERSON_PERSON_SUBRELTYPE,
@ -76,5 +76,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
 								ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 						"0.91"),
 				null);
 		relation.setValidated(true);
 		return relation;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java
@ -1,12 +1,9 @@
-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 public class Coauthors implements Serializable {
 	private List<String> coauthors;
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@ -2,15 +2,25 @@
 package eu.dnetlib.dhp.actionmanager.personentity;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static org.apache.spark.sql.functions.*;
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Serializable;
 import java.nio.charset.StandardCharsets;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;
 import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
 import eu.dnetlib.dhp.common.person.Coauthors;
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@ -28,6 +38,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.collection.orcid.model.Author;
 import eu.dnetlib.dhp.collection.orcid.model.Employment;
 import eu.dnetlib.dhp.collection.orcid.model.Work;
 import eu.dnetlib.dhp.common.DbClient;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -45,7 +56,7 @@ import scala.Tuple2;
 public class ExtractPerson implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
-
+	private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'";
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final String OPENAIRE_PREFIX = "openaire____";
 	private static final String SEPARATOR = "::";
@ -62,8 +73,15 @@ public class ExtractPerson implements Serializable {
 	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
 	public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
 	public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
 	public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase";
 	public static final String FUNDER_AUTHORS_CLASSNAME = "Imported from Funder Database";
 	public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
 	public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
-	public static final DataInfo DATAINFO = OafMapperUtils
+	public static List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
 		.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
 	public static final DataInfo ORCIDDATAINFO = OafMapperUtils
 		.dataInfo(
 			false,
 			null,
@ -77,6 +95,20 @@ public class ExtractPerson implements Serializable {
 					ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.91");
 	public static final DataInfo FUNDERDATAINFO = OafMapperUtils
 			.dataInfo(
 					false,
 					null,
 					false,
 					false,
 					OafMapperUtils
 							.qualifier(
 									FUNDER_AUTHORS_CLASSID,
 									FUNDER_AUTHORS_CLASSNAME,
 									ModelConstants.DNET_PROVENANCE_ACTIONS,
 									ModelConstants.DNET_PROVENANCE_ACTIONS),
 					"0.91");
 	public static void main(final String[] args) throws IOException, ParseException {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -106,19 +138,130 @@ public class ExtractPerson implements Serializable {
 		final String workingDir = parser.get("workingDir");
 		log.info("workingDir {}", workingDir);
 		final String dbUrl = parser.get("postgresUrl");
 		final String dbUser = parser.get("postgresUser");
 		final String dbPassword = parser.get("postgresPassword");
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
-				createActionSet(spark, inputPath, outputPath, workingDir);
+				extractInfoForActionSetFromORCID(spark, inputPath, workingDir);
 				extractInfoForActionSetFromProjects(
 					spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode);
 				createActionSet(spark, outputPath, workingDir);
 			});
 	}
-	private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
+	private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir,
 		String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException {
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
 		FileSystem fileSystem = FileSystem.get(conf);
 		Path hdfsWritePath = new Path(hdfsPath);
 		FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
 		try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
 			try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
 				dbClient.processResults(QUERY, rs -> writeRelation(getRelationWithProject(rs), writer));
 			}
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
 	}
 	/**
 	 * Builds the person-project relation described by the current row of the given result set.
 	 * The columns {@code project}, {@code pid} and {@code role} are read, in this order.
 	 *
 	 * @param rs result set positioned on the row to convert
 	 * @return the relation produced by {@code getProjectRelation} for the row
 	 * @throws RuntimeException wrapping the {@link SQLException} raised while reading a column
 	 */
 	public static Relation getRelationWithProject(ResultSet rs) {
 		final String project;
 		final String pid;
 		final String role;
 		try {
 			project = rs.getString("project");
 			pid = rs.getString("pid");
 			role = rs.getString("role");
 		} catch (final SQLException sqlException) {
 			throw new RuntimeException(sqlException);
 		}
 		return getProjectRelation(project, pid, role);
 	}
 	/**
 	 * Creates the validated relation linking a person (identified by ORCID) to a project.
 	 *
 	 * The source is {@code PERSON_PREFIX + "::" + md5(orcid)}. The target keeps the first 14 characters of
 	 * {@code project} verbatim and appends the md5 of everything from index 15 onwards, so the character at
 	 * index 14 does not contribute to the identifier.
 	 * NOTE(review): confirm that skipping index 14 matches the format of the project identifiers in the database.
 	 *
 	 * @param project project identifier as read from the database
 	 * @param orcid ORCID of the person
 	 * @param role role of the person in the project; stored as a {@code role} property only when not blank
 	 * @return the relation, collected from OpenAIRE with the funder data info
 	 */
 	private static Relation getProjectRelation(String project, String orcid, String role) {
 		final String personId = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
 		final String projectId = project.substring(0, 14)
 			+ IdentifierFactory.md5(project.substring(15));
 		final Relation projectRelation = OafMapperUtils
 			.getRelation(
 				personId, projectId, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE,
 				ModelConstants.PROJECT_PERSON_PARTICIPATES,
 				collectedfromOpenAIRE,
 				FUNDERDATAINFO,
 				null);
 		projectRelation.setValidated(true);
 		// the role is the only property ever attached, so the list is created only when there is one
 		if (StringUtil.isNotBlank(role)) {
 			final KeyValue roleProperty = new KeyValue();
 			roleProperty.setKey("role");
 			roleProperty.setValue(role);
 			final List<KeyValue> relationProperties = new ArrayList<>();
 			relationProperties.add(roleProperty);
 			projectRelation.setProperties(relationProperties);
 		}
 		return projectRelation;
 	}
 	/**
 	 * Serialises the relation to JSON and appends it to the writer as a single line.
 	 *
 	 * @param relation the relation to serialise
 	 * @param writer destination of the JSON line
 	 * @throws RuntimeException wrapping the {@link IOException} raised by the serialisation or by the writer
 	 */
 	protected static void writeRelation(final Relation relation, BufferedWriter writer) {
 		try {
 			final String json = OBJECT_MAPPER.writeValueAsString(relation);
 			writer.write(json);
 			writer.newLine();
 		} catch (final IOException ioException) {
 			throw new RuntimeException(ioException);
 		}
 	}
 	/**
 	 * Assembles the action set from the intermediate files found under {@code workingDir}.
 	 *
 	 * The persons stored in {@code /people} and the relations stored in {@code /authorship}, {@code /coauthorship},
 	 * {@code /affiliation} and {@code /project} are each wrapped in an {@link AtomicAction} and saved together as a
 	 * BZip2-compressed sequence file.
 	 *
 	 * @param spark the active Spark session
 	 * @param outputPath destination of the sequence file
 	 * @param workingDir directory holding the intermediate files listed above
 	 */
 	private static void createActionSet(SparkSession spark, String outputPath, String workingDir) {
 		Dataset<Person> people;
 		// each line of the people dump is the JSON of one Person
 		people = spark
 			.read()
 			.textFile(workingDir + "/people")
 			.map(
 				(MapFunction<String, Person>) value -> OBJECT_MAPPER
 					.readValue(value, Person.class),
 				Encoders.bean(Person.class));
 		people
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
 			.union(
 				getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/coauthorship")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/affiliation")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/project")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			// key: canonical name of the payload class, value: JSON of the whole atomic action
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
 			.saveAsHadoopFile(
 				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 	}
 	private static void extractInfoForActionSetFromORCID(SparkSession spark, String inputPath, String workingDir) {
 		Dataset<Author> authors = spark
 			.read()
 			.parquet(inputPath + "Authors")
@ -144,18 +287,13 @@ public class ExtractPerson implements Serializable {
 			.parquet(inputPath + "Employments")
 			.as(Encoders.bean(Employment.class));
 		Dataset<Author> peopleToMap = authors
 			.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
 			.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
 			.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
 			.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));
 		Dataset<Employment> employment = employmentDataset
-			.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
+			.joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid")))
 			.map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));
-		Dataset<Person> people;
+		// Mapping all the orcid profiles even if the profile has no visible works
-		peopleToMap.map((MapFunction<Author, Person>) op -> {
+
 		authors.map((MapFunction<Author, Person>) op -> {
 			Person person = new Person();
 			person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
 			person
@ -208,7 +346,7 @@ public class ExtractPerson implements Serializable {
 							ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
 			person.setDateofcollection(op.getLastModifiedDate());
 			person.setOriginalId(Arrays.asList(op.getOrcid()));
-			person.setDataInfo(DATAINFO);
+			person.setDataInfo(ORCIDDATAINFO);
 			return person;
 		}, Encoders.bean(Person.class))
 			.write()
@ -262,34 +400,6 @@ public class ExtractPerson implements Serializable {
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(workingDir + "/affiliation");
 		people = spark
 			.read()
 			.textFile(workingDir + "/people")
 			.map(
 				(MapFunction<String, Person>) value -> OBJECT_MAPPER
 					.readValue(value, Person.class),
 				Encoders.bean(Person.class));
 		people.show(false);
 		people
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
 			.union(
 				getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/coauthorship")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/affiliation")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
 			.saveAsHadoopFile(
 				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 	}
 	private static Dataset<Relation> getRelations(SparkSession spark, String path) {
@ -323,8 +433,9 @@ public class ExtractPerson implements Serializable {
 				source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
 				ModelConstants.ORG_PERSON_PARTICIPATES,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-				DATAINFO,
+					ORCIDDATAINFO,
 				null);
 		relation.setValidated(true);
 		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
 			KeyValue kv = new KeyValue();
@ -345,31 +456,6 @@ public class ExtractPerson implements Serializable {
 	}
 	/**
 	 * Creates the two co-authorship relations (one per direction) between the persons owning the given ORCIDs.
 	 *
 	 * @param orcid1 ORCID of the first author
 	 * @param orcid2 ORCID of the second author
 	 * @return a list holding the relation from the first to the second person followed by its reverse
 	 */
 	private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
 		final String firstPerson = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
 		final String secondPerson = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
 		final Relation forward = OafMapperUtils
 			.getRelation(
 				firstPerson, secondPerson, ModelConstants.PERSON_PERSON_RELTYPE,
 				ModelConstants.PERSON_PERSON_SUBRELTYPE,
 				ModelConstants.PERSON_PERSON_HASCOAUTHORED,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
 				DATAINFO,
 				null);
 		final Relation backward = OafMapperUtils
 			.getRelation(
 				secondPerson, firstPerson, ModelConstants.PERSON_PERSON_RELTYPE,
 				ModelConstants.PERSON_PERSON_SUBRELTYPE,
 				ModelConstants.PERSON_PERSON_HASCOAUTHORED,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
 				DATAINFO,
 				null);
 		return Arrays.asList(forward, backward);
 	}
 	private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {
 		if (Optional.ofNullable(w.getPids()).isPresent())
@ -412,14 +498,15 @@ public class ExtractPerson implements Serializable {
 			default:
 				return null;
 		}
-
+		Relation relation = OafMapperUtils
 		return OafMapperUtils
 			.getRelation(
 				source, target, ModelConstants.RESULT_PERSON_RELTYPE,
 				ModelConstants.RESULT_PERSON_SUBRELTYPE,
 				ModelConstants.RESULT_PERSON_HASAUTHORED,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-				DATAINFO,
+					ORCIDDATAINFO,
 				null);
 		relation.setValidated(true);
 		return relation;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
@ -21,5 +21,30 @@
  "paramLongName": "workingDir",
  "paramDescription": "the path of the working directory where the intermediate results are stored",
  "paramRequired": false
 },
   {
     "paramName": "pu",
     "paramLongName": "postgresUrl",
     "paramDescription": "the JDBC URL of the PostgreSQL database holding the person-project links",
     "paramRequired": false
   },
   {
     "paramName": "ps",
     "paramLongName": "postgresUser",
     "paramDescription": "the user name used to connect to the PostgreSQL database",
     "paramRequired": false
   },
   {
   "paramName": "pp",
   "paramLongName": "postgresPassword",
   "paramDescription": "the password used to connect to the PostgreSQL database",
   "paramRequired": false
 },{
  "paramName": "nn",
  "paramLongName": "hdfsNameNode",
  "paramDescription": "the hdfs name node",
  "paramRequired": false
 }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
@ -1,2 +1,5 @@
 inputPath=/data/orcid_2023/tables/
 outputPath=/user/miriam.baglioni/peopleAS
 postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
 postgresUser=dnet
 postgresPassword=dnetPwd
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
@ -9,6 +9,18 @@
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
        <property>
            <name>postgresUrl</name>
             <description>the JDBC URL of the PostgreSQL database holding the person-project links</description>
        </property>
        <property>
            <name>postgresUser</name>
             <description>the user name used to connect to the PostgreSQL database</description>
        </property>
        <property>
            <name>postgresPassword</name>
             <description>the password used to connect to the PostgreSQL database</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -102,6 +114,10 @@
            <arg>--inputPath</arg><arg>${inputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--postgresUrl</arg><arg>${postgresUrl}</arg>
            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-enrichment/pom.xml
+++ b/dhp-workflows/dhp-enrichment/pom.xml
@ -48,12 +48,7 @@
            <groupId>io.github.classgraph</groupId>
            <artifactId>classgraph</artifactId>
        </dependency>
-        <dependency>
+
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-aggregation</artifactId>
            <version>1.2.5-SNAPSHOT</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java
@ -0,0 +1,295 @@
 package eu.dnetlib.dhp.person;
 import static com.ibm.icu.text.PluralRules.Operand.w;
 import static eu.dnetlib.dhp.PropagationConstant.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
 import eu.dnetlib.dhp.common.person.Coauthors;
 import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
 public class SparkExtractPersonRelations {
 	// the logger is bound to this class so that its messages are reported under the right category
 	private static final Logger log = LoggerFactory.getLogger(SparkExtractPersonRelations.class);
 	// first part of every person identifier built here; "::" and the md5 of the ORCID are appended to it
 	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
 	// data info attached to every relation created by this job
 	// NOTE(review): the provenance action is SYSIMPORT_CROSSWALK_REPOSITORY - confirm it is the intended one
 	public static final DataInfo DATAINFO = OafMapperUtils
 		.dataInfo(
 			false,
 			"openaire",
 			true,
 			false,
 			OafMapperUtils
 				.qualifier(
 					ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
 					ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
 					ModelConstants.DNET_PROVENANCE_ACTIONS,
 					ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.85");
 	/**
 	 * Entry point of the job: parses the arguments, creates the relations that can be derived from the ORCID
 	 * identifiers of the authors and finally drops the persons that are not the source of any relation.
 	 *
 	 * @param args command line arguments as described in {@code input_personpropagation_parameters.json}
 	 * @throws Exception if the arguments cannot be parsed or the Spark job fails
 	 */
 	public static void main(String[] args) throws Exception {
 		final String parametersResource = "/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json";
 		// the resource is resolved through this class and a missing file is reported with an explicit message
 		String jsonConfiguration = IOUtils
 			.toString(
 				Objects
 					.requireNonNull(
 						SparkExtractPersonRelations.class.getResourceAsStream(parametersResource),
 						"resource not found on the classpath: " + parametersResource));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		String sourcePath = parser.get("sourcePath");
 		log.info("sourcePath: {}", sourcePath);
 		// the "outputPath" argument is used as working directory for the temporary files
 		final String workingPath = parser.get("outputPath");
 		log.info("workingPath: {}", workingPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				extractRelations(
 					spark,
 					sourcePath,
 					workingPath);
 				removeIsolatedPerson(spark, sourcePath, workingPath);
 			});
 	}
 	/**
 	 * Rewrites the person dump keeping only the persons whose id is the source of at least one relation.
 	 *
 	 * The filtered persons are first written under {@code workingPath + "person"} and then copied back over
 	 * {@code sourcePath + "person"}, because the latter is also the input of the filter.
 	 *
 	 * @param spark the active Spark session
 	 * @param sourcePath base path of the graph; must end with the path separator
 	 * @param workingPath base path for the temporary output; must end with the path separator
 	 */
 	private static void removeIsolatedPerson(SparkSession spark, String sourcePath, String workingPath) {
 		final String personPath = sourcePath + "person";
 		final String filteredPersonPath = workingPath + "person";

 		final Dataset<Person> persons = spark
 			.read()
 			.schema(Encoders.bean(Person.class).schema())
 			.json(personPath)
 			.as(Encoders.bean(Person.class));
 		final Dataset<Relation> relations = spark
 			.read()
 			.schema(Encoders.bean(Relation.class).schema())
 			.json(sourcePath + "relation")
 			.as(Encoders.bean(Relation.class));

 		// left_semi: a person is kept, once, when at least one relation has it as source
 		persons
 			.join(relations, persons.col("id").equalTo(relations.col("source")), "left_semi")
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(filteredPersonPath);

 		spark
 			.read()
 			.schema(Encoders.bean(Person.class).schema())
 			.json(filteredPersonPath)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(personPath);
 	}
 	/**
 	 * Adds to the graph the authorship and co-authorship relations that can be derived from the ORCID identifiers
 	 * attached to the authors of the results.
 	 *
 	 * For every result type the candidate relations are compared with the relations already in the graph and the
 	 * unseen ones are appended to {@code workingPath}; at the end the content of {@code workingPath} is appended
 	 * to {@code sourcePath + "relation"}.
 	 *
 	 * @param spark the active Spark session
 	 * @param sourcePath base path of the graph; must end with the path separator
 	 * @param workingPath path where the new relations are accumulated before being added to the graph
 	 */
 	private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) {
 		final Dataset<Tuple2<String, Relation>> relationDataset = keyBySourceRelClassTarget(
 			spark
 				.read()
 				.schema(Encoders.bean(Relation.class).schema())
 				.json(sourcePath + "relation")
 				.as(Encoders.bean(Relation.class)));
 		ModelSupport.entityTypes
 			.keySet()
 			.stream()
 			.filter(ModelSupport::isResult)
 			.forEach(
 				e -> {
 					// 1. search for results having orcid_pending and orcid in the set of pids for the authors
 					Dataset<Result> resultWithOrcids = spark
 						.read()
 						.schema(Encoders.bean(Result.class).schema())
 						.json(sourcePath + e.name())
 						.as(Encoders.bean(Result.class))
 						.filter(
 							(FilterFunction<Result>) r -> !r.getDataInfo().getDeletedbyinference() &&
 								!r.getDataInfo().getInvisible() &&
 								r.getAuthor() != null)
 						.filter((FilterFunction<Result>) r -> hasAuthorWithOrcidPid(r));
 					// 2. create authorship relations between the result identifier and the person entity with
 					// orcid_pending.
 					// NOTE(review): the candidates are not de-duplicated before being compared with the graph
 					appendUnseenRelations(
 						resultWithOrcids
 							.flatMap(
 								(FlatMapFunction<Result, Relation>) r -> getAuthorshipRelations(r),
 								Encoders.bean(Relation.class)),
 						relationDataset,
 						workingPath);
 					// 2.1 store in a separate location the relation between the person and the pids for the result?
 					// 3. create co_authorship relations between the pairs of authors with orcid/orcid_pending pids,
 					// keeping one relation for each (source, target) pair
 					appendUnseenRelations(
 						resultWithOrcids
 							.map(
 								(MapFunction<Result, Coauthors>) r -> getAuthorsPidList(r),
 								Encoders.bean(Coauthors.class))
 							.flatMap(
 								(FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
 								Encoders.bean(Relation.class))
 							.groupByKey(
 								(MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
 							.mapGroups(
 								(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(),
 								Encoders.bean(Relation.class)),
 						relationDataset,
 						workingPath);
 				});
 		spark
 			.read()
 			.schema(Encoders.bean(Relation.class).schema())
 			.json(workingPath)
 			.write()
 			.mode(SaveMode.Append)
 			.option("compression", "gzip")
 			.json(sourcePath + "relation");
 	}

 	/** Tells whether at least one author of the result has a pid of type orcid or orcid_pending (any case). */
 	private static boolean hasAuthorWithOrcidPid(Result r) {
 		return r
 			.getAuthor()
 			.stream()
 			.anyMatch(
 				a -> a.getPid() != null &&
 					a
 						.getPid()
 						.stream()
 						.anyMatch(
 							p -> Arrays
 								.asList("orcid", "orcid_pending")
 								.contains(p.getQualifier().getClassid().toLowerCase())));
 	}

 	/** Pairs every relation with the key obtained concatenating its source, relClass and target. */
 	private static Dataset<Tuple2<String, Relation>> keyBySourceRelClassTarget(Dataset<Relation> relations) {
 		return relations
 			.map(
 				(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
 					r.getSource() + r.getRelClass() + r.getTarget(), r),
 				Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
 	}

 	/** Appends to workingPath the candidates whose key does not match any of the existing relations. */
 	private static void appendUnseenRelations(Dataset<Relation> candidates,
 		Dataset<Tuple2<String, Relation>> existing, String workingPath) {
 		final Dataset<Tuple2<String, Relation>> newRelations = keyBySourceRelClassTarget(candidates);
 		newRelations
 			.joinWith(existing, newRelations.col("_1").equalTo(existing.col("_1")), "left")
 			.map((MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>, Relation>) t2 -> {
 				// the right side is null when the left join found no relation with the same key
 				if (t2._2() == null)
 					return t2._1()._2();
 				return null;
 			}, Encoders.bean(Relation.class))
 			.filter((FilterFunction<Relation>) r -> r != null)
 			.write()
 			.mode(SaveMode.Append)
 			.option("compression", "gzip")
 			.json(workingPath);
 	}
 	/**
 	 * Collects one ORCID for each author of the result that has one.
 	 *
 	 * For every author the value of the first pid of type {@code orcid} is taken; when there is none, the value of
 	 * the first pid of type {@code orcid_pending} is taken. The pid type is compared ignoring the case. Authors
 	 * without pids, without a pid of those types, or whose selected pid has no value, are skipped.
 	 *
 	 * @param r a result whose author list is not null
 	 * @return the ORCIDs found, in author order
 	 */
 	private static Coauthors getAuthorsPidList(Result r) {
 		final List<String> orcids = r
 			.getAuthor()
 			.stream()
 			// authors may come without pids: skip them instead of failing
 			.filter(a -> a.getPid() != null)
 			.map(a -> {
 				Optional<StructuredProperty> pid = findFirstPidOfType(a, "orcid");
 				if (!pid.isPresent())
 					pid = findFirstPidOfType(a, "orcid_pending");
 				return pid.map(StructuredProperty::getValue).orElse(null);
 			})
 			.filter(Objects::nonNull)
 			.collect(Collectors.toList());
 		final Coauthors coauth = new Coauthors();
 		coauth.setCoauthors(orcids);
 		return coauth;
 	}

 	/** Returns the first pid of the author whose type equals, ignoring the case, the given one. */
 	private static Optional<StructuredProperty> findFirstPidOfType(Author a, String pidType) {
 		return a
 			.getPid()
 			.stream()
 			.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase(pidType))
 			.findFirst();
 	}
 	/**
 	 * Creates one authorship relation for every {@code orcid_pending} pid (type compared ignoring the case) found
 	 * among the authors of the result. Pids of type {@code orcid} are not considered here.
 	 *
 	 * @param r a result whose author list is not null
 	 * @return an iterator over the relations created, possibly empty
 	 */
 	private static Iterator<Relation> getAuthorshipRelations(Result r) {
 		return r
 			.getAuthor()
 			.stream()
 			// authors may come without pids: skip them instead of failing
 			.filter(a -> a.getPid() != null)
 			.flatMap(a -> a.getPid().stream())
 			.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending"))
 			.map(p -> getRelation(p.getValue(), r.getId()))
 			.collect(Collectors.toList())
 			.iterator();
 	}
 	/**
 	 * Creates the authorship relation going from the person owning the given ORCID to the given result.
 	 *
 	 * @param orcid ORCID of the author; its md5 is used to build the person identifier
 	 * @param resultId identifier of the result, used as target
 	 * @return the relation, without collectedfrom and with {@code DATAINFO} as data info
 	 */
 	private static Relation getRelation(String orcid, String resultId) {
 		final String personId = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
 		return OafMapperUtils
 			.getRelation(
 				personId, resultId, ModelConstants.RESULT_PERSON_RELTYPE,
 				ModelConstants.RESULT_PERSON_SUBRELTYPE,
 				ModelConstants.RESULT_PERSON_HASAUTHORED,
 				null, // collectedfrom = null
 				DATAINFO,
 				null);
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
@ -8,3 +8,4 @@ result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_ap
 community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app
 community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
 country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
 person_propagation classpath eu/dnetlib/dhp/wf/subworkflows/person/oozie_app
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
@ -122,6 +122,7 @@
            <case to="community_project">${wf:conf('resumeFrom') eq 'CommunityProject'}</case>
            <case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
            <case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
            <case to="person_propagation">${wf:conf('resumeFrom') eq 'PersonPropagation'}</case>
            <default to="orcid_propagation"/>
        </switch>
    </decision>
@ -291,10 +292,24 @@
                </property>
            </configuration>
        </sub-workflow>
        <ok to="person_propagation" />
        <error to="Kill" />
    </action>
    <action name="person_propagation">
        <sub-workflow>
            <app-path>${wf:appPath()}/person_propagation
            </app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>sourcePath</name>
                    <value>${outputPath}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="country_propagation" />
        <error to="Kill" />
    </action>
    <action name="country_propagation">
        <sub-workflow>
            <app-path>${wf:appPath()}/country_propagation
@ -319,6 +334,8 @@
        <error to="Kill" />
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json
@ -0,0 +1,21 @@
 [
  {
    "paramName":"s",
    "paramLongName":"sourcePath",
    "paramDescription": "the base path of the graph to read and update (it contains the person, relation and result folders)",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path used to store temporary output files",
    "paramRequired": true
  },
  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties
@ -0,0 +1 @@
 sourcePath=/tmp/miriam/13_graph_copy
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml
@ -0,0 +1,58 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>15G</value>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>5G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>4</value>
    </property>
    <property>
        <name>spark2MaxExecutors</name>
        <value>50</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml
@ -0,0 +1,68 @@
 <workflow-app name="person_propagation" xmlns="uri:oozie:workflow:0.5">
    <!-- Sub-workflow with two steps: reset the working directory, then run the Spark job
         SparkExtractPersonRelations on the graph found at ${sourcePath}. -->
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="reset_outputpath"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Deletes and recreates ${workingDir} so the Spark job below starts from an empty directory. -->
    <action name="reset_outputpath">
        <fs>
            <delete path="${workingDir}"/>
            <mkdir path="${workingDir}"/>
        </fs>
        <ok to="extract_person_relation_from_graph"/>
        <error to="Kill"/>
    </action>
    <!-- Submits eu.dnetlib.dhp.person.SparkExtractPersonRelations to YARN in cluster mode.
         The job receives ${sourcePath}/ as its source path and ${workingDir}/relation as its
         output path. Speculative execution is switched off in the Spark options. -->
    <action name="extract_person_relation_from_graph">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>personPropagation</name>
            <class>eu.dnetlib.dhp.person.SparkExtractPersonRelations</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/</arg>
            <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
@ -0,0 +1,95 @@
 package eu.dnetlib.dhp.person;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import eu.dnetlib.dhp.schema.oaf.*;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
 import scala.Tuple2;
 /**
  * Runs {@link SparkExtractPersonRelations} on a local Spark session against the graph fixtures
  * bundled with the test resources.
  */
 public class PersonPropagationJobTest {
 	private static final Logger log = LoggerFactory.getLogger(PersonPropagationJobTest.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/** Classpath location of the input graph fixtures (src/test/resources/eu/dnetlib/dhp/person/graph). */
 	private static final String GRAPH_RESOURCE = "/eu/dnetlib/dhp/person/graph";
 	private static SparkSession spark;
 	private static Path workingDir;
 	/**
 	 * Creates a temporary working directory and a local Spark session shared by all tests.
 	 *
 	 * @throws IOException if the temporary directory cannot be created
 	 */
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(PersonPropagationJobTest.class.getSimpleName());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(PersonPropagationJobTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(PersonPropagationJobTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	/**
 	 * Stops the Spark session and removes the temporary working directory.
 	 *
 	 * @throws IOException if the working directory cannot be deleted
 	 */
 	@AfterAll
 	public static void afterAll() throws IOException {
 		try {
 			// stop Spark first so the session is released even if the cleanup below fails
 			spark.stop();
 		} finally {
 			FileUtils.deleteDirectory(workingDir.toFile());
 		}
 	}
 	/**
 	 * Executes the person relation extraction on the fixture graph and loads the produced relations.
 	 *
 	 * @throws Exception if the Spark job fails
 	 */
 	@Test
 	void testPersonPropagation() throws Exception {
 		// fail with a clear message instead of a NullPointerException when the fixtures are missing
 		Assertions
 			.assertNotNull(
 				getClass().getResource(GRAPH_RESOURCE), "test resource not found: " + GRAPH_RESOURCE);
 		final String sourcePath = getClass()
 			.getResource(GRAPH_RESOURCE)
 			.getPath();
 		// NOTE(review): the job is given workingDir as outputPath while the result below is read from
 		// workingDir/relation - confirm this matches where SparkExtractPersonRelations writes
 		SparkExtractPersonRelations
 			.main(
 				new String[] {
 					"--isSparkSessionManaged", Boolean.FALSE.toString(),
 					"--sourcePath", sourcePath,
 					"--outputPath", workingDir.toString()
 				});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.textFile(workingDir.toString() + "/relation")
 			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
 		// TODO write assertions and find relevant information for the resource files
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
    assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
    // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
  }
  // A name written with initials ("C. A.") is expected to match a variant in which one of the
  // initials is spelled out ("Antony") and the other is omitted.
  @Test def testDocumentationNames(): Unit = {
    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
  }