diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index c1d6e1b5b..c7cb11b08 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -83,6 +83,10 @@
com.jayway.jsonpath
json-path
+
+ org.postgresql
+ postgresql
+
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
similarity index 95%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java
rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
index 94f17aad5..cedc9bd4d 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common;
import java.io.Closeable;
import java.io.IOException;
@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
- private final Connection connection;
+ private Connection connection;
public DbClient(final String address, final String login, final String password) {
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
index cdde37fd4..fc85b1ac1 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
@@ -13,7 +13,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class ModelSupport {
/** Defines the mapping between the actual entity type and the main entity type */
- private static final Map entityMapping = Maps.newHashMap();
+ private static Map entityMapping = Maps.newHashMap();
static {
entityMapping.put(EntityType.publication, MainEntityType.result);
@@ -53,6 +53,232 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class);
}
+ /**
+ * Maps an entity type name ("datasource", "organization", "project", "result") to the
+ * two-character prefix associated with that type. ReadBlacklistFromDB prepends this prefix,
+ * followed by "|", to the raw identifiers read from the blacklist table.
+ */
+ public static final Map entityIdPrefix = Maps.newHashMap();
+
+ static {
+ entityIdPrefix.put("datasource", "10");
+ entityIdPrefix.put("organization", "20");
+ entityIdPrefix.put("project", "40");
+ entityIdPrefix.put("result", "50");
+ }
+
+	/**
+	 * Lookup table keyed by the encoding "relType_subRelType_relClass". Each value carries the
+	 * relation class, its inverse relation class and the relType / subRelType they belong to.
+	 * Symmetric relations (e.g. isSimilarTo, isRelatedTo) are registered once, with relation and
+	 * inverse set to the same value.
+	 */
+	public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
+
+	static {
+		relationInverseMap
+			.put(
+				"personResult_authorship_isAuthorOf", new RelationInverse()
+					.setRelation("isAuthorOf")
+					.setInverse("hasAuthor")
+					.setRelType("personResult")
+					.setSubReltype("authorship"));
+		relationInverseMap
+			.put(
+				"personResult_authorship_hasAuthor", new RelationInverse()
+					.setInverse("isAuthorOf")
+					.setRelation("hasAuthor")
+					.setRelType("personResult")
+					.setSubReltype("authorship"));
+		relationInverseMap
+			.put(
+				"projectOrganization_participation_isParticipant", new RelationInverse()
+					.setRelation("isParticipant")
+					.setInverse("hasParticipant")
+					.setRelType("projectOrganization")
+					.setSubReltype("participation"));
+		relationInverseMap
+			.put(
+				"projectOrganization_participation_hasParticipant", new RelationInverse()
+					.setInverse("isParticipant")
+					.setRelation("hasParticipant")
+					.setRelType("projectOrganization")
+					.setSubReltype("participation"));
+		relationInverseMap
+			.put(
+				"resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
+					.setRelation("hasAuthorInstitution")
+					.setInverse("isAuthorInstitutionOf")
+					.setRelType("resultOrganization")
+					.setSubReltype("affiliation"));
+		relationInverseMap
+			.put(
+				"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
+					.setInverse("hasAuthorInstitution")
+					.setRelation("isAuthorInstitutionOf")
+					.setRelType("resultOrganization")
+					.setSubReltype("affiliation"));
+		relationInverseMap
+			.put(
+				"organizationOrganization_dedup_merges", new RelationInverse()
+					.setRelation("merges")
+					.setInverse("isMergedIn")
+					.setRelType("organizationOrganization")
+					.setSubReltype("dedup"));
+		relationInverseMap
+			.put(
+				"organizationOrganization_dedup_isMergedIn", new RelationInverse()
+					.setInverse("merges")
+					.setRelation("isMergedIn")
+					.setRelType("organizationOrganization")
+					.setSubReltype("dedup"));
+		relationInverseMap
+			.put(
+				"organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
+					.setInverse("isSimilarTo")
+					.setRelation("isSimilarTo")
+					.setRelType("organizationOrganization")
+					.setSubReltype("dedupSimilarity"));
+
+		relationInverseMap
+			.put(
+				"resultProject_outcome_isProducedBy", new RelationInverse()
+					.setRelation("isProducedBy")
+					.setInverse("produces")
+					.setRelType("resultProject")
+					.setSubReltype("outcome"));
+		relationInverseMap
+			.put(
+				"resultProject_outcome_produces", new RelationInverse()
+					.setInverse("isProducedBy")
+					.setRelation("produces")
+					.setRelType("resultProject")
+					.setSubReltype("outcome"));
+		relationInverseMap
+			.put(
+				"projectPerson_contactPerson_isContact", new RelationInverse()
+					.setRelation("isContact")
+					.setInverse("hasContact")
+					.setRelType("projectPerson")
+					.setSubReltype("contactPerson"));
+		// relType / subRelType must match the key and the sibling isContact entry
+		// (they were previously copy-pasted from the personPerson coAuthorship entry).
+		relationInverseMap
+			.put(
+				"projectPerson_contactPerson_hasContact", new RelationInverse()
+					.setInverse("isContact")
+					.setRelation("hasContact")
+					.setRelType("projectPerson")
+					.setSubReltype("contactPerson"));
+		// NOTE(review): the key spells "isCoauthorOf" while the relation class is "isCoAuthorOf";
+		// confirm which casing the blacklist "relationship" column actually uses.
+		relationInverseMap
+			.put(
+				"personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
+					.setInverse("isCoAuthorOf")
+					.setRelation("isCoAuthorOf")
+					.setRelType("personPerson")
+					.setSubReltype("coAuthorship"));
+		relationInverseMap
+			.put(
+				"personPerson_dedup_merges", new RelationInverse()
+					.setInverse("isMergedIn")
+					.setRelation("merges")
+					.setRelType("personPerson")
+					.setSubReltype("dedup"));
+		relationInverseMap
+			.put(
+				"personPerson_dedup_isMergedIn", new RelationInverse()
+					.setInverse("merges")
+					.setRelation("isMergedIn")
+					.setRelType("personPerson")
+					.setSubReltype("dedup"));
+		relationInverseMap
+			.put(
+				"personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
+					.setInverse("isSimilarTo")
+					.setRelation("isSimilarTo")
+					.setRelType("personPerson")
+					.setSubReltype("dedupSimilarity"));
+		relationInverseMap
+			.put(
+				"datasourceOrganization_provision_isProvidedBy", new RelationInverse()
+					.setInverse("provides")
+					.setRelation("isProvidedBy")
+					.setRelType("datasourceOrganization")
+					.setSubReltype("provision"));
+		relationInverseMap
+			.put(
+				"datasourceOrganization_provision_provides", new RelationInverse()
+					.setInverse("isProvidedBy")
+					.setRelation("provides")
+					.setRelType("datasourceOrganization")
+					.setSubReltype("provision"));
+		relationInverseMap
+			.put(
+				"resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
+					.setInverse("isAmongTopNSimilarDocuments")
+					.setRelation("hasAmongTopNSimilarDocuments")
+					.setRelType("resultResult")
+					.setSubReltype("similarity"));
+		relationInverseMap
+			.put(
+				"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
+					.setInverse("hasAmongTopNSimilarDocuments")
+					.setRelation("isAmongTopNSimilarDocuments")
+					.setRelType("resultResult")
+					.setSubReltype("similarity"));
+		relationInverseMap
+			.put(
+				"resultResult_relationship_isRelatedTo", new RelationInverse()
+					.setInverse("isRelatedTo")
+					.setRelation("isRelatedTo")
+					.setRelType("resultResult")
+					.setSubReltype("relationship"));
+		relationInverseMap
+			.put(
+				"resultResult_supplement_isSupplementTo", new RelationInverse()
+					.setInverse("isSupplementedBy")
+					.setRelation("isSupplementTo")
+					.setRelType("resultResult")
+					.setSubReltype("supplement"));
+		relationInverseMap
+			.put(
+				"resultResult_supplement_isSupplementedBy", new RelationInverse()
+					.setInverse("isSupplementTo")
+					.setRelation("isSupplementedBy")
+					.setRelType("resultResult")
+					.setSubReltype("supplement"));
+		relationInverseMap
+			.put(
+				"resultResult_part_isPartOf", new RelationInverse()
+					.setInverse("hasPart")
+					.setRelation("isPartOf")
+					.setRelType("resultResult")
+					.setSubReltype("part"));
+		relationInverseMap
+			.put(
+				"resultResult_part_hasPart", new RelationInverse()
+					.setInverse("isPartOf")
+					.setRelation("hasPart")
+					.setRelType("resultResult")
+					.setSubReltype("part"));
+		relationInverseMap
+			.put(
+				"resultResult_dedup_merges", new RelationInverse()
+					.setInverse("isMergedIn")
+					.setRelation("merges")
+					.setRelType("resultResult")
+					.setSubReltype("dedup"));
+		relationInverseMap
+			.put(
+				"resultResult_dedup_isMergedIn", new RelationInverse()
+					.setInverse("merges")
+					.setRelation("isMergedIn")
+					.setRelType("resultResult")
+					.setSubReltype("dedup"));
+		relationInverseMap
+			.put(
+				"resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
+					.setInverse("isSimilarTo")
+					.setRelation("isSimilarTo")
+					.setRelType("resultResult")
+					.setSubReltype("dedupSimilarity"));
+
+	}
+
private static final String schemeTemplate = "dnet:%s_%s_relations";
private ModelSupport() {
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java
new file mode 100644
index 000000000..4757c637e
--- /dev/null
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java
@@ -0,0 +1,46 @@
+
+package eu.dnetlib.dhp.schema.common;
+
+/**
+ * Mutable holder pairing a relation class with its inverse, together with the relType and
+ * subRelType both belong to. Every setter returns the same instance so calls can be chained.
+ */
+public class RelationInverse {
+
+	/** Relation class in the direct direction. */
+	private String relation;
+
+	/** Relation class in the opposite direction. */
+	private String inverse;
+
+	/** Relation type shared by relation and inverse. */
+	private String relType;
+
+	/** Relation sub-type shared by relation and inverse. */
+	private String subReltype;
+
+	/** @return the relation class in the direct direction */
+	public String getRelation() {
+		return this.relation;
+	}
+
+	/**
+	 * @param relation the relation class in the direct direction
+	 * @return this instance, for chaining
+	 */
+	public RelationInverse setRelation(String relation) {
+		this.relation = relation;
+		return this;
+	}
+
+	/** @return the relation class in the opposite direction */
+	public String getInverse() {
+		return this.inverse;
+	}
+
+	/**
+	 * @param inverse the relation class in the opposite direction
+	 * @return this instance, for chaining
+	 */
+	public RelationInverse setInverse(String inverse) {
+		this.inverse = inverse;
+		return this;
+	}
+
+	/** @return the relation type */
+	public String getRelType() {
+		return this.relType;
+	}
+
+	/**
+	 * @param relType the relation type
+	 * @return this instance, for chaining
+	 */
+	public RelationInverse setRelType(String relType) {
+		this.relType = relType;
+		return this;
+	}
+
+	/** @return the relation sub-type */
+	public String getSubReltype() {
+		return this.subReltype;
+	}
+
+	/**
+	 * @param subReltype the relation sub-type
+	 * @return this instance, for chaining
+	 */
+	public RelationInverse setSubReltype(String subReltype) {
+		this.subReltype = subReltype;
+		return this;
+	}
+
+}
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
index b9bd4c5f0..231fb1e60 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
@@ -2,8 +2,7 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
-import java.util.List;
-import java.util.Objects;
+import java.util.*;
public class Author implements Serializable {
@@ -86,4 +85,5 @@ public class Author implements Serializable {
public int hashCode() {
return Objects.hash(fullname, name, surname, rank, pid, affiliation);
}
+
}
diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml
new file mode 100644
index 000000000..37abc22f6
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/pom.xml
@@ -0,0 +1,36 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-blacklist
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
new file mode 100644
index 000000000..0ef59e8c2
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
@@ -0,0 +1,87 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Spark job that reads the relations stored under sourcePath and writes, under outputPath, only
+ * those whose relclass is 'merges' and that are not flagged as deleted by inference.
+ */
+public class PrepareMergedRelationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareMergedRelationJob.class);
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ /**
+ * Entry point: loads the parameter definitions from the classpath resource
+ * input_preparerelation_parameters.json, parses the command line and runs the selection inside
+ * a Spark session with Hive support.
+ *
+ * @param args command line arguments (sourcePath, outputPath, hive_metastore_uris and the
+ * optional isSparkSessionManaged)
+ * @throws Exception if the parameters cannot be read or parsed, or the Spark job fails
+ */
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareMergedRelationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ // isSparkSessionManaged defaults to TRUE when the argument is not supplied
+ Boolean isSparkSessionManaged = Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ String outputPath = parser.get("outputPath");
+ log.info("outputPath: {} ", outputPath);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ selectMergesRelations(
+ spark,
+ inputPath,
+ outputPath);
+ });
+ }
+
+ /**
+ * Keeps the relations with relclass 'merges' and datainfo.deletedbyinference = false and
+ * writes them as gzip-compressed JSON, overwriting any previous content of outputPath.
+ */
+ private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
+
+ Dataset relation = readRelations(spark, inputPath);
+
+ relation
+ .filter("relclass = 'merges' and datainfo.deletedbyinference=false")
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ /**
+ * Reads inputPath as a text file and deserializes each line into a Relation with Jackson.
+ *
+ * @param spark the active Spark session
+ * @param inputPath location of the text file(s), one JSON-serialized Relation per line
+ * @return the dataset of parsed relations
+ */
+ public static org.apache.spark.sql.Dataset readRelations(
+ SparkSession spark, String inputPath) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map(
+ (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class),
+ Encoders.bean(Relation.class));
+ }
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
new file mode 100644
index 000000000..2caa66db4
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
@@ -0,0 +1,141 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.function.Function;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.DbClient;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Reads the accepted entries of the blacklist table through a {@link DbClient} and writes, for
+ * each row, the direct and the inverse {@link Relation} as one JSON document per line to a file
+ * on HDFS.
+ */
+public class ReadBlacklistFromDB implements Closeable {
+
+	private static final Log log = LogFactory.getLog(ReadBlacklistFromDB.class);
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private final static String query = "SELECT source_type, unnest(original_source_objects) as source, " +
+		"target_type, unnest(original_target_objects) as target, " +
+		"relationship FROM blacklist WHERE status = 'ACCEPTED'";
+
+	private final DbClient dbClient;
+	private final Configuration conf;
+	private final BufferedWriter writer;
+
+	/**
+	 * Entry point: parses the arguments declared in blacklist_parameters.json and dumps the
+	 * blacklist to the file "blacklist" under the given hdfsPath.
+	 *
+	 * @param args command line arguments (hdfsPath, hdfsNameNode, postgresUrl, postgresUser,
+	 *             postgresPassword)
+	 * @throws Exception if the arguments cannot be parsed or the export fails
+	 */
+	public static void main(final String[] args) throws Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					ReadBlacklistFromDB.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/blacklist/blacklist_parameters.json")));
+
+		parser.parseArgument(args);
+
+		final String dbUrl = parser.get("postgresUrl");
+		final String dbUser = parser.get("postgresUser");
+		final String dbPassword = parser.get("postgresPassword");
+		final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
+		final String hdfsNameNode = parser.get("hdfsNameNode");
+
+		try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
+			dbPassword)) {
+
+			log.info("Processing blacklist...");
+			rbl.execute(query, rbl::processBlacklistEntry);
+
+		}
+	}
+
+	/**
+	 * Opens the database connection and the HDFS output file. The file is appended to when it
+	 * already exists and created otherwise.
+	 *
+	 * @param hdfsPath     path of the output file
+	 * @param hdfsNameNode value used for fs.defaultFS
+	 * @param dbUrl        JDBC url of the database
+	 * @param dbUser       database user
+	 * @param dbPassword   database password
+	 * @throws Exception if the connection or the output file cannot be opened
+	 */
+	public ReadBlacklistFromDB(
+		final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
+		throws Exception {
+
+		final DbClient client = new DbClient(dbUrl, dbUser, dbPassword);
+		final Configuration configuration = new Configuration();
+		final BufferedWriter out;
+		try {
+			configuration.set("fs.defaultFS", hdfsNameNode);
+			final FileSystem fileSystem = FileSystem.get(configuration);
+			final Path hdfsWritePath = new Path(hdfsPath);
+			final FSDataOutputStream fsDataOutputStream = fileSystem.exists(hdfsWritePath)
+				? fileSystem.append(hdfsWritePath)
+				: fileSystem.create(hdfsWritePath);
+			out = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
+		} catch (final Exception e) {
+			// the instance is never handed to the caller, so nobody else can release the connection
+			try {
+				client.close();
+			} catch (final Exception closeFailure) {
+				e.addSuppressed(closeFailure);
+			}
+			throw e;
+		}
+
+		this.dbClient = client;
+		this.conf = configuration;
+		this.writer = out;
+	}
+
+	/**
+	 * Runs the query and writes every relation produced for each result row.
+	 *
+	 * @param sql      the query to execute
+	 * @param producer converts the current row of the result set into the relations to write
+	 * @throws Exception if the query execution fails
+	 */
+	public void execute(final String sql, final Function<ResultSet, List<Relation>> producer) throws Exception {
+
+		final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(r -> writeRelation(r));
+
+		dbClient.processResults(sql, consumer);
+	}
+
+	/**
+	 * Builds the direct and the inverse relation described by the current row. Identifiers are
+	 * built as prefix + "|" + id, where the prefix comes from {@link ModelSupport#entityIdPrefix};
+	 * relation class, type and sub-type come from {@link ModelSupport#relationInverseMap}.
+	 *
+	 * @param rs result set positioned on the row to convert
+	 * @return a two-element list: the direct relation followed by its inverse
+	 * @throws RuntimeException wrapping any failure, including an unknown relationship encoding
+	 */
+	public List<Relation> processBlacklistEntry(ResultSet rs) {
+		try {
+			Relation direct = new Relation();
+			Relation inverse = new Relation();
+
+			final String sourceType = rs.getString("source_type");
+			final String targetType = rs.getString("target_type");
+			String source_prefix = ModelSupport.entityIdPrefix.get(sourceType);
+			String target_prefix = ModelSupport.entityIdPrefix.get(targetType);
+			// NOTE(review): an unmapped type still yields an id starting with "null|", as before;
+			// it is only reported here. Confirm whether such rows should be rejected instead.
+			if (source_prefix == null) {
+				log.warn("No id prefix defined for source_type: " + sourceType);
+			}
+			if (target_prefix == null) {
+				log.warn("No id prefix defined for target_type: " + targetType);
+			}
+
+			String source_direct = source_prefix + "|" + rs.getString("source");
+			direct.setSource(source_direct);
+			inverse.setTarget(source_direct);
+
+			String target_direct = target_prefix + "|" + rs.getString("target");
+			direct.setTarget(target_direct);
+			inverse.setSource(target_direct);
+
+			String encoding = rs.getString("relationship");
+			RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
+			if (ri == null) {
+				// fail with the offending value instead of an anonymous NullPointerException
+				throw new IllegalArgumentException("Unknown blacklist relationship encoding: " + encoding);
+			}
+			direct.setRelClass(ri.getRelation());
+			inverse.setRelClass(ri.getInverse());
+			direct.setRelType(ri.getRelType());
+			inverse.setRelType(ri.getRelType());
+			direct.setSubRelType(ri.getSubReltype());
+			inverse.setSubRelType(ri.getSubReltype());
+
+			return Arrays.asList(direct, inverse);
+
+		} catch (final Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Closes the database client and the HDFS writer. The writer is closed even when closing the
+	 * database client fails, so that buffered relations are not lost.
+	 *
+	 * @throws IOException if either resource cannot be closed
+	 */
+	@Override
+	public void close() throws IOException {
+		try {
+			dbClient.close();
+		} finally {
+			writer.close();
+		}
+	}
+
+	/**
+	 * Serializes the relation to JSON and appends it to the output as a single line.
+	 *
+	 * @param r the relation to write
+	 * @throws RuntimeException wrapping any serialization or I/O failure
+	 */
+	protected void writeRelation(final Relation r) {
+		try {
+			writer.write(OBJECT_MAPPER.writeValueAsString(r));
+			writer.newLine();
+		} catch (final Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
new file mode 100644
index 000000000..86587bfc9
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
@@ -0,0 +1,147 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Objects;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+/**
+ * Spark job that removes from the graph relations those listed in the blacklist. Before the
+ * comparison, blacklisted source and target identifiers are rewritten using the 'merges'
+ * relations, so that they can be matched against the identifiers found in the input relations.
+ */
+public class SparkRemoveBlacklistedRelationJob {
+	private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	/**
+	 * Entry point: loads the parameter definitions from the classpath resource
+	 * sparkblacklist_parameters.json, parses the command line and runs the removal in a Spark
+	 * session.
+	 *
+	 * @param args command line arguments (sourcePath, outputPath, hdfsPath, mergesPath and the
+	 *             optional isSparkSessionManaged)
+	 * @throws Exception if the parameters cannot be read or parsed, or the Spark job fails
+	 */
+	public static void main(String[] args) throws Exception {
+
+		String jsonConfiguration = IOUtils
+			.toString(
+				SparkRemoveBlacklistedRelationJob.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+		parser.parseArgument(args);
+
+		// isSparkSessionManaged defaults to TRUE when the argument is not supplied
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		String inputPath = parser.get("sourcePath");
+		log.info("inputPath: {}", inputPath);
+
+		final String outputPath = parser.get("outputPath");
+		log.info("outputPath: {}", outputPath);
+
+		final String blacklistPath = parser.get("hdfsPath");
+		log.info("blacklistPath: {}", blacklistPath);
+
+		final String mergesPath = parser.get("mergesPath");
+		log.info("mergesPath: {}", mergesPath);
+
+		SparkConf conf = new SparkConf();
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				removeBlacklistedRelations(
+					spark,
+					blacklistPath,
+					inputPath,
+					outputPath,
+					mergesPath);
+			});
+
+	}
+
+	/**
+	 * Rewrites the blacklist through the merges relations, stores the rewritten blacklist under
+	 * blacklistPath + "/deduped" and writes to outputPath the input relations that are not equal
+	 * to a blacklisted relation having the same source and target.
+	 */
+	private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
+		String outputPath, String mergesPath) {
+		Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
+		Dataset<Relation> inputRelation = readRelations(spark, inputPath);
+		Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
+
+		// count() is a Spark action: the input relations are read once here just for logging
+		log.info("InputRelationCount: {}", inputRelation.count());
+
+		// a blacklisted source that is the target of a merges relation is replaced by that
+		// relation's source
+		Dataset<Relation> dedupSource = blackListed
+			.joinWith(
+				mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
+				"left_outer")
+			.map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
+				Optional
+					.ofNullable(c._2())
+					.ifPresent(mr -> c._1().setSource(mr.getSource()));
+				return c._1();
+			}, Encoders.bean(Relation.class));
+
+		// same substitution for the blacklisted target
+		Dataset<Relation> dedupBL = dedupSource
+			.joinWith(
+				mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")),
+				"left_outer")
+			.map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
+				Optional
+					.ofNullable(c._2())
+					.ifPresent(mr -> c._1().setTarget(mr.getSource()));
+				return c._1();
+			}, Encoders.bean(Relation.class));
+
+		dedupBL
+			.write()
+			.mode(SaveMode.Overwrite)
+			.json(blacklistPath + "/deduped");
+
+		inputRelation
+			.joinWith(
+				dedupBL, (inputRelation
+					.col("source")
+					.equalTo(dedupBL.col("source"))
+					.and(
+						inputRelation
+							.col("target")
+							.equalTo(dedupBL.col("target")))),
+				"left_outer")
+			.map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
+				Relation ir = c._1();
+				Optional<Relation> obl = Optional.ofNullable(c._2());
+				// a relation is dropped only when it equals the blacklisted one matched on
+				// source and target; dropped rows are mapped to null and filtered out below
+				if (obl.isPresent()) {
+					if (ir.equals(obl.get())) {
+						return null;
+					}
+				}
+				return ir;
+			}, Encoders.bean(Relation.class))
+			.filter(Objects::nonNull)
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(outputPath);
+	}
+
+	/**
+	 * Reads inputPath as a text file and deserializes each line into a Relation with Jackson.
+	 *
+	 * @param spark     the active Spark session
+	 * @param inputPath location of the text file(s), one JSON-serialized Relation per line
+	 * @return the dataset of parsed relations
+	 */
+	public static org.apache.spark.sql.Dataset<Relation> readRelations(
+		SparkSession spark, String inputPath) {
+		return spark
+			.read()
+			.textFile(inputPath)
+			.map(
+				(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
+				Encoders.bean(Relation.class));
+	}
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json
new file mode 100644
index 000000000..9a2eadaa7
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json
@@ -0,0 +1,32 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nn",
+ "paramLongName": "hdfsNameNode",
+ "paramDescription": "the name node on hdfs",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pgurl",
+ "paramLongName": "postgresUrl",
+ "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pguser",
+ "paramLongName": "postgresUser",
+ "paramDescription": "postgres user",
+ "paramRequired": false
+ },
+ {
+ "paramName": "pgpasswd",
+ "paramLongName": "postgresPassword",
+ "paramDescription": "postgres password",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json
new file mode 100644
index 000000000..4a3d21f4d
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json
@@ -0,0 +1,26 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the path to the graph used to remove the relations ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path where to store the temporary result ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": false
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml
new file mode 100644
index 000000000..fe82ae194
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml
@@ -0,0 +1,54 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
new file mode 100644
index 000000000..1538318c1
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
@@ -0,0 +1,195 @@
+
+
+
+ postgresURL
+ the url of the postgres server to query
+
+
+ postgresUser
+ the username to access the postgres db
+
+
+ postgresPassword
+ the postgres password
+
+
+ sourcePath
+ the source path
+
+
+ outputPath
+ the graph output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/publication
+ ${nameNode}/${outputPath}/publication
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/dataset
+ ${nameNode}/${outputPath}/dataset
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/otherresearchproduct
+ ${nameNode}/${outputPath}/otherresearchproduct
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/software
+ ${nameNode}/${outputPath}/software
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB
+ --hdfsPath${workingDir}/blacklist
+ --hdfsNameNode${nameNode}
+ --postgresUrl${postgresURL}
+ --postgresUser${postgresUser}
+ --postgresPassword${postgresPassword}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ PrepareMergedRelation
+ eu.dnetlib.dhp.blacklist.PrepareMergedRelationJob
+ dhp-blacklist-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/relation
+ --outputPath${workingDir}/mergesRelation
+ --hive_metastore_uris${hive_metastore_uris}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ApplyBlacklist
+ eu.dnetlib.dhp.blacklist.SparkRemoveBlacklistedRelationJob
+ dhp-blacklist-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/relation
+ --outputPath${outputPath}/relation
+ --hdfsPath${workingDir}/blacklist
+ --mergesPath${workingDir}/mergesRelation
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json
new file mode 100644
index 000000000..91a87b8b5
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json
@@ -0,0 +1,33 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the path to the graph used to remove the relations ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path where to store the temporary result ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": false
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "mergesPath",
+ "paramDescription": "the path to the merges relations produced by PrepareMergedRelationJob",
+ "paramRequired": true
+
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java
new file mode 100644
index 000000000..bbfd15674
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java
@@ -0,0 +1,167 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class BlackListTest {
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static final ClassLoader cl = eu.dnetlib.dhp.blacklist.BlackListTest.class.getClassLoader();
+
+ private static SparkSession spark;
+
+ private static Path workingDir;
+ private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.blacklist.BlackListTest.class);
+
+ @BeforeAll
+ public static void beforeAll() throws IOException {
+ workingDir = Files.createTempDirectory(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName());
+ log.info("using work dir {}", workingDir);
+
+ SparkConf conf = new SparkConf();
+ conf.setAppName(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName());
+
+ conf.setMaster("local[*]");
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("spark.ui.enabled", "false");
+ conf.set("spark.sql.warehouse.dir", workingDir.toString());
+ conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+ spark = SparkSession
+ .builder()
+ .appName(BlackListTest.class.getSimpleName())
+ .config(conf)
+ .getOrCreate();
+ }
+
+ @AfterAll
+ public static void afterAll() throws IOException {
+ FileUtils.deleteDirectory(workingDir.toFile());
+ spark.stop();
+ }
+
+ /*
+ * String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String outputPath =
+ * parser.get("outputPath"); log.info("outputPath {}: ", outputPath); final String blacklistPath =
+ * parser.get("hdfsPath"); log.info("blacklistPath {}: ", blacklistPath); final String mergesPath =
+ * parser.get("mergesPath"); log.info("mergesPath {}: ", mergesPath);
+ */
+ @Test
+ public void noRemoveTest() throws Exception {
+ SparkRemoveBlacklistedRelationJob
+ .main(
+ new String[] {
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsNoRemoval").getPath(),
+ "-outputPath",
+ workingDir.toString() + "/relation",
+ "-hdfsPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
+ "-mergesPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(),
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ Assertions.assertEquals(13, tmp.count());
+
+ }
+
+ @Test
+ public void removeNoMergeMatchTest() throws Exception {
+ SparkRemoveBlacklistedRelationJob
+ .main(
+ new String[] {
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsOneRemoval").getPath(),
+ "-outputPath",
+ workingDir.toString() + "/relation",
+ "-hdfsPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
+ "-mergesPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(),
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ Assertions.assertEquals(12, tmp.count());
+
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));
+
+ Assertions
+ .assertEquals(
+ 0, verificationDataset
+ .filter(
+ "source = '40|corda__h2020::5161f53ab205d803c36b4c888fe7deef' and " +
+ "target = '20|dedup_wf_001::157af406bc653aa4d9749318b644de43'")
+ .count());
+
+ Assertions.assertEquals(0, verificationDataset.filter("relClass = 'hasParticipant'").count());
+ }
+
+ @Test
+ public void removeMergeMatchTest() throws Exception {
+ SparkRemoveBlacklistedRelationJob
+ .main(
+ new String[] {
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch").getPath(),
+ "-outputPath",
+ workingDir.toString() + "/relation",
+ "-hdfsPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
+ "-mergesPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRelOneMerge").getPath(),
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ Assertions.assertEquals(12, tmp.count());
+
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));
+
+ Assertions.assertEquals(12, verificationDataset.filter("relClass = 'isProvidedBy'").count());
+
+ }
+}
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist
new file mode 100644
index 000000000..ea95130af
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist
@@ -0,0 +1,20 @@
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"hasParticipant","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"isParticipant","source":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43","target":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::a727cc288016db7132ef9a799aa83350","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::a727cc288016db7132ef9a799aa83350"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::062cf091d5c7a7d730001c34177042e3","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::062cf091d5c7a7d730001c34177042e3"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::1b172ab34639e7935e2357119cf20830","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|od_______908::1b172ab34639e7935e2357119cf20830"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json
new file mode 100644
index 000000000..8f0d296d6
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json
@@ -0,0 +1,14 @@
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______177::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json
new file mode 100644
index 000000000..3d74ffa6e
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json
@@ -0,0 +1,14 @@
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json
new file mode 100644
index 000000000..761cba478
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProducedBy","relType":"resultProject","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"outcome","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json
new file mode 100644
index 000000000..a79d1d8eb
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::018cb61ed43c01704decc66183ce5d60","subRelType":"provision","target":"20|dedup_wf_001::b9fff055ce5efacecbe4ef918c127f86"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json
new file mode 100644
index 000000000..f809acfeb
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","subRelType":"participation","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala
new file mode 100644
index 000000000..189e90ed9
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala
@@ -0,0 +1,53 @@
+package eu.dnetlib.doiboost.mag
+
+
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+
+/** Typed row of the MAG "Papers" dataset; SparkPreProcessMAG loads it with .as[Papers], so names and types follow the stored columns. */
+case class Papers(PaperId:Long, Rank:Integer, Doi:String,
+ DocType:String, PaperTitle:String, OriginalTitle:String,
+ BookTitle:String, Year:Option[Integer], Date:Option[java.sql.Timestamp], Publisher:String,
+ JournalId:Option[Long], ConferenceSeriesId:Option[Long], ConferenceInstanceId:Option[Long],
+ Volume:String, Issue:String, FirstPage:String, LastPage:String,
+ ReferenceCount:Option[Long], CitationCount:Option[Long], EstimatedCitation:Option[Long],
+ OriginalVenue:String, FamilyId:Option[Long], CreatedDate:java.sql.Timestamp) {}
+
+/** Abstract of a paper: IndexedAbstract is the JSON inverted index when read from PaperAbstractsInvertedIndex, plain text after ConversionUtil.transformPaperAbstract. */
+case class PaperAbstract(PaperId:Long,IndexedAbstract:String) {}
+
+
+
+/** Helpers that turn MAG inverted-index abstracts into plain text. */
+case object ConversionUtil {
+
+  /** Returns a copy of `input` whose IndexedAbstract is the text rebuilt by convertInvertedIndexString. */
+  def transformPaperAbstract(input:PaperAbstract) : PaperAbstract = {
+    PaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
+  }
+
+  /**
+    * Rebuilds a text from a JSON document shaped like
+    * {"IndexLength": n, "InvertedIndex": {"token": [position, ...], ...}}.
+    *
+    * @param json_input the JSON inverted index
+    * @return the tokens joined by single spaces in position order; a position no token claims
+    *         yields an empty string, positions outside [0, n) are ignored; "" when n <= 0
+    */
+  def convertInvertedIndexString(json_input:String) :String = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: json4s.JValue = parse(json_input)
+
+    val idl = (json \ "IndexLength").extract[Int]
+    if (idl <= 0) ""
+    else {
+      // pre-fill with "" so that gaps in the index are not rendered as the literal "null"
+      val res = Array.fill[String](idl)("")
+      val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
+      // a token is written at each of its positions; out-of-range positions are skipped
+      for ((k, v) <- iid; item <- v if item >= 0 && item < idl) res(item) = k
+      res.mkString(" ")
+    }
+  }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
index 82ea48f33..f291a92f9 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
@@ -63,7 +63,7 @@ object SparkImportMagIntoDataset {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_mag_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala
new file mode 100644
index 000000000..4c014a95c
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala
@@ -0,0 +1,63 @@
+package eu.dnetlib.doiboost.mag
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import org.apache.commons.io.IOUtils
+import org.apache.spark.SparkConf
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+import org.apache.spark.sql.functions._
+
+/** Spark job pre-processing the MAG dump: one Papers row per DOI and abstracts converted to plain text. */
+object SparkPreProcessMAG {
+
+  /**
+    * Chooses, between two Papers rows sharing a DOI, the one with the most recent CreatedDate.
+    * A null row or a row without CreatedDate loses against the other one; p1 wins a tie.
+    */
+  private def latest(p1: Papers, p2: Papers): Papers = {
+    if (p1 == null) p2
+    else if (p2 == null) p1
+    else if (p1.CreatedDate == null) p2
+    else if (p2.CreatedDate == null) p1
+    else if (p1.CreatedDate.before(p2.CreatedDate)) p2
+    else p1
+  }
+
+  /** Entry point; expects the sourcePath, targetPath and master arguments declared in preprocess_mag_params.json. */
+  def main(args: Array[String]): Unit = {
+    val logger: Logger = LoggerFactory.getLogger(getClass)
+    val conf: SparkConf = new SparkConf()
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
+    parser.parseArgument(args)
+    val spark: SparkSession =
+      SparkSession
+        .builder()
+        .config(conf)
+        .appName(getClass.getSimpleName)
+        .master(parser.get("master")).getOrCreate()
+    import spark.implicits._
+
+    val sourcePath = parser.get("sourcePath")
+    val targetPath = parser.get("targetPath")
+
+    logger.info("Phase 1) make uninque DOI in Papers:")
+    val d: Dataset[Papers] = spark.read.load(s"${sourcePath}/Papers").as[Papers]
+
+    // Keep only papers with a DOI; the same DOI can appear under several PaperIds, so keep the latest row.
+    val result: RDD[Papers] = d.where(col("Doi").isNotNull).rdd
+      .map { p: Papers => Tuple2(p.Doi, p) }
+      .reduceByKey((p1: Papers, p2: Papers) => latest(p1, p2))
+      .map(_._2)
+
+    val distinctPaper: Dataset[Papers] = spark.createDataset(result)
+    distinctPaper.write.mode(SaveMode.Overwrite).save(s"${targetPath}/Papers_distinct")
+    logger.info(s"Total number of element: ${result.count()}")
+
+    logger.info("Phase 2) convert InverdIndex Abastrac to string")
+    val pa = spark.read.load(s"${sourcePath}/PaperAbstractsInvertedIndex").as[PaperAbstract]
+    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${targetPath}/PaperAbstract")
+
+    // TODO: join distinctPaper with the abstracts on PaperId; the dangling `col("PaperId").eqia` call did not compile.
+  }
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_mag_to_oaf_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json
similarity index 100%
rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_mag_to_oaf_params.json
rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml
index 801dca612..ba6eea364 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml
@@ -34,7 +34,7 @@
-
+
@@ -59,5 +59,28 @@
+
+
+
+
+ yarn-cluster
+ cluster
+ Convert Mag to Dataset
+ eu.dnetlib.doiboost.mag.SparkPreProcessMAG
+ dhp-doiboost-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ ${sparkExtraOPT}
+
+ --sourcePath${sourcePath}
+ --targetPath${targetPath}
+ --masteryarn-cluster
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json
new file mode 100644
index 000000000..bf0b80f69
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json
@@ -0,0 +1,6 @@
+[
+ {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
+ {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
+ {"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/CrossrefMappingTest.scala
index 75a63d70f..2d7cf4216 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/CrossrefMappingTest.scala
@@ -1,20 +1,15 @@
package eu.dnetlib.doiboost
-import com.fasterxml.jackson.databind.SerializationFeature
-import eu.dnetlib.dhp.schema.oaf.{Dataset, KeyValue, Oaf, Publication, Relation, Result}
+import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.utils.DHPUtils
-import eu.dnetlib.doiboost.crossref.{Crossref2Oaf, SparkMapDumpIntoOAF}
-import eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset
-import org.apache.spark.{SparkConf, sql}
-import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import eu.dnetlib.doiboost.crossref.Crossref2Oaf
import org.codehaus.jackson.map.ObjectMapper
-import org.junit.jupiter.api.Test
-
-import scala.io.Source
import org.junit.jupiter.api.Assertions._
+import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
+import scala.io.Source
import scala.util.matching.Regex
@@ -24,12 +19,6 @@ class CrossrefMappingTest {
val mapper = new ObjectMapper()
-
- def testMAGCSV() :Unit = {
- SparkImportMagIntoDataset.main(null)
- }
-
-
@Test
def testFunderRelationshipsMapping(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/DatasetModel.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/DatasetModel.scala
deleted file mode 100644
index 07235d770..000000000
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/DatasetModel.scala
+++ /dev/null
@@ -1,14 +0,0 @@
-package eu.dnetlib.doiboost.mag
-
-
-case class Papers(PaperId:Long, Rank:Integer, Doi:String,
- DocType:String, PaperTitle:String, OriginalTitle:String,
- BookTitle:String, Year:Option[Integer], Date:Option[java.sql.Timestamp], Publisher:String,
- JournalId:Option[Long], ConferenceSeriesId:Option[Long], ConferenceInstanceId:Option[Long],
- Volume:String, Issue:String, FirstPage:String, LastPage:String,
- ReferenceCount:Option[Long], CitationCount:Option[Long], EstimatedCitation:Option[Long],
- OriginalVenue:String, FamilyId:Option[Long], CreatedDate:java.sql.Timestamp) {}
-
-
-
-
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala
index f60e10cf5..0aaaeb377 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala
@@ -1,13 +1,10 @@
package eu.dnetlib.doiboost.mag
-import org.apache.spark.SparkConf
-import org.apache.spark.api.java.function.ReduceFunction
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.ObjectMapper
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
-import org.apache.spark.sql.functions._
+import org.junit.jupiter.api.Assertions._
+import scala.io.Source
class MAGMappingTest {
@@ -18,34 +15,18 @@ class MAGMappingTest {
//@Test
def testMAGCSV(): Unit = {
-
- val conf: SparkConf = new SparkConf()
- val spark: SparkSession =
- SparkSession
- .builder()
- .config(conf)
- .appName(getClass.getSimpleName)
- .master("local[*]").getOrCreate()
+ SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))
+ }
- import spark.implicits._
- val d: Dataset[Papers] = spark.read.load("/data/doiboost/mag/datasets/Papers").as[Papers]
- logger.info(s"Total number of element: ${d.where(col("Doi").isNotNull).count()}")
- //implicit val mapEncoder = org.apache.spark.sql.Encoders.bean[Papers]
- val result: RDD[Papers] = d.where(col("Doi").isNotNull).rdd.map { p: Papers => Tuple2(p.Doi, p) }.reduceByKey {case (p1:Papers, p2:Papers) =>
- var r = if (p1==null) p2 else p1
- if (p1!=null && p2!=null ) if (p1.CreatedDate.before(p2.CreatedDate))
- r = p1
- else
- r = p2
- r
- }.map(_._2)
-
-
- val distinctPaper:Dataset[Papers] = spark.createDataset(result)
- distinctPaper.write.mode(SaveMode.Overwrite).save("/data/doiboost/mag/datasets/Papers_d")
- logger.info(s"Total number of element: ${result.count()}")
+  @Test
+  def buildInvertedIndexTest() :Unit = {
+    val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString
+    val description = ConversionUtil.convertInvertedIndexString(json_input)
+    assertTrue(description.split(" ").length == 139) // invertedIndex.json fills every one of its 139 positions
+    assertTrue(description.startsWith("The invention discloses a treatment method of waste mash")) // tokens come back in position order
+    logger.debug(description)
+  }
diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/invertedIndex.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/invertedIndex.json
new file mode 100644
index 000000000..0a84e330d
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/invertedIndex.json
@@ -0,0 +1,334 @@
+{
+ "IndexLength": 139,
+ "InvertedIndex": {
+ "The": [
+ 0,
+ 23,
+ 47
+ ],
+ "invention": [
+ 1,
+ 53
+ ],
+ "discloses": [
+ 2
+ ],
+ "a": [
+ 3,
+ 10,
+ 71,
+ 81,
+ 121
+ ],
+ "treatment": [
+ 4,
+ 69,
+ 85,
+ 96
+ ],
+ "method": [
+ 5,
+ 24,
+ 49
+ ],
+ "of": [
+ 6,
+ 9,
+ 19,
+ 57,
+ 84,
+ 117,
+ 120
+ ],
+ "waste": [
+ 7,
+ 118
+ ],
+ "mash": [
+ 8,
+ 119
+ ],
+ "cane": [
+ 11,
+ 122
+ ],
+ "sugar": [
+ 12,
+ 123
+ ],
+ "factory,": [
+ 13
+ ],
+ "belonging": [
+ 14
+ ],
+ "to": [
+ 15
+ ],
+ "the": [
+ 16,
+ 26,
+ 52,
+ 55,
+ 66,
+ 93,
+ 115,
+ 135
+ ],
+ "technical": [
+ 17,
+ 48
+ ],
+ "field": [
+ 18
+ ],
+ "industrial": [
+ 20
+ ],
+ "wastewater": [
+ 21
+ ],
+ "treatment.": [
+ 22
+ ],
+ "comprises": [
+ 25
+ ],
+ "following": [
+ 27
+ ],
+ "steps": [
+ 28
+ ],
+ "of:": [
+ 29
+ ],
+ "(1)": [
+ 30
+ ],
+ "pretreatment;": [
+ 31
+ ],
+ "(2)": [
+ 32
+ ],
+ "primary": [
+ 33
+ ],
+ "concentration;": [
+ 34
+ ],
+ "(3)": [
+ 35
+ ],
+ "cooling": [
+ 36
+ ],
+ "sedimentation": [
+ 37
+ ],
+ "and": [
+ 38,
+ 45,
+ 62,
+ 80,
+ 86,
+ 114,
+ 134
+ ],
+ "dense": [
+ 39
+ ],
+ "slurry": [
+ 40
+ ],
+ "drying;": [
+ 41
+ ],
+ "(4)": [
+ 42
+ ],
+ "secondary": [
+ 43
+ ],
+ "concentration": [
+ 44
+ ],
+ "drying.": [
+ 46
+ ],
+ "disclosed": [
+ 50
+ ],
+ "by": [
+ 51
+ ],
+ "has": [
+ 54
+ ],
+ "advantages": [
+ 56
+ ],
+ "small": [
+ 58
+ ],
+ "investment,": [
+ 59
+ ],
+ "simple": [
+ 60
+ ],
+ "equipment": [
+ 61
+ ],
+ "easiness": [
+ 63
+ ],
+ "in": [
+ 64,
+ 132
+ ],
+ "popularization;": [
+ 65
+ ],
+ "product": [
+ 67
+ ],
+ "after": [
+ 68
+ ],
+ "is": [
+ 70,
+ 91,
+ 98,
+ 102,
+ 112,
+ 130,
+ 137
+ ],
+ "high-quality": [
+ 72
+ ],
+ "high": [
+ 73
+ ],
+ "value-added": [
+ 74
+ ],
+ "(fully": [
+ 75
+ ],
+ "water-soluble)": [
+ 76
+ ],
+ "potassium": [
+ 77
+ ],
+ "humate": [
+ 78
+ ],
+ "product,": [
+ 79
+ ],
+ "new": [
+ 82
+ ],
+ "mode": [
+ 83
+ ],
+ "profit": [
+ 87
+ ],
+ "enabling": [
+ 88
+ ],
+ "sustainable": [
+ 89
+ ],
+ "development": [
+ 90
+ ],
+ "realized;": [
+ 92
+ ],
+ "environmental": [
+ 94
+ ],
+ "protection": [
+ 95
+ ],
+ "effect": [
+ 97
+ ],
+ "good,": [
+ 99
+ ],
+ "water": [
+ 100,
+ 106
+ ],
+ "balance": [
+ 101
+ ],
+ "realized": [
+ 103
+ ],
+ "through": [
+ 104
+ ],
+ "final": [
+ 105
+ ],
+ "quality": [
+ 107
+ ],
+ "treatment,": [
+ 108
+ ],
+ "real": [
+ 109
+ ],
+ "zero": [
+ 110
+ ],
+ "emission": [
+ 111
+ ],
+ "realized,": [
+ 113
+ ],
+ "problem": [
+ 116
+ ],
+ "factory": [
+ 124
+ ],
+ "can": [
+ 125
+ ],
+ "be": [
+ 126
+ ],
+ "solved": [
+ 127
+ ],
+ "fundamentally;": [
+ 128
+ ],
+ "energy": [
+ 129
+ ],
+ "saved": [
+ 131
+ ],
+ "operation,": [
+ 133
+ ],
+ "feasibility": [
+ 136
+ ],
+ "high.": [
+ 138
+ ]
+ }
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml
new file mode 100644
index 000000000..fe9833e3e
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/pom.xml
@@ -0,0 +1,64 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-enrichment
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+ org.apache.spark
+ spark-hive_2.11
+ test
+
+
+
+ dom4j
+ dom4j
+
+
+ jaxen
+ jaxen
+
+
+ com.jayway.jsonpath
+ json-path
+
+
+
+ io.github.classgraph
+ classgraph
+ 4.8.71
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java
new file mode 100644
index 000000000..8d2fede82
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java
@@ -0,0 +1,169 @@
+
+package eu.dnetlib.dhp;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class PropagationConstant {
+ public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional";
+
+ public static final String PROPAGATION_DATA_INFO_TYPE = "propagation";
+
+ public static final String TRUE = "true";
+
+ public static final String DNET_COUNTRY_SCHEMA = "dnet:countries";
+ public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
+ public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
+
+ public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
+ public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
+
+ public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo";
+ public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository";
+
+ public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel";
+ public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation";
+
+ public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel";
+ public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation";
+
+ public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization";
+ public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization";
+
+ public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
+ public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
+
+ public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy";
+
+ public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization";
+ public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation";
+ public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf";
+ public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution";
+
+ public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult";
+
+ public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject";
+ public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome";
+ public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy";
+ public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces";
+
+ public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges";
+
+ public static final String PROPAGATION_AUTHOR_PID = "ORCID";
+
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+ +
+ "from result r " +
+ "lateral view explode(instance) i as inst " +
+ "where r.datainfo.deletedbyinference=false";
+
+ /**
+  * Creates the {@link Country} qualifier that the country propagation attaches to a result.
+  *
+  * The country uses {@code dnet:countries} as both scheme id and scheme name and carries the
+  * provenance information of the "country from institutional repository" propagation.
+  *
+  * @param classid the class id of the country
+  * @param classname the class name of the country
+  * @return the newly created country
+  */
+ public static Country getCountry(String classid, String classname) {
+     final DataInfo provenance = getDataInfo(
+         PROPAGATION_DATA_INFO_TYPE,
+         PROPAGATION_COUNTRY_INSTREPO_CLASS_ID,
+         PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME);
+
+     final Country country = new Country();
+     country.setClassid(classid);
+     country.setClassname(classname);
+     country.setSchemeid(DNET_COUNTRY_SCHEMA);
+     country.setSchemename(DNET_COUNTRY_SCHEMA);
+     country.setDataInfo(provenance);
+     return country;
+ }
+
+ /**
+  * Creates the {@link DataInfo} describing an inferred (propagated) piece of information.
+  *
+  * The returned object is flagged as inferred, not deleted by inference, and has trust "0.85".
+  *
+  * @param inference_provenance the inference provenance to record
+  * @param inference_class_id class id of the provenance action
+  * @param inference_class_name class name of the provenance action
+  * @return the newly created data info
+  */
+ public static DataInfo getDataInfo(
+     String inference_provenance, String inference_class_id, String inference_class_name) {
+     final Qualifier provenanceAction = getQualifier(inference_class_id, inference_class_name);
+
+     final DataInfo info = new DataInfo();
+     info.setInferred(true);
+     info.setDeletedbyinference(false);
+     info.setTrust("0.85");
+     info.setInferenceprovenance(inference_provenance);
+     info.setProvenanceaction(provenanceAction);
+     return info;
+ }
+
+ /**
+  * Creates a provenance-action {@link Qualifier} in the {@code dnet:provenanceActions} scheme.
+  *
+  * @param inference_class_id the class id of the qualifier
+  * @param inference_class_name the class name of the qualifier
+  * @return the newly created qualifier
+  */
+ public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
+     final Qualifier qualifier = new Qualifier();
+     qualifier.setSchemeid(DNET_SCHEMA_ID);
+     qualifier.setSchemename(DNET_SCHEMA_NAME);
+     qualifier.setClassid(inference_class_id);
+     qualifier.setClassname(inference_class_name);
+     return qualifier;
+ }
+
+ /**
+  * Creates an inferred {@link Relation} between two entities.
+  *
+  * @param source identifier of the source entity
+  * @param target identifier of the target entity
+  * @param rel_class relation class
+  * @param rel_type relation type
+  * @param subrel_type relation sub-type
+  * @param inference_provenance inference provenance stored in the relation data info
+  * @param inference_class_id class id of the provenance action
+  * @param inference_class_name class name of the provenance action
+  * @return the newly created relation
+  */
+ public static Relation getRelation(
+     String source,
+     String target,
+     String rel_class,
+     String rel_type,
+     String subrel_type,
+     String inference_provenance,
+     String inference_class_id,
+     String inference_class_name) {
+     final DataInfo provenance = getDataInfo(
+         inference_provenance, inference_class_id, inference_class_name);
+
+     final Relation relation = new Relation();
+     relation.setSource(source);
+     relation.setTarget(target);
+     relation.setRelType(rel_type);
+     relation.setSubRelType(subrel_type);
+     relation.setRelClass(rel_class);
+     relation.setDataInfo(provenance);
+     return relation;
+ }
+
+ /**
+  * Builds the SQL fragment {@code " and (<text><c0>' OR <text><c1>' ...)"} out of the given values.
+  *
+  * {@code text} is expected to end with the opening quote of the comparison (for instance
+  * {@code "relClass = '"}); only the closing quote is appended after each value.
+  * NOTE(review): the values are concatenated verbatim into the clause, so they must come from
+  * trusted configuration - they are not escaped here.
+  *
+  * @param text the left part of each comparison, opening quote included
+  * @param constraints the values to compare against; must contain at least one element
+  * @return the clause, starting with {@code " and ("} and ending with {@code ")"}
+  * @throws IllegalArgumentException if {@code constraints} is null or empty, since no meaningful
+  *         clause can be produced in that case
+  */
+ public static String getConstraintList(String text, List<String> constraints) {
+     if (constraints == null || constraints.isEmpty()) {
+         throw new IllegalArgumentException(
+             "at least one constraint value is required to build the clause for: " + text);
+     }
+
+     final StringBuilder clause = new StringBuilder(" and (");
+     for (int i = 0; i < constraints.size(); i++) {
+         if (i > 0) {
+             clause.append(" OR ");
+         }
+         clause.append(text).append(constraints.get(i)).append("'");
+     }
+     return clause.append(")").toString();
+ }
+
+ /**
+ * Removes {@code path} by delegating to {@code HdfsSupport.remove}, passing the Hadoop
+ * configuration of the Spark context behind the given session.
+ *
+ * @param spark the session whose Hadoop configuration is used
+ * @param path the path to remove
+ */
+ public static void removeOutputDir(SparkSession spark, String path) {
+ HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+ }
+
+ /**
+  * Reads the {@code isSparkSessionManaged} argument.
+  *
+  * @param parser the parsed job arguments
+  * @return the parsed boolean, or {@code TRUE} when the argument is absent
+  */
+ public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
+     final String raw = parser.get("isSparkSessionManaged");
+     if (raw == null) {
+         return Boolean.TRUE;
+     }
+     return Boolean.valueOf(raw);
+ }
+
+ /**
+  * Reads the {@code isTest} argument.
+  *
+  * @param parser the parsed job arguments
+  * @return the parsed boolean, or {@code FALSE} when the argument is absent
+  */
+ public static Boolean isTest(ArgumentApplicationParser parser) {
+     final String raw = parser.get("isTest");
+     if (raw == null) {
+         return Boolean.FALSE;
+     }
+     return Boolean.valueOf(raw);
+ }
+
+ /**
+  * Runs the collectedfrom/hostedby query over the {@code result} table and registers its outcome
+  * as the temporary view {@code cfhb}, replacing any view already registered under that name.
+  *
+  * @param spark the session used to run the query and to hold the view
+  */
+ public static void createCfHbforResult(SparkSession spark) {
+     spark.sql(cfHbforResultQuery).createOrReplaceTempView("cfhb");
+ }
+
+ /**
+ * Reads the text at {@code inputPath} and converts every line into an instance of {@code clazz}
+ * through {@code OBJECT_MAPPER}; the resulting dataset uses a bean encoder for {@code clazz}.
+ * NOTE(review): the generic type parameters of this method (on Dataset, Class and MapFunction)
+ * appear to be missing from this listing - confirm against the repository version.
+ *
+ * @param spark the session used to read the input
+ * @param inputPath location of the serialized (one JSON document per line) input
+ * @param clazz the class every line is deserialized into
+ * @return the dataset of deserialized objects
+ */
+ public static Dataset readPath(
+ SparkSession spark, String inputPath, Class clazz) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
new file mode 100644
index 000000000..75d85e2ba
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
@@ -0,0 +1,120 @@
+
+package eu.dnetlib.dhp.bulktag;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.bulktag.community.*;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class SparkBulkTagJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob.class);
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ /**
+ * Entry point of the bulk tagging job: parses the arguments described by
+ * input_bulkTag_parameters.json, loads the community configuration and runs the tagging
+ * inside a Spark session.
+ *
+ * @param args the command line arguments of the job
+ * @throws Exception if the arguments, the result class or the community configuration cannot be loaded
+ */
+ public static void main(String[] args) throws Exception {
+ // the description of the accepted arguments is bundled as a classpath resource
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkBulkTagJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ // defaults to true when the argument is not given
+ Boolean isSparkSessionManaged = Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ // defaults to false when the argument is not given
+ Boolean isTest = Optional
+ .ofNullable(parser.get("isTest"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.FALSE);
+ log.info("isTest: {} ", isTest);
+
+ final String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ // pathMap is a JSON object, deserialized into a ProtoMap
+ ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class);
+ log.info("pathMap: {}", new Gson().toJson(protoMappingParams));
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ // NOTE(review): saveGraph is parsed and logged but not used anywhere else in this method
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ // NOTE(review): the wildcard type arguments on this line look truncated in this listing - confirm
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ CommunityConfiguration cc;
+
+ String taggingConf = parser.get("taggingConf");
+
+ // in test mode the configuration is parsed from the taggingConf argument,
+ // otherwise it is obtained through the lookup service at isLookUpUrl
+ if (isTest) {
+ cc = CommunityConfigurationFactory.newInstance(taggingConf);
+ } else {
+ cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookUpUrl"));
+ }
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
+ });
+ }
+
+ /**
+ * Reads the results at {@code inputPath}, passes each of them to
+ * {@code ResultTagger.enrichContextCriteria} and writes the outcome to {@code outputPath} as
+ * gzip-compressed JSON, overwriting what is already there.
+ * NOTE(review): the type arguments of {@code Class} and {@code MapFunction} appear to be missing
+ * from this listing - confirm against the repository version.
+ *
+ * @param spark the session running the job
+ * @param inputPath location of the results to tag
+ * @param outputPath location where the tagged results are written
+ * @param protoMappingParams the map deserialized from the pathMap argument, handed to the tagger
+ * @param resultClazz the class of the results being processed
+ * @param communityConfiguration the community configuration handed to the tagger
+ */
+ private static void execBulkTag(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ ProtoMap protoMappingParams,
+ Class resultClazz,
+ CommunityConfiguration communityConfiguration) {
+
+ // a single tagger instance is captured by the map function below
+ ResultTagger resultTagger = new ResultTagger();
+ readPath(spark, inputPath, resultClazz)
+ .map(
+ (MapFunction) value -> resultTagger
+ .enrichContextCriteria(
+ value, communityConfiguration, protoMappingParams),
+ Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ /**
+ * Reads the text at {@code inputPath} and converts every line into an instance of {@code clazz}
+ * through {@code OBJECT_MAPPER}; the resulting dataset uses a bean encoder for {@code clazz}.
+ * NOTE(review): the body is the same as PropagationConstant.readPath; the generic type
+ * parameters appear to be missing from this listing - confirm against the repository version.
+ *
+ * @param spark the session used to read the input
+ * @param inputPath location of the serialized input
+ * @param clazz the class every line is deserialized into
+ * @return the dataset of deserialized objects
+ */
+ public static Dataset readPath(
+ SparkSession spark, String inputPath, Class clazz) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java
new file mode 100644
index 000000000..0f45d3beb
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.gson.Gson;
+
+/** Created by miriam on 01/08/2018. */
+public class Community implements Serializable {
+
+ private static final Log log = LogFactory.getLog(Community.class);
+
+ private String id;
+ private List subjects = new ArrayList<>();
+ private List providers = new ArrayList<>();
+ private List zenodoCommunities = new ArrayList<>();
+
+ /**
+  * Serializes this community with a default {@link Gson} instance.
+  *
+  * @return the JSON representation of this community
+  */
+ public String toJson() {
+     return new Gson().toJson(this);
+ }
+
+ /**
+  * Tells whether the community defines at least one tagging criterion.
+  *
+  * @return {@code true} unless subjects, providers and zenodo communities are all empty
+  */
+ public boolean isValid() {
+     final boolean nothingConfigured = getSubjects().isEmpty()
+         && getProviders().isEmpty()
+         && getZenodoCommunities().isEmpty();
+     return !nothingConfigured;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public List getSubjects() {
+ return subjects;
+ }
+
+ public void setSubjects(List subjects) {
+ this.subjects = subjects;
+ }
+
+ public List getProviders() {
+ return providers;
+ }
+
+ public void setProviders(List providers) {
+ this.providers = providers;
+ }
+
+ public List getZenodoCommunities() {
+ return zenodoCommunities;
+ }
+
+ public void setZenodoCommunities(List zenodoCommunities) {
+ this.zenodoCommunities = zenodoCommunities;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java
new file mode 100644
index 000000000..29ddde15f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java
@@ -0,0 +1,196 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+
+/** Created by miriam on 02/08/2018. */
+public class CommunityConfiguration implements Serializable {
+
+ private static final Log log = LogFactory.getLog(CommunityConfiguration.class);
+
+ private Map communities;
+
+ // map subject -> communityid
+ private Map>> subjectMap = new HashMap<>();
+ // map datasourceid -> communityid
+ private Map>> datasourceMap = new HashMap<>();
+ // map zenodocommunityid -> communityid
+ private Map>> zenodocommunityMap = new HashMap<>();
+
+ public Map>> getSubjectMap() {
+ return subjectMap;
+ }
+
+ public void setSubjectMap(Map>> subjectMap) {
+ this.subjectMap = subjectMap;
+ }
+
+ public Map>> getDatasourceMap() {
+ return datasourceMap;
+ }
+
+ public void setDatasourceMap(
+ Map>> datasourceMap) {
+ this.datasourceMap = datasourceMap;
+ }
+
+ public Map>> getZenodocommunityMap() {
+ return zenodocommunityMap;
+ }
+
+ public void setZenodocommunityMap(
+ Map>> zenodocommunityMap) {
+ this.zenodocommunityMap = zenodocommunityMap;
+ }
+
+ CommunityConfiguration(final Map communities) {
+ this.communities = communities;
+ init();
+ }
+
+ /**
+  * Fills the three inverse maps (subject, datasource id and zenodo community id, each pointing
+  * to the communities that declare it) from the configured communities.
+  *
+  * Subjects are lower-cased and trimmed before being used as keys and are paired with empty
+  * selection constraints; datasources and zenodo communities keep their own constraints.
+  */
+ void init() {
+     // make sure the three maps exist before filling them
+     if (subjectMap == null) {
+         subjectMap = Maps.newHashMap();
+     }
+     if (datasourceMap == null) {
+         datasourceMap = Maps.newHashMap();
+     }
+     if (zenodocommunityMap == null) {
+         zenodocommunityMap = Maps.newHashMap();
+     }
+
+     for (Community community : getCommunities().values()) {
+         final String communityId = community.getId();
+
+         for (String subject : community.getSubjects()) {
+             add(
+                 subject.toLowerCase().trim(),
+                 new Pair<>(communityId, new SelectionConstraints()),
+                 subjectMap);
+         }
+
+         for (Provider provider : community.getProviders()) {
+             add(
+                 provider.getOpenaireId(),
+                 new Pair<>(communityId, provider.getSelectionConstraints()),
+                 datasourceMap);
+         }
+
+         for (ZenodoCommunity zenodo : community.getZenodoCommunities()) {
+             add(
+                 zenodo.getZenodoCommunityId(),
+                 new Pair<>(communityId, zenodo.getSelCriteria()),
+                 zenodocommunityMap);
+         }
+     }
+ }
+
+ /**
+  * Appends {@code value} to the list registered under {@code key}, creating the list on first
+  * use.
+  *
+  * @param key the subject, datasource id or zenodo community id
+  * @param value the (community id, selection constraints) pair to register
+  * @param map the inverse map to update
+  */
+ private void add(
+     String key,
+     Pair<String, SelectionConstraints> value,
+     Map<String, List<Pair<String, SelectionConstraints>>> map) {
+     map.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
+ }
+
+ public List> getCommunityForSubject(String sbj) {
+ return subjectMap.get(sbj);
+ }
+
+ public List> getCommunityForDatasource(String dts) {
+ return datasourceMap.get(dts);
+ }
+
+ /**
+  * Returns the ids of the communities associated with the datasource {@code dts} whose
+  * selection constraints are satisfied by {@code param}.
+  *
+  * A community registered without constraints (null) is always selected.
+  *
+  * @param dts the datasource identifier
+  * @param param the values, grouped by field name, the constraints are verified against
+  * @return the matching community ids; an empty list when the datasource is unknown
+  */
+ public List<String> getCommunityForDatasource(
+     final String dts, final Map<String, List<String>> param) {
+     final List<Pair<String, SelectionConstraints>> candidates = datasourceMap.get(dts);
+     if (candidates == null) {
+         return Lists.newArrayList();
+     }
+
+     return candidates
+         .stream()
+         .filter(p -> p.getSnd() == null || p.getSnd().verifyCriteria(param))
+         .map(Pair::getFst)
+         // as before, pairs without a community id are not reported
+         .filter(id -> id != null)
+         .collect(Collectors.toList());
+ }
+
+ public List> getCommunityForZenodoCommunity(String zc) {
+ return zenodocommunityMap.get(zc);
+ }
+
+ public List getCommunityForSubjectValue(String value) {
+
+ return getContextIds(subjectMap.get(value));
+ }
+
+ public List getCommunityForDatasourceValue(String value) {
+
+ return getContextIds(datasourceMap.get(value.toLowerCase()));
+ }
+
+ public List getCommunityForZenodoCommunityValue(String value) {
+
+ return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
+ }
+
+ /**
+  * Extracts the community ids (first component) from a list of pairs.
+  *
+  * @param list the pairs to read, possibly null
+  * @return the community ids in list order; an empty list when {@code list} is null
+  */
+ private List<String> getContextIds(List<Pair<String, SelectionConstraints>> list) {
+     if (list == null) {
+         return Lists.newArrayList();
+     }
+     return list.stream().map(Pair::getFst).collect(Collectors.toList());
+ }
+
+ public Map getCommunities() {
+ return communities;
+ }
+
+ public void setCommunities(Map communities) {
+ this.communities = communities;
+ }
+
+ public String toJson() {
+ GsonBuilder builder = new GsonBuilder();
+ builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
+ Gson gson = builder.create();
+
+ return gson.toJson(this);
+ }
+
+ /**
+  * @return the number of configured communities
+  */
+ public int size() {
+     return communities.size();
+ }
+
+ public Community getCommunityById(String id) {
+ return communities.get(id);
+ }
+
+ public List getCommunityList() {
+ return Lists.newLinkedList(communities.values());
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java
new file mode 100644
index 000000000..607315f3f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java
@@ -0,0 +1,138 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
+
+/** Created by miriam on 03/08/2018. */
+public class CommunityConfigurationFactory {
+
+ private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);
+
+ private static VerbResolver resolver = VerbResolverFactory.newInstance();
+
+ /**
+  * Parses the XML community configuration and builds the corresponding
+  * {@link CommunityConfiguration}.
+  *
+  * Every {@code community} element found anywhere in the document is parsed; communities that
+  * are not valid (see {@code Community.isValid()}) are left out.
+  *
+  * @param xml the XML document describing the communities
+  * @return the configuration holding the valid communities, indexed by id
+  * @throws DocumentException if {@code xml} cannot be parsed
+  */
+ public static CommunityConfiguration newInstance(final String xml) throws DocumentException {
+
+     log.debug(String.format("parsing community configuration from:\n%s", xml));
+
+     final Document doc = new SAXReader().read(new StringReader(xml));
+     final Map<String, Community> communities = Maps.newHashMap();
+
+     for (final Object candidate : doc.selectNodes("//community")) {
+         final Community community = parseCommunity((Node) candidate);
+         if (!community.isValid()) {
+             continue;
+         }
+         communities.put(community.getId(), community);
+     }
+
+     log.info(String.format("loaded %s community configuration profiles", communities.size()));
+     log.debug(String.format("loaded community configuration:\n%s", communities.toString()));
+
+     return new CommunityConfiguration(communities);
+ }
+
+ /**
+  * Rebuilds a {@link CommunityConfiguration} from its JSON form and initialises its inverse maps.
+  *
+  * @param json the JSON representation of the configuration
+  * @return the deserialized, initialised configuration
+  */
+ public static CommunityConfiguration fromJson(final String json) {
+     // Selection is deserialized through the dedicated adapter
+     final Gson gson = new GsonBuilder()
+         .registerTypeAdapter(Selection.class, new InterfaceAdapter())
+         .create();
+
+     final CommunityConfiguration configuration = gson.fromJson(json, CommunityConfiguration.class);
+     log.info(String.format("loaded %s community configuration profiles", configuration.size()));
+
+     configuration.init();
+     log.info("created inverse maps");
+
+     return configuration;
+ }
+
+ /**
+  * Builds a {@link Community} from a {@code community} node: its id attribute, subjects,
+  * datasources and zenodo communities.
+  *
+  * @param node the {@code community} node
+  * @return the parsed community
+  */
+ private static Community parseCommunity(final Node node) {
+     final Community community = new Community();
+     community.setId(node.valueOf("./@id"));
+     log.info(String.format("community id: %s", community.getId()));
+
+     community.setSubjects(parseSubjects(node));
+     community.setProviders(parseDatasources(node));
+     community.setZenodoCommunities(parseZenodoCommunities(node));
+
+     return community;
+ }
+
+ /**
+ * Collects the trimmed text of every {@code subjects/subject} child of the community node.
+ * NOTE(review): the List type arguments appear to be missing from this listing - confirm
+ * against the repository version.
+ *
+ * @param node the {@code community} node
+ * @return the subjects of the community, in document order
+ */
+ private static List parseSubjects(final Node node) {
+
+ final List subjects = Lists.newArrayList();
+
+ final List list = node.selectNodes("./subjects/subject");
+
+ for (Node n : list) {
+ log.debug("text of the node " + n.getText());
+ subjects.add(StringUtils.trim(n.getText()));
+ }
+ log.info("size of the subject list " + subjects.size());
+ return subjects;
+ }
+
+ /**
+ * Builds one {@code Provider} for every {@code datasources/datasource} child of the community
+ * node, reading its {@code openaireId} and its optional {@code selcriteria}.
+ * NOTE(review): the List type arguments appear to be missing from this listing - confirm
+ * against the repository version.
+ *
+ * @param node the {@code community} node
+ * @return the providers of the community, in document order
+ */
+ private static List parseDatasources(final Node node) {
+ final List list = node.selectNodes("./datasources/datasource");
+ final List providerList = new ArrayList<>();
+ for (Node n : list) {
+ Provider d = new Provider();
+ // NOTE(review): a datasource without an openaireId child would make this line fail with a
+ // NullPointerException - confirm the element is always present
+ d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
+ // Provider.setSelCriteria falls back to null constraints when the criteria cannot be set
+ d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
+ providerList.add(d);
+ }
+ log.info("size of the datasource list " + providerList.size());
+ return providerList;
+ }
+
+ /**
+ * Builds the zenodo communities of a community node: one for every
+ * {@code zenodocommunities/zenodocommunity} child, plus one more, without selection criteria,
+ * for the non-blank text of the {@code oacommunity} child when there is one.
+ * NOTE(review): the List type arguments appear to be missing from this listing - confirm
+ * against the repository version.
+ *
+ * @param node the {@code community} node
+ * @return the zenodo communities; the {@code oacommunity} entry, if any, comes last
+ */
+ private static List parseZenodoCommunities(final Node node) {
+ final Node oacommunitynode = node.selectSingleNode("./oacommunity");
+ String oacommunity = null;
+ if (oacommunitynode != null) {
+ String tmp = oacommunitynode.getText();
+ if (StringUtils.isNotBlank(tmp))
+ oacommunity = tmp;
+ }
+
+ final List list = node.selectNodes("./zenodocommunities/zenodocommunity");
+ final List zenodoCommunityList = new ArrayList<>();
+ for (Node n : list) {
+ ZenodoCommunity zc = new ZenodoCommunity();
+ // NOTE(review): a zenodocommunity without a zenodoid child would make this line fail with a
+ // NullPointerException - confirm the element is always present
+ zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText());
+ zc.setSelCriteria(n.selectSingleNode("./selcriteria"));
+
+ zenodoCommunityList.add(zc);
+ }
+ // the oacommunity value is registered as an additional zenodo community id
+ if (oacommunity != null) {
+ ZenodoCommunity zc = new ZenodoCommunity();
+ zc.setZenodoCommunityId(oacommunity);
+ zenodoCommunityList.add(zc);
+ }
+ log.info("size of the zenodo community list " + zenodoCommunityList.size());
+ return zenodoCommunityList;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
new file mode 100644
index 000000000..e0856ae8f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+public class Constraint implements Serializable {
+ private String verb;
+ private String field;
+ private String value;
+ private Selection selection;
+
+ public Constraint() {
+ }
+
+ public String getVerb() {
+ return verb;
+ }
+
+ public void setVerb(String verb) {
+ this.verb = verb;
+ }
+
+ public String getField() {
+ return field;
+ }
+
+ public void setField(String field) {
+ this.field = field;
+ }
+
+ public String getValue() {
+ return value;
+ }
+
+ public void setValue(String value) {
+ this.value = value;
+ }
+
+ public void setSelection(Selection sel) {
+ selection = sel;
+ }
+
+ /**
+ * Asks the resolver for the selection criteria matching this constraint's verb and value and
+ * stores it as the selection used by {@code verifyCriteria}.
+ *
+ * @param resolver the resolver providing the selection criteria
+ * @throws InvocationTargetException propagated from the resolver
+ * @throws NoSuchMethodException propagated from the resolver
+ * @throws InstantiationException propagated from the resolver
+ * @throws IllegalAccessException propagated from the resolver
+ */
+ public void setSelection(VerbResolver resolver)
+ throws InvocationTargetException, NoSuchMethodException, InstantiationException,
+ IllegalAccessException {
+ selection = resolver.getSelectionCriteria(verb, value);
+ }
+
+ /**
+ * Applies the stored selection to the given metadata value.
+ * NOTE(review): a selection must have been set beforehand; when it is still null this call
+ * fails with a NullPointerException.
+ *
+ * @param metadata the value to check
+ * @return the outcome of the selection for {@code metadata}
+ */
+ public boolean verifyCriteria(String metadata) {
+ return selection.apply(metadata);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java
new file mode 100644
index 000000000..b56dfaaa3
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java
@@ -0,0 +1,74 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Type;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+/** Created by miriam on 02/08/2018. */
+public class Constraints implements Serializable {
+ private static final Log log = LogFactory.getLog(Constraints.class);
+ // private ConstraintEncapsulator ce;
+ private List constraint;
+
+ public Constraints() {
+ }
+
+ public List getConstraint() {
+ return constraint;
+ }
+
+ public void setConstraint(List constraint) {
+ this.constraint = constraint;
+ }
+
+ /**
+ * Replaces the constraints with the ones deserialized from {@code json}.
+ * NOTE(review): the TypeToken type argument is missing from this listing; presumably it is a
+ * collection of Constraint - confirm against the repository version.
+ *
+ * @param json the JSON representation of the constraints
+ */
+ public void setSc(String json) {
+ Type collectionType = new TypeToken>() {
+ }.getType();
+ constraint = new Gson().fromJson(json, collectionType);
+ }
+
+ /**
+  * Resolves the selection criteria of every constraint through the given resolver.
+  *
+  * A constraint whose criteria cannot be instantiated is reported in the log and skipped, so
+  * the remaining constraints are still resolved.
+  *
+  * @param resolver the resolver providing the selection criteria for each verb
+  */
+ void setSelection(VerbResolver resolver) {
+     for (Constraint st : constraint) {
+         try {
+             st.setSelection(resolver);
+         } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException
+             | InstantiationException e) {
+             // log the exception itself: an InvocationTargetException typically has no message
+             // of its own, the useful information is in its cause
+             log.error("unable to resolve the selection criteria for verb '" + st.getVerb() + "'", e);
+         }
+     }
+ }
+
+ /**
+  * Verifies the constraints against the given values. The constraints are combined with a
+  * logical AND: each of them must be satisfied by at least one value of its field.
+  *
+  * @param param the values to check, grouped by field name
+  * @return {@code true} when every constraint is satisfied; {@code false} as soon as one is
+  *         not, including when {@code param} has no entry for the constrained field
+  */
+ public boolean verifyCriteria(final Map<String, List<String>> param) {
+
+     for (Constraint sc : constraint) {
+         final List<String> values = param.get(sc.getField());
+         if (values == null) {
+             // no value at all for the constrained field: the constraint cannot be satisfied
+             return false;
+         }
+
+         boolean verified = false;
+         for (String value : values) {
+             if (sc.verifyCriteria(value.trim())) {
+                 verified = true;
+                 // one matching value is enough for this constraint
+                 break;
+             }
+         }
+         if (!verified) {
+             return false;
+         }
+     }
+     return true;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java
new file mode 100644
index 000000000..50e1836fa
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java
@@ -0,0 +1,39 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+
+import com.google.gson.Gson;
+
+/**
+ * Serializable holder of two values, exposed as {@code fst} and {@code snd}.
+ * NOTE(review): the members use the type names A and B, so the class was presumably declared
+ * with two type parameters that are missing from this listing - confirm against the repository
+ * version.
+ *
+ * Created by miriam on 03/08/2018.
+ */
+public class Pair implements Serializable {
+ // first component
+ private A fst;
+ // second component
+ private B snd;
+
+ public A getFst() {
+ return fst;
+ }
+
+ // returns this pair, so that calls can be chained
+ public Pair setFst(A fst) {
+ this.fst = fst;
+ return this;
+ }
+
+ public B getSnd() {
+ return snd;
+ }
+
+ // returns this pair, so that calls can be chained
+ public Pair setSnd(B snd) {
+ this.snd = snd;
+ return this;
+ }
+
+ public Pair(A a, B b) {
+ fst = a;
+ snd = b;
+ }
+
+ // serializes the pair with a default Gson instance
+ public String toJson() {
+ return new Gson().toJson(this);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java
new file mode 100644
index 000000000..fd7481719
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java
@@ -0,0 +1,12 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.util.HashMap;
+
+/**
+ * Map type the {@code pathMap} job argument is deserialized into (see SparkBulkTagJob.main).
+ * NOTE(review): the HashMap type arguments appear to be missing from this listing - confirm
+ * against the repository version. HashMap is already Serializable, so the explicit
+ * {@code implements Serializable} only restates it.
+ */
+public class ProtoMap extends HashMap implements Serializable {
+
+ public ProtoMap() {
+ super();
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java
new file mode 100644
index 000000000..b9c37f4dc
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java
@@ -0,0 +1,61 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Node;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+/** Created by miriam on 01/08/2018. */
+public class Provider implements Serializable {
+ private static final Log log = LogFactory.getLog(Provider.class);
+
+ private String openaireId;
+
+ private SelectionConstraints selectionConstraints;
+
+ public SelectionConstraints getSelCriteria() {
+ return selectionConstraints;
+ }
+
+ public SelectionConstraints getSelectionConstraints() {
+ return selectionConstraints;
+ }
+
+ public void setSelectionConstraints(SelectionConstraints selectionConstraints) {
+ this.selectionConstraints = selectionConstraints;
+ }
+
+ public void setSelCriteria(SelectionConstraints selCriteria) {
+ this.selectionConstraints = selCriteria;
+ }
+
+ public String getOpenaireId() {
+ return openaireId;
+ }
+
+ public void setOpenaireId(String openaireId) {
+ this.openaireId = openaireId;
+ }
+
+ private void setSelCriteria(String json, VerbResolver resolver) {
+ log.info("Selection constraints for datasource = " + json);
+ selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class);
+
+ selectionConstraints.setSelection(resolver);
+ }
+
+ public void setSelCriteria(Node n, VerbResolver resolver) {
+ try {
+ setSelCriteria(n.getText(), resolver);
+ } catch (Exception e) {
+ log.info("not set selection criteria... ");
+ selectionConstraints = null;
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java
new file mode 100644
index 000000000..7ec2f916f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.util.List;
+
+import org.dom4j.DocumentException;
+
+import com.google.common.base.Joiner;
+
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+/** Reads the community configuration by running a fixed XQuery through the IS lookup service. */
+public class QueryInformationSystem {
+ private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+ + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
+ + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
+ + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
+ + " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
+ + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " // only contexts of type 'community' or 'ri'
+ + " return "
+ + " "
+ + " { $x//CONFIGURATION/context/@id} "
+ + " "
+ + " {for $y in tokenize($subj,',') " // one entry per comma-separated subject
+ + " return "
+ + " {$y}} "
+ + " "
+ + " "
+ + " {for $d in $datasources "
+ + " where $d/param[./@name='enabled']/text()='true' " // only datasources flagged as enabled
+ + " return "
+ + " "
+ + " "
+ + " {$d//param[./@name='openaireId']/text()} "
+ + " "
+ + " "
+ + " {$d/param[./@name='selcriteria']/text()} "
+ + " "
+ + " } "
+ + " "
+ + " "
+ + " {for $zc in $communities "
+ + " return "
+ + " "
+ + " "
+ + " {$zc/param[./@name='zenodoid']/text()} "
+ + " "
+ + " "
+ + " {$zc/param[./@name='selcriteria']/text()} "
+ + " "
+ + " } "
+ + " "
+ + " ";
+ /** Runs XQUERY on the lookup service at isLookupUrl and builds a CommunityConfiguration from the space-joined results. */
+ public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
+ throws ISLookUpException, DocumentException {
+ ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+ final List res = isLookUp.quickSearchProfile(XQUERY);
+ // NOTE(review): XQUERY's return clause and the two empty literals below show no XML markup in this view; confirm nothing was lost.
+ final String xmlConf = "" + Joiner.on(" ").join(res) + "";
+
+ return CommunityConfigurationFactory.newInstance(xmlConf);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
new file mode 100644
index 000000000..f5a985d15
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
@@ -0,0 +1,247 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.gson.Gson;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+
+/** Adds community contexts to results by matching subjects, datasources and Zenodo communities. Created by miriam on 02/08/2018. */
+public class ResultTagger implements Serializable {
+
+ private String trust = "0.8"; // not read anywhere in this class
+ /** Drops every context whose id contains ZENODO_COMMUNITY_INDICATOR; returns true when at least one was dropped. */
+ private boolean clearContext(Result result) {
+ int tmp = result.getContext().size(); // getContext() is dereferenced without a null check
+ List clist = result
+ .getContext()
+ .stream()
+ .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
+ .collect(Collectors.toList());
+ result.setContext(clist);
+ return (tmp != clist.size());
+ }
+ /** Serializes the result to JSON and evaluates each JsonPath of params on it; a null params is treated as empty. */
+ private Map> getParamMap(final Result result, Map params) {
+ Map> param = new HashMap<>();
+ String json = new Gson().toJson(result, Result.class);
+ DocumentContext jsonContext = JsonPath.parse(json);
+ if (params == null) {
+ params = new HashMap<>();
+ }
+ for (String key : params.keySet()) {
+ try {
+ param.put(key, jsonContext.read(params.get(key)));
+ } catch (com.jayway.jsonpath.PathNotFoundException e) {
+ param.put(key, new ArrayList<>());
+ // not rethrown: a path that is absent from this result maps to an empty list
+ }
+ }
+ return param;
+ }
+ /** Adds to the result the contexts of the communities matched by subject, datasource or Zenodo community; returns the same instance. */
+ public R enrichContextCriteria(
+ final R result, final CommunityConfiguration conf, final Map criteria) {
+
+ // Steps: (1) resolve the JsonPath criteria, (2) short-circuit deleted-by-inference results,
+ // (3) collect the matching community ids, (4) drop the Zenodo contexts, (5) annotate the contexts
+ // already on the result, (6) append new contexts for the communities not yet present.
+ final Map> param = getParamMap(result, criteria);
+
+ // Deleted-by-inference results are not tagged: only their Zenodo contexts are removed before
+ // returning. getDataInfo() is dereferenced without a null check.
+ if (result.getDataInfo().getDeletedbyinference()) {
+ clearContext(result);
+ return result;
+ }
+
+ // communities collects the ids of all the communities to be added as context for the result
+ final Set communities = new HashSet<>();
+
+ // Tagging by subject: non-blank subject values, lower-cased and trimmed, are looked up in the configuration
+ final Set subjects = new HashSet<>();
+ Optional> oresultsubj = Optional.ofNullable(result.getSubject());
+ if (oresultsubj.isPresent()) {
+ oresultsubj
+ .get()
+ .stream()
+ .map(subject -> subject.getValue())
+ .filter(StringUtils::isNotBlank)
+ .map(String::toLowerCase)
+ .map(String::trim)
+ .collect(Collectors.toCollection(HashSet::new))
+ .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
+ }
+
+ communities.addAll(subjects);
+
+ // Tagging by datasource: collectedfrom and hostedby keys of every instance, taken after the first "|"
+ final Set datasources = new HashSet<>();
+ final Set tmp = new HashSet<>(); // NOTE(review): filled in the loop below but never read
+
+ Optional> oresultinstance = Optional.ofNullable(result.getInstance());
+ if (oresultinstance.isPresent()) {
+ for (Instance i : oresultinstance.get()) {
+ tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
+ tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
+ }
+ // getCollectedfrom() and getHostedby() are dereferenced without null checks, here and in the loop above
+ oresultinstance
+ .get()
+ .stream()
+ .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
+ .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
+ .map(s -> StringUtils.substringAfter(s, "|"))
+ .collect(Collectors.toCollection(HashSet::new))
+ .forEach(
+ dsId -> datasources
+ .addAll(
+ conf.getCommunityForDatasource(dsId, param)));
+ }
+
+ communities.addAll(datasources);
+
+ /* Tagging by Zenodo community: the last path segment of each Zenodo context id is looked up */
+ final Set czenodo = new HashSet<>();
+
+ Optional> oresultcontext = Optional.ofNullable(result.getContext());
+ if (oresultcontext.isPresent()) {
+ oresultcontext
+ .get()
+ .stream()
+ .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
+ .collect(Collectors.toList())
+ .forEach(
+ c -> czenodo
+ .addAll(
+ conf
+ .getCommunityForZenodoCommunityValue(
+ c
+ .getId()
+ .substring(
+ c.getId().lastIndexOf("/") + 1)
+ .trim())));
+ }
+
+ communities.addAll(czenodo);
+
+ clearContext(result); // NOTE(review): throws if getContext() is null, although the block above tolerates null
+
+ /* Nothing matched: the result is returned with only the Zenodo contexts removed */
+ if (communities.isEmpty()) {
+ return result;
+ }
+ // Annotate the contexts already on the result; the stream runs only for its side effects (the list is discarded)
+ result
+ .getContext()
+ .stream()
+ .map(
+ c -> {
+ if (communities.contains(c.getId())) {
+ Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo());
+ List dataInfoList;
+ if (opt_dataInfoList.isPresent())
+ dataInfoList = opt_dataInfoList.get();
+ else {
+ dataInfoList = new ArrayList<>();
+ c.setDataInfo(dataInfoList);
+ }
+ if (subjects.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_SUBJECT,
+ CLASS_NAME_BULKTAG_SUBJECT));
+ if (datasources.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_DATASOURCE,
+ CLASS_NAME_BULKTAG_DATASOURCE));
+ if (czenodo.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_CZENODO,
+ CLASS_NAME_BULKTAG_ZENODO));
+ }
+ return c;
+ })
+ .collect(Collectors.toList());
+ // Keep only the communities that are not yet present as a context of the result
+ communities
+ .removeAll(
+ result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));
+
+ if (communities.isEmpty())
+ return result;
+ // Build one new context per remaining community, with one DataInfo for each source that matched it
+ List toaddcontext = communities
+ .stream()
+ .map(
+ c -> {
+ Context context = new Context();
+ context.setId(c);
+ List dataInfoList = new ArrayList<>();
+ if (subjects.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_SUBJECT,
+ CLASS_NAME_BULKTAG_SUBJECT));
+ if (datasources.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_DATASOURCE,
+ CLASS_NAME_BULKTAG_DATASOURCE));
+ if (czenodo.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_CZENODO,
+ CLASS_NAME_BULKTAG_ZENODO));
+ context.setDataInfo(dataInfoList);
+ return context;
+ })
+ .collect(Collectors.toList());
+
+ result.getContext().addAll(toaddcontext);
+ return result;
+ }
+ /** Builds a DataInfo marked as inferred, with the given inference provenance and a provenance-action qualifier. */
+ public static DataInfo getDataInfo(
+ String inference_provenance, String inference_class_id, String inference_class_name) {
+ DataInfo di = new DataInfo();
+ di.setInferred(true);
+ di.setInferenceprovenance(inference_provenance);
+ di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
+ return di;
+ }
+ /** Builds a Qualifier with the given class id and name; scheme id and scheme name are both DNET_PROVENANCE_ACTIONS. */
+ public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
+ Qualifier pa = new Qualifier();
+ pa.setClassid(inference_class_id);
+ pa.setClassname(inference_class_name);
+ pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
+ pa.setSchemename(DNET_PROVENANCE_ACTIONS);
+ return pa;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java
new file mode 100644
index 000000000..71ff61d1b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java
@@ -0,0 +1,51 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.lang.reflect.Type;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+/** A list of Constraints evaluated in OR: satisfied as soon as one element is satisfied. */
+public class SelectionConstraints implements Serializable {
+ private List criteria; // iterated without null checks by verifyCriteria and setSelection
+
+ public SelectionConstraints() {
+ }
+ /** @return the list of constraints, possibly null */
+ public List getCriteria() {
+ return criteria;
+ }
+ /** Replaces the list of constraints. */
+ public void setCriteria(List criteria) {
+ this.criteria = criteria;
+ }
+ /** Parses the JSON with Gson, using the type described by the TypeToken, and stores the result as the criteria. */
+ public void setSc(String json) {
+ Type collectionType = new TypeToken>() {
+ }.getType();
+ criteria = new Gson().fromJson(json, collectionType);
+ }
+
+ // Constraints are in OR: the first element that verifies the parameters makes the whole check succeed
+ public boolean verifyCriteria(final Map> param) {
+ for (Constraints selc : criteria) {
+ if (selc.verifyCriteria(param)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ /** Passes the resolver to every constraint of the list. */
+ public void setSelection(VerbResolver resolver) {
+
+ for (Constraints cs : criteria) {
+ cs.setSelection(resolver);
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java
new file mode 100644
index 000000000..3cdc7c941
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java
@@ -0,0 +1,17 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+/** String literals shared by the bulk-tagging code. */
+public class TaggingConstants {
+ // Substring that marks a context id as coming from a Zenodo community.
+ public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
+ // Inference provenance label used for bulk tagging.
+ public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";
+
+ // Class id / class name pairs, one pair per tagging source: subject, datasource, Zenodo community.
+ public static final String CLASS_ID_SUBJECT = "community:subject";
+ public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
+ public static final String CLASS_ID_DATASOURCE = "community:datasource";
+ public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
+ public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
+ public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java
new file mode 100644
index 000000000..bc6b75fba
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java
@@ -0,0 +1,45 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+
+import org.dom4j.Node;
+
+import com.google.gson.Gson;
+
+/** A Zenodo community id with optional selection constraints. Created by miriam on 01/08/2018. */
+public class ZenodoCommunity implements Serializable {
+
+ private String zenodoCommunityId;
+ // Constraints attached to the community; null when the source node was null.
+ private SelectionConstraints selCriteria;
+ /** @return the Zenodo community identifier */
+ public String getZenodoCommunityId() {
+ return zenodoCommunityId;
+ }
+ /** Sets the Zenodo community identifier. */
+ public void setZenodoCommunityId(String zenodoCommunityId) {
+ this.zenodoCommunityId = zenodoCommunityId;
+ }
+ /** @return the selection constraints, possibly null */
+ public SelectionConstraints getSelCriteria() {
+ return selCriteria;
+ }
+ /** Replaces the selection constraints. */
+ public void setSelCriteria(SelectionConstraints selCriteria) {
+ this.selCriteria = selCriteria;
+ }
+ /** Parses the JSON into SelectionConstraints with Gson; no resolver is applied to the parsed object here. */
+ private void setSelCriteria(String json) {
+ // The target type is the SelectionConstraints class itself, not a collection type.
+ selCriteria = new Gson().fromJson(json, SelectionConstraints.class);
+ }
+ /** Sets the constraints from the text of the node, or clears them when the node is null; parse errors propagate. */
+ public void setSelCriteria(Node n) {
+ if (n == null) {
+ selCriteria = null;
+ } else {
+ setSelCriteria(n.getText());
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java
new file mode 100644
index 000000000..496630fa3
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "contains": true when the value contains the parameter (case-sensitive). */
+@VerbClass("contains")
+public class ContainsVerb implements Selection, Serializable {
+
+ private String param;
+
+ public ContainsVerb() {
+ }
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public ContainsVerb(final String param) {
+ this.param = param;
+ }
+ /** Neither value nor param is null-checked: a null on either side makes String.contains throw. */
+ @Override
+ public boolean apply(String value) {
+ return value.contains(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java
new file mode 100644
index 000000000..a4a6f5663
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "contains_ignorecase": case-insensitive substring test. */
+@VerbClass("contains_ignorecase")
+public class ContainsVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public ContainsVerbIgnoreCase() {
+ }
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public ContainsVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+ /** Lower-cases both sides with Locale.ROOT so the outcome does not depend on the JVM default locale; nulls are not checked. */
+ @Override
+ public boolean apply(String value) {
+ return value.toLowerCase(java.util.Locale.ROOT).contains(param.toLowerCase(java.util.Locale.ROOT));
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java
new file mode 100644
index 000000000..b9088d012
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "equals": true when the value equals the parameter (case-sensitive). */
+@VerbClass("equals")
+public class EqualVerb implements Selection, Serializable {
+
+ private String param;
+
+ public EqualVerb() {
+ }
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public EqualVerb(final String param) {
+ this.param = param;
+ }
+ /** The value is not null-checked; a null param simply yields false. */
+ @Override
+ public boolean apply(String value) {
+ return value.equals(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java
new file mode 100644
index 000000000..c5f0ce070
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "equals_ignorecase": true when value and parameter are equal ignoring case. */
+@VerbClass("equals_ignorecase")
+public class EqualVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public EqualVerbIgnoreCase() {
+ }
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public EqualVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+ /** The value is not null-checked; a null param simply yields false. */
+ @Override
+ public boolean apply(String value) {
+ return value.equalsIgnoreCase(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java
new file mode 100644
index 000000000..e9b948b2b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.lang.reflect.Type;
+
+import com.google.gson.*;
+/** Gson adapter that writes an object as {CLASSNAME, DATA} and reads it back by loading the named class. */
+public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {
+
+ private static final String CLASSNAME = "CLASSNAME";
+ private static final String DATA = "DATA";
+ /** Reads the CLASSNAME member, loads that class and deserializes the DATA member as an instance of it. */
+ public Object deserialize(
+ JsonElement jsonElement,
+ Type type,
+ JsonDeserializationContext jsonDeserializationContext)
+ throws JsonParseException {
+
+ JsonObject jsonObject = jsonElement.getAsJsonObject();
+ JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME);
+ String className = prim.getAsString();
+ Class klass = getObjectClass(className);
+ return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass);
+ }
+ /** Wraps the serialized object in a JSON object that also records the runtime class name. */
+ public JsonElement serialize(
+ Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) {
+ JsonObject jsonObject = new JsonObject();
+ jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName());
+ jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement));
+ return jsonObject;
+ }
+
+ /** Loads the class to be deserialized; a class that cannot be found is reported as a JsonParseException. */
+ public Class getObjectClass(String className) {
+ try {
+ return Class.forName(className);
+ } catch (ClassNotFoundException e) {
+ // keep the original exception as the cause so the stack trace is not lost
+ throw new JsonParseException(e.getMessage(), e);
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java
new file mode 100644
index 000000000..03ec9804b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "not_contains": true when the value does not contain the parameter (case-sensitive). */
+@VerbClass("not_contains")
+public class NotContainsVerb implements Selection, Serializable {
+
+ private String param;
+
+ public NotContainsVerb() {
+ }
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public NotContainsVerb(final String param) {
+ this.param = param;
+ }
+ /** Neither value nor param is null-checked: a null on either side makes String.contains throw. */
+ @Override
+ public boolean apply(String value) {
+ return !value.contains(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java
new file mode 100644
index 000000000..b21be83f0
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "not_contains_ignorecase": negated case-insensitive substring test. */
+@VerbClass("not_contains_ignorecase")
+public class NotContainsVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public NotContainsVerbIgnoreCase() {
+ }
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public NotContainsVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+ /** Lower-cases both sides with Locale.ROOT so the outcome does not depend on the JVM default locale; nulls are not checked. */
+ @Override
+ public boolean apply(String value) {
+ return !(value.toLowerCase(java.util.Locale.ROOT).contains(param.toLowerCase(java.util.Locale.ROOT)));
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java
new file mode 100644
index 000000000..86bf00012
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "not_equals": true when the value differs from the parameter (case-sensitive). */
+@VerbClass("not_equals")
+public class NotEqualVerb implements Selection, Serializable {
+
+ private String param;
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public NotEqualVerb(final String param) {
+ this.param = param;
+ }
+
+ public NotEqualVerb() {
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+ /** The value is not null-checked; a null param simply yields true. */
+ @Override
+ public boolean apply(String value) {
+ return !value.equals(param);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java
new file mode 100644
index 000000000..c6958a641
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+/** Selection registered under the verb name "not_equals_ignorecase": true when value and parameter differ ignoring case. */
+@VerbClass("not_equals_ignorecase")
+public class NotEqualVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+ /** Single-String constructor; VerbResolver looks up a constructor with this signature reflectively. */
+ public NotEqualVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+
+ public NotEqualVerbIgnoreCase() {
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+ /** The value is not null-checked; a null param simply yields true. */
+ @Override
+ public boolean apply(String value) {
+ return !value.equalsIgnoreCase(param);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java
new file mode 100644
index 000000000..ec9fb716d
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java
@@ -0,0 +1,7 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+/** A predicate over a single String value, implemented by the verb classes of this package. */
+public interface Selection {
+ /** @return true when the given value satisfies this selection */
+ boolean apply(String value);
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java
new file mode 100644
index 000000000..5b35919bd
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java
@@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+/** Runtime-retained, type-level annotation carrying the verb name of a class; VerbResolver scans for it. */
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+@interface VerbClass {
+ /** The verb name under which the annotated class is registered. */
+ String value();
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java
new file mode 100644
index 000000000..3d0db2063
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import io.github.classgraph.ClassGraph;
+import io.github.classgraph.ClassInfo;
+import io.github.classgraph.ClassInfoList;
+import io.github.classgraph.ScanResult;
+/** Maps verb names to the classes annotated with VerbClass found in the criteria package, and instantiates them. */
+public class VerbResolver implements Serializable {
+ private Map> map = null; // verb name -> annotated class; stays null if the scan in the constructor fails
+ private final ClassGraph classgraph = new ClassGraph(); // NOTE(review): field of a Serializable class; confirm it serializes if instances are ever serialized
+
+ public VerbResolver() {
+
+ try (ScanResult scanResult = // closed automatically at the end of the try block
+ classgraph // the ClassGraph instance held by this object
+ .verbose() // switches on ClassGraph's verbose logging
+ .enableAllInfo() // Scan classes, methods, fields, annotations
+ .whitelistPackages(
+ "eu.dnetlib.dhp.bulktag.criteria") // restrict the scan to the criteria package
+ .scan()) { // Perform the scan and return a ScanResult
+
+ ClassInfoList routeClassInfoList = scanResult
+ .getClassesWithAnnotation(
+ "eu.dnetlib.dhp.bulktag.criteria.VerbClass");
+ // Key: first parameter value of the FIRST annotation on the class; value: the loaded class.
+ this.map = routeClassInfoList
+ .stream()
+ .collect(
+ Collectors
+ .toMap(
+ value -> (String) ((ClassInfo) value)
+ .getAnnotationInfo()
+ .get(0)
+ .getParameterValues()
+ .get(0)
+ .getValue(),
+ value -> (Class) ((ClassInfo) value).loadClass()));
+ } catch (Exception e) {
+ e.printStackTrace(); // NOTE(review): failures are only printed, leaving map null for later calls
+ }
+ }
+ /** Instantiates the class registered under name through its single-String constructor, passing param. */
+ public Selection getSelectionCriteria(String name, String param)
+ throws NoSuchMethodException, IllegalAccessException, InvocationTargetException,
+ InstantiationException {
+
+ // map.get(name) is not null-checked: an unregistered verb name (or a failed scan) ends in a NullPointerException
+ return map.get(name).getDeclaredConstructor((String.class)).newInstance(param);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java
new file mode 100644
index 000000000..0bb801999
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java
@@ -0,0 +1,10 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+/** Factory for VerbResolver instances. */
+public class VerbResolverFactory {
+ /** @return a new VerbResolver; every call runs the resolver's constructor again */
+ public static VerbResolver newInstance() {
+
+ return new VerbResolver();
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java
new file mode 100644
index 000000000..271cc6bb3
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+/** Serializable bean holding the class id and class name of a country; used as the country of DatasourceCountry. */
+public class CountrySbs implements Serializable {
+ private String classid;
+ private String classname;
+ /** @return the class id of the country */
+ public String getClassid() {
+ return classid;
+ }
+ /** Sets the class id of the country. */
+ public void setClassid(String classid) {
+ this.classid = classid;
+ }
+ /** @return the class name of the country */
+ public String getClassname() {
+ return classname;
+ }
+ /** Sets the class name of the country. */
+ public void setClassname(String classname) {
+ this.classname = classname;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java
new file mode 100644
index 000000000..642192f73
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+/** Serializable bean pairing a datasource id with its country; PrepareDatasourceCountryAssociation encodes query rows as this bean. */
+public class DatasourceCountry implements Serializable {
+ private String dataSourceId;
+ private CountrySbs country;
+ /** @return the datasource identifier */
+ public String getDataSourceId() {
+ return dataSourceId;
+ }
+ /** Sets the datasource identifier. */
+ public void setDataSourceId(String dataSourceId) {
+ this.dataSourceId = dataSourceId;
+ }
+ /** @return the country associated to the datasource */
+ public CountrySbs getCountry() {
+ return country;
+ }
+ /** Sets the country associated to the datasource. */
+ public void setCountry(CountrySbs country) {
+ this.country = country;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java
new file mode 100644
index 000000000..e91a1e48a
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java
@@ -0,0 +1,121 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+/**
+ * Prepares the association between datasources and countries. The association is computed only for datasources of
+ * specific types or having whitelisted ids. The country is registered in the Organization associated to the
+ * Datasource, so the relation between Datasource and Organization is exploited to get the country of the datasource.
+ */
+public class PrepareDatasourceCountryAssociation {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);
+ /** Parses the arguments, configures the Hive metastore URI and runs the preparation inside a Spark Hive session. */
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareDatasourceCountryAssociation.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+ // The output directory is removed first; whitelist and allowedtypes are ';'-separated lists.
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ removeOutputDir(spark, outputPath);
+ prepareDatasourceCountryAssociation(
+ spark,
+ Arrays.asList(parser.get("whitelist").split(";")),
+ Arrays.asList(parser.get("allowedtypes").split(";")),
+ inputPath,
+ outputPath);
+ });
+ }
+ /** Joins datasource, relation and organization and writes (dataSourceId, country) rows as gzip-compressed JSON. */
+ private static void prepareDatasourceCountryAssociation(
+ SparkSession spark,
+ List whitelist,
+ List allowedtypes,
+ String inputPath,
+ String outputPath) {
+ StringBuilder whitelisted = new StringBuilder();
+ for (String i : whitelist) {
+ whitelisted.append(" OR id = '").append(i).append("'");
+ }
+ // NOTE(review): ids and types are concatenated into the SQL text unescaped; they must come from trusted configuration.
+ Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
+ Dataset relation = readPath(spark, inputPath + "/relation", Relation.class);
+ Dataset organization = readPath(spark, inputPath + "/organization", Organization.class);
+
+ datasource.createOrReplaceTempView("datasource");
+ relation.createOrReplaceTempView("relation");
+ organization.createOrReplaceTempView("organization");
+ // The selected column names match the bean properties of DatasourceCountry and CountrySbs.
+ String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
+ + "FROM ( SELECT id "
+ + " FROM datasource "
+ + " WHERE (datainfo.deletedbyinference = false "
+ + whitelisted
+ + ") "
+ + getConstraintList("datasourcetype.classid = '", allowedtypes)
+ + ") d "
+ + "JOIN ( SELECT source, target "
+ + " FROM relation "
+ + " WHERE relclass = '"
+ + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
+ + "' "
+ + " AND datainfo.deletedbyinference = false ) rel "
+ + "ON d.id = rel.source "
+ + "JOIN (SELECT id, country "
+ + " FROM organization "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND length(country.classid) > 0) o "
+ + "ON o.id = rel.target";
+
+ spark
+ .sql(query)
+ .as(Encoders.bean(DatasourceCountry.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputPath);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
new file mode 100644
index 000000000..34b376413
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
@@ -0,0 +1,98 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+ public class PrepareResultCountrySet {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
+ // Groups by id the countries of the datasources that match either the cf or the hb column of the cfhb view.
+ private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
+ + "FROM ( SELECT id, country "
+ + "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
+ + "UNION ALL "
+ + "SELECT id, country FROM datasource_country "
+ + "JOIN cfhb ON hb = dataSourceId ) tmp "
+ + "GROUP BY id";
+ // Loads the parameter specification from the classpath, parses args and runs getPotentialResultToUpdate through runWithSparkHiveSession.
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCountrySet.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String datasourcecountrypath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", datasourcecountrypath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ getPotentialResultToUpdate(
+ spark,
+ inputPath,
+ outputPath,
+ datasourcecountrypath,
+ resultClazz);
+ });
+ }
+ // Registers the views used by RESULT_COUNTRYSET_QUERY, runs it and writes the outcome as gzip-compressed JSON to outputPath.
+ private static void getPotentialResultToUpdate(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ String datasourcecountrypath,
+ Class resultClazz) {
+
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ result.createOrReplaceTempView("result");
+ // log.info("number of results: {}", result.count());
+ createCfHbforResult(spark); // presumably registers the cfhb view read by the query - confirm in PropagationConstant
+
+ Dataset datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
+
+ datasource_country.createOrReplaceTempView("datasource_country");
+ // log.info("datasource_country number : {}", datasource_country.count());
+
+ spark
+ .sql(RESULT_COUNTRYSET_QUERY)
+ .as(Encoders.bean(ResultCountrySet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Append) // Append keeps what outputPath already holds; NOTE(review): presumably one run per result type - confirm
+ .json(outputPath);
+ }
+
+ }
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java
new file mode 100644
index 000000000..8c29424f2
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultCountrySet implements Serializable {
+	private String resultId; // identifier of the result the countries refer to
+	private ArrayList countrySet; // countries collected for that result
+
+	public String getResultId() {
+		return this.resultId;
+	}
+
+	public ArrayList getCountrySet() {
+		return this.countrySet;
+	}
+
+	public void setResultId(String resultId) {
+		this.resultId = resultId;
+	}
+
+	public void setCountrySet(ArrayList countrySet) {
+		this.countrySet = countrySet;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
new file mode 100644
index 000000000..9dc17701b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
@@ -0,0 +1,132 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Country;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import scala.Tuple2;
+
+public class SparkCountryPropagationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class);
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+ // Loads the parameter specification from the classpath, parses args and runs execPropagation through runWithSparkSession.
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkCountryPropagationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String sourcePath = parser.get("sourcePath");
+ log.info("sourcePath: {}", sourcePath);
+
+ String preparedInfoPath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", preparedInfoPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+ // saveGraph defaults to true when the parameter is absent
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> execPropagation(
+ spark,
+ sourcePath,
+ preparedInfoPath,
+ outputPath,
+ resultClazz,
+ saveGraph));
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String sourcePath,
+ String preparedInfoPath,
+ String outputPath,
+ Class resultClazz,
+ boolean saveGraph) {
+ // Does nothing unless saveGraph is true; otherwise joins the results with the prepared country sets and overwrites outputPath.
+ if (saveGraph) {
+ // updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
+ log.info("Reading Graph table from: {}", sourcePath);
+ Dataset res = readPath(spark, sourcePath, resultClazz);
+
+ log.info("Reading prepared info: {}", preparedInfoPath);
+ Dataset prepared = spark
+ .read()
+ .json(preparedInfoPath)
+ .as(Encoders.bean(ResultCountrySet.class));
+ // left_outer: results without a prepared country set are kept and reach the merge function with a null right side
+ res
+ .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
+ .map(getCountryMergeFn(), Encoders.bean(resultClazz))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputPath);
+ }
+ }
+
+	private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
+		return (MapFunction<Tuple2<R, ResultCountrySet>, R>) t -> {
+			if (t._2() != null) { // null when the left_outer join found no prepared country set: result returned as is
+				t._1().setCountry(Optional.ofNullable(t._1().getCountry()).orElseGet(ArrayList::new)); // never addAll on a missing list
+				t._1().getCountry().addAll(merge(t._1().getCountry(), t._2().getCountrySet()));
+			}
+			return t._1();
+		};
+	}
+ private static List merge(List c1, List c2) { // returns only the additions; c1 itself is not modified here
+ HashSet countries = c1
+ .stream()
+ .map(c -> c.getClassid())
+ .collect(Collectors.toCollection(HashSet::new));
+ // elements of c2 whose classid is already among those of c1 are dropped; the others are converted through getCountry
+ return c2
+ .stream()
+ .filter(c -> !countries.contains(c.getClassid()))
+ .map(c -> getCountry(c.getClassid(), c.getClassname()))
+ .collect(Collectors.toList());
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java
new file mode 100644
index 000000000..a5fcab360
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+public class AutoritativeAuthor implements java.io.Serializable {
+	// Name parts and ORCID of an author. Serializable like ResultOrcidList, whose author list holds these objects.
+	private String name;
+	private String surname;
+	private String fullname;
+	private String orcid; // ORCID value to propagate
+
+	public String getName() {
+		return name;
+	}
+
+	public void setName(String name) {
+		this.name = name;
+	}
+
+	public String getSurname() {
+		return surname;
+	}
+
+	public void setSurname(String surname) {
+		this.surname = surname;
+	}
+
+	public String getFullname() {
+		return fullname;
+	}
+
+	public void setFullname(String fullname) {
+		this.fullname = fullname;
+	}
+
+	public String getOrcid() {
+		return orcid;
+	}
+
+	public void setOrcid(String orcid) {
+		this.orcid = orcid;
+	}
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
new file mode 100644
index 000000000..3e16b4b4b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
@@ -0,0 +1,125 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+ public class PrepareResultOrcidAssociationStep1 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);
+ // Loads the parameter specification from the classpath, parses args, derives the input/output paths and runs prepareInfo.
+ public static void main(String[] args) throws Exception {
+ String jsonConf = IOUtils
+ .toString(
+ PrepareResultOrcidAssociationStep1.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+ log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+ // resultType is the lower-cased simple name of the result class; it names the input and output sub-folders below
+ final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
+ log.info("resultType: {}", resultType);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ String inputRelationPath = inputPath + "/relation";
+ log.info("inputRelationPath: {}", inputRelationPath);
+
+ String inputResultPath = inputPath + "/" + resultType;
+ log.info("inputResultPath: {}", inputResultPath);
+
+ String outputResultPath = outputPath + "/" + resultType;
+ log.info("outputResultPath: {}", outputResultPath);
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ prepareInfo(
+ spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
+ });
+ }
+ // For the selected relations, pairs the relation target with the ORCID-carrying authors of the source result and writes the pairs as gzip JSON.
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputRelationPath,
+ String inputResultPath,
+ String outputResultPath,
+ Class resultClazz,
+ List allowedsemrel) {
+
+ Dataset relation = readPath(spark, inputRelationPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+
+ log.info("Reading Graph table from: {}", inputResultPath);
+ Dataset result = readPath(spark, inputResultPath, resultClazz);
+ result.createOrReplaceTempView("result");
+ // r_t: per result id, the set of its authors having a pid with qualifier classid 'ORCID'; rel_rel: the non-deleted relations kept by the allowedsemrel constraint
+ String query = " select target resultId, author authorList"
+ + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
+ + " from ( "
+ + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
+ + " from result "
+ + " lateral view explode (author) a as MyT "
+ + " lateral view explode (MyT.pid) p as MyP "
+ + " where MyP.qualifier.classid = 'ORCID') tmp "
+ + " group by id) r_t "
+ + " join ("
+ + " select source, target "
+ + " from relation "
+ + " where datainfo.deletedbyinference = false "
+ + getConstraintList(" relclass = '", allowedsemrel)
+ + ") rel_rel "
+ + " on source = id";
+ spark
+ .sql(query)
+ .as(Encoders.bean(ResultOrcidList.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputResultPath);
+ }
+
+ }
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java
new file mode 100644
index 000000000..65d8811bc
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java
@@ -0,0 +1,97 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import scala.Tuple2;
+
+ public class PrepareResultOrcidAssociationStep2 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class);
+ // used to serialize the merged records to JSON text
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ // Loads the parameter specification from the classpath, parses args and runs mergeInfo through runWithSparkSession.
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultOrcidAssociationStep2.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ SparkConf conf = new SparkConf();
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ mergeInfo(spark, inputPath, outputPath);
+ });
+ }
+ // Unions the ResultOrcidList data of the four sub-folders of inputPath, keeps one record per resultId and writes gzip-compressed JSON text.
+ private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {
+
+ Dataset resultOrcidAssoc = readPath(spark, inputPath + "/publication", ResultOrcidList.class)
+ .union(readPath(spark, inputPath + "/dataset", ResultOrcidList.class))
+ .union(readPath(spark, inputPath + "/otherresearchproduct", ResultOrcidList.class))
+ .union(readPath(spark, inputPath + "/software", ResultOrcidList.class));
+
+ resultOrcidAssoc
+ .toJavaRDD()
+ .mapToPair(r -> new Tuple2<>(r.getResultId(), r))
+ .reduceByKey(
+ (a, b) -> {
+ if (a == null) {
+ return b;
+ }
+ if (b == null) {
+ return a;
+ }
+ Set orcid_set = new HashSet<>(); // ORCIDs already present in a
+ a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
+ b
+ .getAuthorList()
+ .stream()
+ .forEach(
+ aa -> {
+ if (!orcid_set.contains(aa.getOrcid())) { // add to a only the authors of b bringing a new ORCID
+ a.getAuthorList().add(aa);
+ orcid_set.add(aa.getOrcid());
+ }
+ });
+ return a;
+ })
+ .map(c -> c._2())
+ .map(r -> OBJECT_MAPPER.writeValueAsString(r))
+ .saveAsTextFile(outputPath, GzipCodec.class);
+ }
+
+ }
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java
new file mode 100644
index 000000000..54b415d1c
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java
@@ -0,0 +1,27 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+public class ResultOrcidList implements Serializable {
+	String resultId; // id of the result the authors below can be propagated to
+	List<AutoritativeAuthor> authorList = new ArrayList<>(); // typed so that callers can read the ORCID of each element
+
+	public String getResultId() {
+		return resultId;
+	}
+
+	public void setResultId(String resultId) {
+		this.resultId = resultId;
+	}
+
+	public List<AutoritativeAuthor> getAuthorList() {
+		return authorList;
+	}
+
+	public void setAuthorList(List<AutoritativeAuthor> authorList) {
+		this.authorList = authorList;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java
new file mode 100644
index 000000000..ebb75a5a6
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java
@@ -0,0 +1,199 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import scala.Tuple2;
+
+public class SparkOrcidToResultFromSemRelJob {
+ private static final Logger log = LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob.class);
+
+ public static void main(String[] args) throws Exception { // entry point: reads the parameters and runs execPropagation
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkOrcidToResultFromSemRelJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json"));
+ // parse the command line against the parameter specification loaded above
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ final String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String possibleUpdates = parser.get("possibleUpdatesPath");
+ log.info("possibleUpdatesPath: {}", possibleUpdates);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+ // saveGraph defaults to true when the parameter is absent
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ if (saveGraph) // nothing is written when saveGraph is false
+ execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String possibleUpdatesPath,
+ String inputPath,
+ String outputPath,
+ Class resultClazz) {
+ // Joins the results with their possible ORCID updates, enriches the authors and overwrites outputPath with gzip-compressed JSON.
+ // read possible updates (resultId and list of possible orcid to add)
+ Dataset possible_updates = readPath(spark, possibleUpdatesPath, ResultOrcidList.class);
+ // read the result we have been considering
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ // left_outer join: results without possible updates are kept and reach authorEnrichFn with a null right side
+
+ result
+ .joinWith(
+ possible_updates,
+ result.col("id").equalTo(possible_updates.col("resultId")),
+ "left_outer")
+ .map(authorEnrichFn(), Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+	/** Mapper for the left_outer join: tries to enrich, with the possible updates, the authors lacking an allowed pid. */
+	private static <R extends Result> MapFunction<Tuple2<R, ResultOrcidList>, R> authorEnrichFn() {
+		return (MapFunction<Tuple2<R, ResultOrcidList>, R>) value -> {
+			R ret = value._1();
+			ResultOrcidList rol = value._2();
+			// no possible update for this result, or no author list to enrich: return the result untouched
+			if (rol == null || ret.getAuthor() == null) {
+				return ret;
+			}
+			for (Author author : ret.getAuthor()) {
+				if (!containsAllowedPid(author)) {
+					enrichAuthor(author, rol.getAuthorList());
+				}
+			}
+			return ret;
+		};
+	}
+
+ private static void enrichAuthor(Author a, List au) { // tries the candidates of au, in order, against the author a
+ for (AutoritativeAuthor aa : au) {
+ if (enrichAuthor(aa, a)) {
+ return; // stop at the first candidate whose ORCID has been added
+ }
+ }
+ }
+
+	/**
+	 * Adds the ORCID of {@code autoritative_author} to {@code author} when the two look like the same person.
+	 * The ORCID is stored as a new pid carrying the propagation qualifier and data info.
+	 * <p>
+	 * Both surnames and both names must be non-empty. The surnames must be equal ignoring case and surrounding
+	 * whitespace; the names must either be equal in the same way or start with the same letter, so that a name
+	 * written as an initial in one of the two records still matches.
+	 *
+	 * @param autoritative_author author providing the ORCID
+	 * @param author author to enrich; its pid list is created when it is null
+	 * @return true when the ORCID has been added to {@code author}, false otherwise
+	 */
+	private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
+		// both records need a surname and a name to be comparable
+		if (!StringUtils.isNoneEmpty(autoritative_author.getSurname(), author.getSurname())
+			|| !StringUtils.isNoneEmpty(autoritative_author.getName(), author.getName())) {
+			return false;
+		}
+
+		if (!autoritative_author.getSurname().trim().equalsIgnoreCase(author.getSurname().trim())) {
+			return false;
+		}
+
+		String authoritativeName = autoritative_author.getName().trim();
+		String name = author.getName().trim();
+		// Same surname: accept equal names, or names sharing the first letter (initials in one of the two).
+		// regionMatches compares exactly one character, ignoring case, and is false when either name is
+		// empty after trimming, so it cannot throw the way substring(0, 1) would on an empty string.
+		boolean sameName = authoritativeName.equalsIgnoreCase(name)
+			|| authoritativeName.regionMatches(true, 0, name, 0, 1);
+		if (!sameName) {
+			return false;
+		}
+
+		// build the pid carrying the propagated ORCID
+		StructuredProperty p = new StructuredProperty();
+		p.setValue(autoritative_author.getOrcid());
+		p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID));
+		p
+			.setDataInfo(
+				getDataInfo(
+					PROPAGATION_DATA_INFO_TYPE,
+					PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID,
+					PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME));
+
+		// append to the existing pid list, or create it when the author has none
+		if (author.getPid() != null) {
+			author.getPid().add(p);
+		} else {
+			author.setPid(Lists.newArrayList(p));
+		}
+
+		return true;
+	}
+
+	// True when the author has a pid whose qualifier classid is PROPAGATION_AUTHOR_PID; pids without qualifier are skipped.
+	private static boolean containsAllowedPid(Author a) {
+		if (a.getPid() == null) {
+			return false;
+		}
+		for (StructuredProperty pid : a.getPid()) {
+			if (pid.getQualifier() != null && PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) {
+				return true;
+			}
+		}
+		return false;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java
new file mode 100644
index 000000000..05dcdc692
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java
@@ -0,0 +1,126 @@
+
+package eu.dnetlib.dhp.projecttoresult;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.PropagationConstant.getConstraintList;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class PrepareProjectResultsAssociation {
+	private static final Logger log = LoggerFactory.getLogger(PrepareProjectResultsAssociation.class); // log under this class
+
+	public static void main(String[] args) throws Exception {
+		// Loads the parameter specification from the classpath, parses args and runs prepareResultProjProjectResults.
+		String jsonConfiguration = IOUtils
+			.toString(
+				PrepareProjectResultsAssociation.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+		parser.parseArgument(args);
+
+		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		String inputPath = parser.get("sourcePath");
+		log.info("inputPath: {}", inputPath);
+
+		final String potentialUpdatePath = parser.get("potentialUpdatePath");
+		log.info("potentialUpdatePath: {}", potentialUpdatePath);
+
+		String alreadyLinkedPath = parser.get("alreadyLinkedPath");
+		log.info("alreadyLinkedPath: {}", alreadyLinkedPath);
+		// allowedsemrels is a ';'-separated list
+		final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+		log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+		SparkConf conf = new SparkConf();
+		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+		runWithSparkHiveSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				prepareResultProjProjectResults(
+					spark,
+					inputPath,
+					potentialUpdatePath,
+					alreadyLinkedPath,
+					allowedsemrel);
+			});
+	}
+
+ private static void prepareResultProjProjectResults(
+ SparkSession spark,
+ String inputPath,
+ String potentialUpdatePath,
+ String alreadyLinkedPath,
+ List allowedsemrel) {
+ // Writes two gzip-compressed JSON outputs of ResultProjectSet: the potential updates and the already existing result-project links.
+ Dataset relation = readPath(spark, inputPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+
+ String resproj_relation_query = "SELECT source, target "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND relClass = '"
+ + RELATION_RESULT_PROJECT_REL_CLASS
+ + "'";
+
+ Dataset resproj_relation = spark.sql(resproj_relation_query);
+ resproj_relation.createOrReplaceTempView("resproj_relation");
+ // potential updates: the target of each selected relation r1 gets the projects linked (in resproj_relation) to the source of r1
+ String potential_update_query = "SELECT resultId, collect_set(projectId) projectSet "
+ + "FROM ( "
+ + "SELECT r1.target resultId, r2.target projectId "
+ + " FROM (SELECT source, target "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + getConstraintList(" relClass = '", allowedsemrel)
+ + " ) r1"
+ + " JOIN resproj_relation r2 "
+ + " ON r1.source = r2.source "
+ + " ) tmp "
+ + "GROUP BY resultId ";
+
+ spark
+ .sql(potential_update_query)
+ .as(Encoders.bean(ResultProjectSet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(potentialUpdatePath);
+ // already linked: for each source of resproj_relation, the set of its targets
+ String result_projectset_query = "SELECT source resultId, collect_set(target) projectSet "
+ + "FROM resproj_relation "
+ + "GROUP BY source";
+
+ spark
+ .sql(result_projectset_query)
+ .as(Encoders.bean(ResultProjectSet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(alreadyLinkedPath);
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java
new file mode 100644
index 000000000..1d5280874
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.projecttoresult;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultProjectSet implements Serializable {
+	private String resultId; // identifier of the result the projects refer to
+	private ArrayList projectSet; // projects collected for that result
+
+	public String getResultId() {
+		return this.resultId;
+	}
+
+	public ArrayList getProjectSet() {
+		return this.projectSet;
+	}
+
+	public void setResultId(String resultId) {
+		this.resultId = resultId;
+	}
+
+	public void setProjectSet(ArrayList project) {
+		this.projectSet = project;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java
new file mode 100644
index 000000000..36694b3dd
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java
@@ -0,0 +1,147 @@
+
+package eu.dnetlib.dhp.projecttoresult;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+public class SparkResultToProjectThroughSemRelJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkResultToProjectThroughSemRelJob.class); // logger named after this job
+
+ public static void main(String[] args) throws Exception {
+ // Entry point: load the bundled parameter spec, parse args, then run the propagation in a Spark session.
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkResultToProjectThroughSemRelJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String potentialUpdatePath = parser.get("potentialUpdatePath");
+ log.info("potentialUpdatePath: {}", potentialUpdatePath);
+
+ final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
+ log.info("alreadyLinkedPath: {}", alreadyLinkedPath);
+ // Boolean.valueOf yields false for null or any text other than "true", which leaves the write disabled.
+ final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
+ log.info("saveGraph: {}", saveGraph);
+
+ SparkConf conf = new SparkConf();
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ execPropagation(
+ spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String outputPath,
+ String alreadyLinkedPath,
+ String potentialUpdatePath,
+ Boolean saveGraph) {
+ // Joins candidate links with existing ones on resultId and appends the derived relations to outputPath.
+ Dataset toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
+ Dataset alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);
+ // Nothing is written when saveGraph is false.
+ if (saveGraph) {
+ toaddrelations
+ .joinWith(
+ alreadyLinked,
+ toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")),
+ "left_outer") // keep candidates that have no already-linked counterpart
+ .flatMap(mapRelationRn(), Encoders.bean(Relation.class))
+ .write()
+ .mode(SaveMode.Append) // adds to whatever outputPath already holds
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+ }
+
+ private static FlatMapFunction, Relation> mapRelationRn() {
+ return (FlatMapFunction, Relation>) value -> { // turns one joined pair into the relations still to add
+ List new_relations = new ArrayList<>();
+ ResultProjectSet potential_update = value._1();
+ Optional already_linked = Optional.ofNullable(value._2()); // second element may be null
+ if (already_linked.isPresent()) {
+ already_linked
+ .get()
+ .getProjectSet()
+ .stream()
+ .forEach(
+ (p -> {
+ if (potential_update
+ .getProjectSet()
+ .contains(p)) {
+ potential_update.getProjectSet().remove(p); // drop projects that are already linked
+ }
+ }));
+ }
+ String resId = potential_update.getResultId();
+ potential_update
+ .getProjectSet()
+ .stream()
+ .forEach(
+ projectId -> { // two relations per remaining project, one in each direction
+ new_relations
+ .add(
+ getRelation(
+ resId, // result as first argument, project as second
+ projectId,
+ RELATION_RESULT_PROJECT_REL_CLASS,
+ RELATION_RESULTPROJECT_REL_TYPE,
+ RELATION_RESULTPROJECT_SUBREL_TYPE,
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
+ new_relations
+ .add(
+ getRelation(
+ projectId, // arguments swapped for the opposite direction
+ resId,
+ RELATION_PROJECT_RESULT_REL_CLASS,
+ RELATION_RESULTPROJECT_REL_TYPE,
+ RELATION_RESULTPROJECT_SUBREL_TYPE,
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
+ });
+ return new_relations.iterator();
+ };
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java
new file mode 100644
index 000000000..7d786058a
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java
@@ -0,0 +1,21 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+public class OrganizationMap extends HashMap> {
+ // HashMap specialisation whose String-keyed lookup never yields null.
+ public OrganizationMap() {
+ super();
+ }
+
+ /** Returns the value stored under key, or a new empty list (not added to the map) when there is none. */
+ public List get(String key) {
+ // Null-safe lookup: a missing (or null-valued) key yields an empty list.
+ return super.get(key) == null
+ ? new ArrayList<>()
+ : super.get(key);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java
new file mode 100644
index 000000000..e2d4d5687
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java
@@ -0,0 +1,130 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class PrepareResultCommunitySet {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class);
+
+ public static void main(String[] args) throws Exception { // entry point of the preparation step
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCommunitySet.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json"));
+ // Parse the command line against the parameter spec loaded above.
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+ // The organization map arrives as a JSON string argument and is deserialized with Gson.
+ final OrganizationMap organizationMap = new Gson()
+ .fromJson(
+ parser.get("organizationtoresultcommunitymap"),
+ OrganizationMap.class);
+ log.info("organizationMap: {}", new Gson().toJson(organizationMap));
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+ // Run the preparation through runWithSparkHiveSession.
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ prepareInfo(spark, inputPath, outputPath, organizationMap);
+ });
+ }
+
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ OrganizationMap organizationMap) {
+ // Expose the relations read from inputPath to Spark SQL under the name "relation".
+ Dataset relation = readPath(spark, inputPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+ // One row per result-organization relation; merges holds the targets the second sub-query collects for that organization.
+ String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges "
+ + "FROM (SELECT source, target "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND relClass = '"
+ + RELATION_RESULT_ORGANIZATION_REL_CLASS
+ + "') result_organization "
+ + "LEFT JOIN (SELECT source, collect_set(target) org_set "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND relClass = '"
+ + RELATION_REPRESENTATIVERESULT_RESULT_CLASS
+ + "' "
+ + " GROUP BY source) organization_organization "
+ + "ON result_organization.target = organization_organization.source ";
+
+ Dataset result_organizationset = spark
+ .sql(query)
+ .as(Encoders.bean(ResultOrganizations.class));
+ // Map each row to its community list and overwrite outputPath with gzip-compressed JSON.
+ result_organizationset
+ .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
+ .filter(Objects::nonNull)
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ private static MapFunction mapResultCommunityFn(
+ OrganizationMap organizationMap) {
+ return (MapFunction) value -> { // builds the community list for one (result, organization) row
+ String rId = value.getResultId();
+ Optional> orgs = Optional.ofNullable(value.getMerges()); // merges may be null
+ String oTarget = value.getOrgId();
+ Set communitySet = new HashSet<>(); // set semantics remove duplicate communities
+ if (organizationMap.containsKey(oTarget)) {
+ communitySet.addAll(organizationMap.get(oTarget)); // communities mapped to the organization itself
+ }
+ if (orgs.isPresent())
+ for (String oId : orgs.get()) {
+ if (organizationMap.containsKey(oId)) {
+ communitySet.addAll(organizationMap.get(oId)); // communities mapped to each organization in merges
+ }
+ }
+ if (communitySet.size() > 0) {
+ ResultCommunityList rcl = new ResultCommunityList();
+ rcl.setResultId(rId);
+ ArrayList communityList = new ArrayList<>();
+ communityList.addAll(communitySet);
+ rcl.setCommunityList(communityList);
+ return rcl;
+ }
+ return null; // no community matched any of the organizations
+ };
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java
new file mode 100644
index 000000000..e3275745d
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultCommunityList implements Serializable { // bean: one result id plus a list of communities
+ private String resultId; // identifier of the result
+ private ArrayList communityList; // communities associated with resultId
+ /** Returns the result identifier. */
+ public String getResultId() {
+ return resultId;
+ }
+ /** Sets the result identifier. */
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+ /** Returns the stored community list itself (no defensive copy). */
+ public ArrayList getCommunityList() {
+ return communityList;
+ }
+ /** Replaces the community list with the given one (stored by reference). */
+ public void setCommunityList(ArrayList communityList) {
+ this.communityList = communityList;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java
new file mode 100644
index 000000000..3ea9d41d6
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java
@@ -0,0 +1,35 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultOrganizations implements Serializable { // bean: a result, one organization, and that organization's merges list
+ private String resultId; // identifier of the result
+ private String orgId; // identifier of the organization
+ private ArrayList merges; // further organization ids tied to orgId
+ /** Returns the result identifier. */
+ public String getResultId() {
+ return resultId;
+ }
+ /** Sets the result identifier. */
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+ /** Returns the organization identifier. */
+ public String getOrgId() {
+ return orgId;
+ }
+ /** Sets the organization identifier. */
+ public void setOrgId(String orgId) {
+ this.orgId = orgId;
+ }
+ /** Returns the stored merges list itself (no defensive copy). */
+ public ArrayList getMerges() {
+ return merges;
+ }
+ /** Replaces the merges list with the given one (stored by reference). */
+ public void setMerges(ArrayList merges) {
+ this.merges = merges;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java
new file mode 100644
index 000000000..71275cc7f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java
@@ -0,0 +1,137 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+import scala.Tuple2;
+
+public class SparkResultToCommunityFromOrganizationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob.class);
+
+ public static void main(String[] args) throws Exception { // entry point of the propagation job
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkResultToCommunityFromOrganizationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json"));
+ // Parse the command line against the parameter spec loaded above.
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String possibleupdatespath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", possibleupdatespath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+ // saveGraph falls back to true when parser.get returns null.
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+ // resultTableName is a class name resolved reflectively; Class.forName throws if it cannot be loaded.
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ if (saveGraph) // nothing is written when saveGraph is false
+ execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ Class resultClazz,
+ String possibleUpdatesPath) {
+ // Joins every result with its prepared community list (if any) and rewrites the results to outputPath.
+ Dataset possibleUpdates = readPath(spark, possibleUpdatesPath, ResultCommunityList.class);
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ // Left outer join: results without a prepared list still flow through the map function.
+ result
+ .joinWith(
+ possibleUpdates,
+ result.col("id").equalTo(possibleUpdates.col("resultId")),
+ "left_outer")
+ .map(resultCommunityFn(), Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite) // replaces any existing content at outputPath
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ private static MapFunction, R> resultCommunityFn() {
+ return (MapFunction, R>) value -> { // adds contexts for communities the result does not carry yet
+ R ret = value._1();
+ Optional rcl = Optional.ofNullable(value._2()); // second element may be null
+ if (rcl.isPresent()) {
+ ArrayList communitySet = rcl.get().getCommunityList();
+ List contextList = ret // NOTE(review): assumes ret.getContext() is non-null; confirm
+ .getContext()
+ .stream()
+ .map(con -> con.getId())
+ .collect(Collectors.toList()); // ids of the contexts already on the result
+ Result res = new Result(); // carrier holding only the id and the new contexts
+ res.setId(ret.getId());
+ List propagatedContexts = new ArrayList<>();
+ for (String cId : communitySet) {
+ if (!contextList.contains(cId)) { // skip communities already present
+ Context newContext = new Context();
+ newContext.setId(cId);
+ newContext
+ .setDataInfo(
+ Arrays
+ .asList(
+ getDataInfo(
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID,
+ PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME)));
+ propagatedContexts.add(newContext);
+ }
+ }
+ res.setContext(propagatedContexts);
+ ret.mergeFrom(res); // hands the new contexts to ret; mergeFrom is defined outside this file
+ }
+ return ret; // returned unchanged when no community list was joined
+ };
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
new file mode 100644
index 000000000..4f5ac2552
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
@@ -0,0 +1,167 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+public class PrepareResultCommunitySetStep1 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class);
+
+ private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')"
+ + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']"
+ + " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'"
+ + " return $x//CONFIGURATION/context/@id/string()";
+
+ /**
+ * Associates each result with the set of community contexts it belongs to, then associates each target of a
+ * relation with allowed semantics with the set of community contexts it could inherit from the source of the
+ * relation.
+ */
+ // TODO
+ private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
+ + "from (select id, collect_set(co.id) community_context "
+ + " from result "
+ + " lateral view explode (context) c as co "
+ + " where datainfo.deletedbyinference = false %s group by id) p "
+ + " JOIN "
+ + " (select source, target from relation "
+ + " where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";
+
+ /**
+ * A dataset, for example, could be linked to more than one publication. For each publication linked to that
+ * dataset, the previous query produces a row (targetId, set of community contexts the target could inherit).
+ * The following query collapses them into a single row for each result linked to more than one result of the
+ * result type currently being used.
+ */
+ // TODO
+ private static final String RESULT_COMMUNITY_LIST_QUERY = "select resultId , collect_set(co) communityList "
+ + "from result_context "
+ + "lateral view explode (community_context) c as co "
+ + "where length(co) > 0 "
+ + "group by resultId";
+
+ public static void main(String[] args) throws Exception { // entry point of preparation step 1
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCommunitySetStep1.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json"));
+ // Parse the command line against the parameter spec loaded above.
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+ // allowedsemrels is a single ';'-separated string.
+ final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+ log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+ final String isLookupUrl = parser.get("isLookUpUrl");
+ log.info("isLookupUrl: {}", isLookupUrl);
+ // The community ids are fetched from the lookup service before the Spark session is started.
+ final List communityIdList = getCommunityList(isLookupUrl);
+ log.info("communityIdList: {}", new Gson().toJson(communityIdList));
+ // resultType is the lower-cased simple class name (text after the last '.').
+ final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
+ log.info("resultType: {}", resultType);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ prepareInfo(
+ spark,
+ inputPath,
+ outputPath,
+ allowedsemrel,
+ resultClazz,
+ resultType,
+ communityIdList);
+ });
+ }
+
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ List allowedsemrel,
+ Class resultClazz,
+ String resultType,
+ List communityIdList) {
+ // Results are read from inputPath/resultType and relations from inputPath/relation.
+ final String inputResultPath = inputPath + "/" + resultType;
+ log.info("Reading Graph table from: {}", inputResultPath);
+
+ final String inputRelationPath = inputPath + "/relation";
+ log.info("Reading relation table from: {}", inputRelationPath);
+
+ Dataset relation = readPath(spark, inputRelationPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+
+ Dataset result = readPath(spark, inputResultPath, resultClazz);
+ result.createOrReplaceTempView("result");
+
+ final String outputResultPath = outputPath + "/" + resultType;
+ log.info("writing output results to: {}", outputResultPath);
+ // Fill the template's two placeholders: the community-id constraint and the relClass constraint.
+ String resultContextQuery = String
+ .format(
+ RESULT_CONTEXT_QUERY_TEMPLATE,
+ getConstraintList(" co.id = '", communityIdList),
+ getConstraintList(" relClass = '", allowedsemrel));
+
+ Dataset result_context = spark.sql(resultContextQuery);
+ result_context.createOrReplaceTempView("result_context");
+ // Collapse result_context to one row per resultId and overwrite outputResultPath with gzip JSON.
+ spark
+ .sql(RESULT_COMMUNITY_LIST_QUERY)
+ .as(Encoders.bean(ResultCommunityList.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputResultPath);
+ }
+
+ public static List getCommunityList(final String isLookupUrl) throws ISLookUpException {
+ // Runs COMMUNITY_LIST_XQUERY on the lookup service obtained for isLookupUrl and returns its answer.
+ return ISLookupClientFactory.getLookUpService(isLookupUrl).quickSearchProfile(COMMUNITY_LIST_XQUERY);
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java
new file mode 100644
index 000000000..723aa8960
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java
@@ -0,0 +1,101 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import scala.Tuple2;
+
+public class PrepareResultCommunitySetStep2 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep2.class);
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+ // Entry point of preparation step 2: parse the arguments, then merge the per-type outputs.
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCommunitySetStep2.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ SparkConf conf = new SparkConf();
+ // This step uses runWithSparkSession and sets no extra configuration on conf.
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ mergeInfo(spark, inputPath, outputPath);
+ });
+ }
+
+ private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {
+ // Union the four per-type inputs, then keep a single merged community list per resultId.
+ Dataset resultOrcidAssocCommunityList = readPath(
+ spark, inputPath + "/publication", ResultCommunityList.class)
+ .union(readPath(spark, inputPath + "/dataset", ResultCommunityList.class))
+ .union(readPath(spark, inputPath + "/otherresearchproduct", ResultCommunityList.class))
+ .union(readPath(spark, inputPath + "/software", ResultCommunityList.class));
+ // NOTE(review): the variable name mentions "Orcid", yet the rows are ResultCommunityList beans.
+ resultOrcidAssocCommunityList
+ .toJavaRDD()
+ .mapToPair(r -> new Tuple2<>(r.getResultId(), r))
+ .reduceByKey(
+ (a, b) -> {
+ if (a == null) {
+ return b;
+ }
+ if (b == null) {
+ return a;
+ }
+ Set community_set = new HashSet<>(); // communities seen so far for this key
+ a.getCommunityList().stream().forEach(aa -> community_set.add(aa));
+ b
+ .getCommunityList()
+ .stream()
+ .forEach(
+ aa -> {
+ if (!community_set.contains(aa)) {
+ a.getCommunityList().add(aa); // a is extended in place with b's unseen communities
+ community_set.add(aa);
+ }
+ });
+ return a;
+ })
+ .map(c -> c._2())
+ .map(r -> OBJECT_MAPPER.writeValueAsString(r)) // one JSON document per merged record
+ .saveAsTextFile(outputPath, GzipCodec.class);
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java
new file mode 100644
index 000000000..0c613d1b4
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java
@@ -0,0 +1,143 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import eu.dnetlib.dhp.schema.oaf.*;
+import scala.Tuple2;
+
+public class SparkResultToCommunityThroughSemRelJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityThroughSemRelJob.class);
+
+ public static void main(String[] args) throws Exception {
+ // Entry point: parse the arguments, then run the propagation unless saveGraph is false.
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkResultToCommunityThroughSemRelJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String preparedInfoPath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", preparedInfoPath);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+ // saveGraph falls back to true when parser.get returns null.
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+ // resultTableName is a class name resolved reflectively; Class.forName throws if it cannot be loaded.
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ if (saveGraph) {
+ execPropagation(
+ spark, inputPath, outputPath, preparedInfoPath, resultClazz);
+ }
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ String preparedInfoPath,
+ Class resultClazz) {
+ // Joins every result with its prepared community list (if any) and rewrites the results to outputPath.
+ Dataset possibleUpdates = readPath(spark, preparedInfoPath, ResultCommunityList.class);
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ // Left outer join: results without a prepared list still flow through the map function.
+ result
+ .joinWith(
+ possibleUpdates,
+ result.col("id").equalTo(possibleUpdates.col("resultId")),
+ "left_outer")
+ .map(contextUpdaterFn(), Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite) // replaces any existing content at outputPath
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ private static MapFunction, R> contextUpdaterFn() {
+ return (MapFunction, R>) value -> { // adds contexts for communities the result does not carry yet
+ R ret = value._1();
+ Optional rcl = Optional.ofNullable(value._2()); // second element may be null
+ if (rcl.isPresent()) {
+ Set context_set = new HashSet<>(); // ids of the contexts already on the result
+ ret.getContext().stream().forEach(c -> context_set.add(c.getId())); // NOTE(review): assumes getContext() is non-null; confirm
+ List contextList = rcl
+ .get()
+ .getCommunityList()
+ .stream()
+ .map(
+ c -> {
+ if (!context_set.contains(c)) { // only communities not yet present yield a Context
+ Context newContext = new Context();
+ newContext.setId(c);
+ newContext
+ .setDataInfo(
+ Arrays
+ .asList(
+ getDataInfo(
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID,
+ PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME)));
+ return newContext;
+ }
+ return null; // placeholder removed by the filter below
+ })
+ .filter(Objects::nonNull)
+ .collect(Collectors.toList());
+ Result r = new Result(); // carrier holding only the id and the new contexts
+ r.setId(ret.getId());
+ r.setContext(contextList);
+ ret.mergeFrom(r); // hands the new contexts to ret; mergeFrom is defined outside this file
+ }
+ // Returned unchanged when no community list was joined.
+ return ret;
+ };
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java
new file mode 100644
index 000000000..e6b13dfa4
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
+
+import java.io.Serializable;
+
+public class DatasourceOrganization implements Serializable {
+ // Serializable bean holding one datasource id together with one organization id.
+ private String datasourceId; // identifier of the datasource
+ private String organizationId; // identifier of the organization
+ /** Returns the datasource identifier. */
+ public String getDatasourceId() {
+ return datasourceId;
+ }
+ /** Sets the datasource identifier. */
+ public void setDatasourceId(String datasourceId) {
+ this.datasourceId = datasourceId;
+ }
+ /** Returns the organization identifier. */
+ public String getOrganizationId() {
+ return organizationId;
+ }
+ /** Sets the organization identifier. */
+ public void setOrganizationId(String organizationId) {
+ this.organizationId = organizationId;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java
new file mode 100644
index 000000000..f8fe1668f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java
@@ -0,0 +1,142 @@
+
+package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Spark job that prepares the inputs of the result-to-organization propagation: the
+ * datasource/organization pairs for institutional repositories and the result/organization links
+ * already present among the relations.
+ */
+public class PrepareResultInstRepoAssociation {
+
+    private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class);
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /**
+     * Parses the arguments, points Spark at the Hive metastore and runs the preparation steps.
+     *
+     * @param args command line arguments, as described by {@code input_prepareresultorg_parameters.json}
+     * @throws Exception if the parameter file cannot be read, the arguments cannot be parsed or the job fails
+     */
+    public static void main(String[] args) throws Exception {
+
+        String jsonConfiguration = IOUtils
+            .toString(
+                PrepareResultInstRepoAssociation.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json"));
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+        parser.parseArgument(args);
+
+        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+        String inputPath = parser.get("sourcePath");
+        log.info("inputPath: {}", inputPath);
+
+        final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath");
+        log.info("datasourceOrganizationPath: {}", datasourceOrganizationPath);
+
+        final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
+        log.info("alreadyLinkedPath: {}", alreadyLinkedPath);
+
+        SparkConf conf = new SparkConf();
+        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+        runWithSparkHiveSession(
+            conf,
+            isSparkSessionManaged,
+            spark -> {
+                readNeededResources(spark, inputPath);
+                prepareDatasourceOrganization(spark, datasourceOrganizationPath);
+                prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
+            });
+    }
+
+    /**
+     * Stores, as gzip-compressed JSON text lines, each relation source together with the set of its
+     * targets, considering only non-deleted relations of class RELATION_RESULT_ORGANIZATION_REL_CLASS.
+     */
+    private static void prepareAlreadyLinkedAssociation(
+        SparkSession spark, String alreadyLinkedPath) {
+        String query = "Select source resultId, collect_set(target) organizationSet "
+            + "from relation "
+            + "where datainfo.deletedbyinference = false "
+            + "and relClass = '"
+            + RELATION_RESULT_ORGANIZATION_REL_CLASS
+            + "' "
+            + "group by source";
+
+        spark
+            .sql(query)
+            .as(Encoders.bean(ResultOrganizationSet.class))
+            // TODO retry to stick with datasets
+            .toJavaRDD()
+            .map(r -> OBJECT_MAPPER.writeValueAsString(r))
+            .saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
+    }
+
+    /** Registers the datasource, relation and organization inputs as temporary views for the SQL queries. */
+    private static void readNeededResources(SparkSession spark, String inputPath) {
+        Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
+        datasource.createOrReplaceTempView("datasource");
+
+        Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
+        relation.createOrReplaceTempView("relation");
+
+        Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
+        organization.createOrReplaceTempView("organization");
+    }
+
+    /**
+     * Stores, as gzip-compressed JSON overwriting previous content, the datasource/organization pairs
+     * joining non-deleted INSTITUTIONAL_REPO_TYPE datasources with their non-deleted organization relations.
+     */
+    private static void prepareDatasourceOrganization(
+        SparkSession spark, String datasourceOrganizationPath) {
+
+        String query = "SELECT source datasourceId, target organizationId "
+            + "FROM ( SELECT id "
+            + "FROM datasource "
+            + "WHERE datasourcetype.classid = '"
+            + INSTITUTIONAL_REPO_TYPE
+            + "' "
+            + "AND datainfo.deletedbyinference = false ) d "
+            + "JOIN ( SELECT source, target "
+            + "FROM relation "
+            + "WHERE relclass = '"
+            + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
+            + "' "
+            + "AND datainfo.deletedbyinference = false ) rel "
+            + "ON d.id = rel.source ";
+
+        spark
+            .sql(query)
+            .as(Encoders.bean(DatasourceOrganization.class))
+            .write()
+            .mode(SaveMode.Overwrite)
+            .option("compression", "gzip")
+            .json(datasourceOrganizationPath);
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java
new file mode 100644
index 000000000..3bce14cdb
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java
@@ -0,0 +1,31 @@
+
+package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+/**
+ * Serializable bean associating a result identifier with a collection of organization identifiers.
+ */
+public class ResultOrganizationSet implements Serializable {
+    /** Identifier of the result. */
+    private String resultId;
+    /** Identifiers of the organizations associated with the result. */
+    private ArrayList<String> organizationSet;
+
+    public String getResultId() {
+        return resultId;
+    }
+
+    public void setResultId(String resultId) {
+        this.resultId = resultId;
+    }
+
+    public ArrayList<String> getOrganizationSet() {
+        return organizationSet;
+    }
+
+    public void setOrganizationSet(ArrayList<String> organizationSet) {
+        this.organizationSet = organizationSet;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java
new file mode 100644
index 000000000..86634d43f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java
@@ -0,0 +1,211 @@
+
+package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.broadcast.Broadcast;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+import scala.Tuple2;
+
+/**
+ * Spark job that derives new result/organization relations: it joins the {@code cfhb} view with the
+ * prepared datasource/organization pairs and emits, in both directions, only the links that are not
+ * already listed for the result.
+ */
+public class SparkResultToOrganizationFromIstRepoJob {
+
+    private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromIstRepoJob.class);
+
+    /** Collects, per result id, the organizations reached by matching either cf or hb of cfhb against rels. */
+    private static final String RESULT_ORGANIZATIONSET_QUERY = "SELECT id resultId, collect_set(organizationId) organizationSet "
+        + "FROM ( SELECT id, organizationId "
+        + "FROM rels "
+        + "JOIN cfhb "
+        + " ON cf = datasourceId "
+        + "UNION ALL "
+        + "SELECT id , organizationId "
+        + "FROM rels "
+        + "JOIN cfhb "
+        + " ON hb = datasourceId ) tmp "
+        + "GROUP BY id";
+
+    /**
+     * Parses the arguments and, when {@code saveGraph} is true (the default), runs the propagation.
+     *
+     * @param args command line arguments, as described by the bundled JSON parameter file
+     * @throws Exception if the arguments cannot be parsed, the result class is not found or the job fails
+     */
+    public static void main(String[] args) throws Exception {
+
+        String jsonConfiguration = IOUtils
+            .toString(
+                SparkResultToOrganizationFromIstRepoJob.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json"));
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+        parser.parseArgument(args);
+
+        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+        String inputPath = parser.get("sourcePath");
+        log.info("inputPath: {}", inputPath);
+
+        final String outputPath = parser.get("outputPath");
+        log.info("outputPath: {}", outputPath);
+
+        final String datasourceorganization = parser.get("datasourceOrganizationPath");
+        log.info("datasourceOrganizationPath: {}", datasourceorganization);
+
+        final String alreadylinked = parser.get("alreadyLinkedPath");
+        log.info("alreadyLinkedPath: {}", alreadylinked);
+
+        final String resultClassName = parser.get("resultTableName");
+        log.info("resultTableName: {}", resultClassName);
+
+        final Boolean saveGraph = Optional
+            .ofNullable(parser.get("saveGraph"))
+            .map(Boolean::valueOf)
+            .orElse(Boolean.TRUE);
+        log.info("saveGraph: {}", saveGraph);
+
+        // resultTableName is expected to name a Result subclass; the cast cannot be checked at runtime
+        @SuppressWarnings("unchecked")
+        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
+
+        SparkConf conf = new SparkConf();
+        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+        runWithSparkHiveSession(
+            conf,
+            isSparkSessionManaged,
+            spark -> {
+                if (isTest(parser)) {
+                    removeOutputDir(spark, outputPath);
+                }
+                if (saveGraph) {
+                    execPropagation(
+                        spark,
+                        datasourceorganization,
+                        alreadylinked,
+                        inputPath,
+                        outputPath,
+                        resultClazz);
+                }
+            });
+    }
+
+    /**
+     * Left-joins the potential result/organization sets with the already linked ones on resultId and
+     * appends the relations produced by {@link #createRelationFn()} to outputPath as gzip-compressed JSON.
+     */
+    private static void execPropagation(
+        SparkSession spark,
+        String datasourceorganization,
+        String alreadyLinkedPath,
+        String inputPath,
+        String outputPath,
+        Class<? extends Result> clazz) {
+
+        Dataset<DatasourceOrganization> ds_org = readPath(spark, datasourceorganization, DatasourceOrganization.class);
+
+        Dataset<ResultOrganizationSet> potentialUpdates = getPotentialRelations(spark, inputPath, clazz, ds_org);
+
+        Dataset<ResultOrganizationSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultOrganizationSet.class);
+
+        potentialUpdates
+            .joinWith(
+                alreadyLinked,
+                potentialUpdates.col("resultId").equalTo(alreadyLinked.col("resultId")),
+                "left_outer")
+            .flatMap(createRelationFn(), Encoders.bean(Relation.class))
+            .write()
+            .mode(SaveMode.Append)
+            .option("compression", "gzip")
+            .json(outputPath);
+    }
+
+    /**
+     * Builds the function that turns a (potential update, already linked) pair into relations: for each
+     * organization of the potential update not already linked, one relation is emitted per direction.
+     */
+    private static FlatMapFunction<Tuple2<ResultOrganizationSet, ResultOrganizationSet>, Relation> createRelationFn() {
+        return value -> {
+            List<Relation> newRelations = new ArrayList<>();
+            ResultOrganizationSet potentialUpdate = value._1();
+            List<String> organizationList = potentialUpdate.getOrganizationSet();
+            // value._2() is null when the left outer join found no already linked set for the result
+            Optional
+                .ofNullable(value._2())
+                .ifPresent(
+                    linked -> linked
+                        .getOrganizationSet()
+                        .forEach(linkedOrgId -> organizationList.remove(linkedOrgId)));
+            String resultId = potentialUpdate.getResultId();
+            for (String orgId : organizationList) {
+                newRelations
+                    .add(
+                        getRelation(
+                            orgId,
+                            resultId,
+                            RELATION_ORGANIZATION_RESULT_REL_CLASS,
+                            RELATION_RESULTORGANIZATION_REL_TYPE,
+                            RELATION_RESULTORGANIZATION_SUBREL_TYPE,
+                            PROPAGATION_DATA_INFO_TYPE,
+                            PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
+                            PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
+                newRelations
+                    .add(
+                        getRelation(
+                            resultId,
+                            orgId,
+                            RELATION_RESULT_ORGANIZATION_REL_CLASS,
+                            RELATION_RESULTORGANIZATION_REL_TYPE,
+                            RELATION_RESULTORGANIZATION_SUBREL_TYPE,
+                            PROPAGATION_DATA_INFO_TYPE,
+                            PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
+                            PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
+            }
+            return newRelations.iterator();
+        };
+    }
+
+    /**
+     * Registers the results and the datasource/organization pairs as the "result" and "rels" views and
+     * runs RESULT_ORGANIZATIONSET_QUERY; createCfHbforResult presumably registers the cfhb view it needs.
+     */
+    private static <R extends Result> Dataset<ResultOrganizationSet> getPotentialRelations(
+        SparkSession spark,
+        String inputPath,
+        Class<R> resultClazz,
+        Dataset<DatasourceOrganization> ds_org) {
+
+        Dataset<R> result = readPath(spark, inputPath, resultClazz);
+        result.createOrReplaceTempView("result");
+        createCfHbforResult(spark);
+
+        ds_org.createOrReplaceTempView("rels");
+
+        return spark
+            .sql(RESULT_ORGANIZATIONSET_QUERY)
+            .as(Encoders.bean(ResultOrganizationSet.class));
+    }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json
new file mode 100644
index 000000000..a37d7d168
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json
@@ -0,0 +1,51 @@
+[
+ {
+ "paramName":"is",
+ "paramLongName":"isLookUpUrl",
+ "paramDescription": "URL of the isLookUp Service",
+ "paramRequired": true
+ },
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pm",
+ "paramLongName":"pathMap",
+ "paramDescription": "the json path associated to each selection field",
+ "paramRequired": true
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "test",
+ "paramLongName": "isTest",
+ "paramDescription": "true if it is executing a test",
+ "paramRequired": false
+ },
+ {
+ "paramName": "tg",
+ "paramLongName": "taggingConf",
+ "paramDescription": "the tagging configuration",
+ "paramRequired": false
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml
new file mode 100644
index 000000000..fe82ae194
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml
@@ -0,0 +1,54 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml
new file mode 100644
index 000000000..754aba4f2
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml
@@ -0,0 +1,216 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ isLookUpUrl
+ the isLookup service endpoint
+
+
+ pathMap
+ the json path associated to each selection field
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-publication
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/publication
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/publication
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-dataset
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/dataset
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/dataset
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-orp
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/otherresearchproduct
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/otherresearchproduct
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-software
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/software
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/software
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json
new file mode 100644
index 000000000..984b40774
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json
@@ -0,0 +1,44 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": false
+ },
+ {
+ "paramName":"sg",
+ "paramLongName":"saveGraph",
+ "paramDescription": "true if the new version of the graph must be saved",
+ "paramRequired": false
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "p",
+ "paramLongName": "preparedInfoPath",
+ "paramDescription": "the path where prepared info have been stored",
+ "paramRequired": false
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json
new file mode 100644
index 000000000..95d4c1c60
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json
@@ -0,0 +1,38 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "w",
+ "paramLongName": "whitelist",
+ "paramDescription": "the datasource having a type different from the allowed ones but that we want to add anyway",
+ "paramRequired": true
+ },
+ {
+ "paramName": "at",
+ "paramLongName": "allowedtypes",
+ "paramDescription": "the allowed datasource types for country propagation",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json
new file mode 100644
index 000000000..5efa3dbd6
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json
@@ -0,0 +1,38 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"out",
+ "paramLongName":"outputPath",
+ "paramDescription": "the output path",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "p",
+ "paramLongName": "preparedInfoPath",
+ "paramDescription": "the path where prepared info have been stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml
new file mode 100644
index 000000000..2744ea92b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml
@@ -0,0 +1,58 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
+ spark2MaxExecutors
+ 50
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
new file mode 100644
index 000000000..fc877071d
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
@@ -0,0 +1,376 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ whitelist
+ the white list
+
+
+ allowedtypes
+ the allowed types
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ PrepareDatasourceCountryAssociation
+ eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}
+ --whitelist${whitelist}
+ --allowedtypes${allowedtypes}
+ --hive_metastore_uris${hive_metastore_uris}
+ --outputPath${workingDir}/preparedInfo
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ prepareResultCountry-Publication
+ eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/publication
+ --outputPath${workingDir}/publication
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --preparedInfoPath${workingDir}/preparedInfo
+
+
+
+
+
+
+
+ yarn
+ cluster
+ prepareResultCountry-Dataset
+ eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/dataset
+ --outputPath${workingDir}/dataset
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --preparedInfoPath${workingDir}/preparedInfo
+
+
+
+
+
+
+
+ yarn
+ cluster
+ prepareResultCountry-ORP
+ eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/otherresearchproduct
+ --outputPath${workingDir}/otherresearchproduct
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --preparedInfoPath${workingDir}/preparedInfo
+
+
+
+
+
+
+
+ yarn
+ cluster
+ prepareResultCountry-Software
+ eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/software
+ --outputPath${workingDir}/software
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --preparedInfoPath${workingDir}/preparedInfo
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ countryPropagationForPublications
+ eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/publication
+ --preparedInfoPath${workingDir}/publication
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/publication
+
+
+
+
+
+
+
+ yarn
+ cluster
+ countryPropagationForDataset
+ eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/dataset
+ --preparedInfoPath${workingDir}/dataset
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/dataset
+
+
+
+
+
+
+
+ yarn
+ cluster
+ countryPropagationForORP
+ eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/otherresearchproduct
+ --preparedInfoPath${workingDir}/otherresearchproduct
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/otherresearchproduct
+
+
+
+
+
+
+
+ yarn
+ cluster
+ countryPropagationForSoftware
+ eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/software
+ --preparedInfoPath${workingDir}/software
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/software
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json
new file mode 100644
index 000000000..d8aa7eb9a
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json
@@ -0,0 +1,50 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"sg",
+ "paramLongName":"saveGraph",
+ "paramDescription": "true if the new version of the graph must be saved",
+ "paramRequired": false
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName":"pu",
+ "paramLongName":"possibleUpdatesPath",
+ "paramDescription": "the path where the association resultId orcid author list can be found",
+ "paramRequired": true
+ },
+ {
+ "paramName":"test",
+ "paramLongName":"isTest",
+ "paramDescription": "true if it is executing a test",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json
new file mode 100644
index 000000000..08648d61a
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json
@@ -0,0 +1,38 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"as",
+ "paramLongName":"allowedsemrels",
+ "paramDescription": "the allowed semantic relations for propagation",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json
new file mode 100644
index 000000000..1a67134a6
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json
@@ -0,0 +1,20 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml
new file mode 100644
index 000000000..8d2c34105
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml
@@ -0,0 +1,58 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
+ spark2MaxExecutors
+ 50
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml
new file mode 100644
index 000000000..e4429b710
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml
@@ -0,0 +1,372 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ allowedsemrels
+ the semantic relationships allowed for propagation
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-PreparePhase1-Publications
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+ --conf spark.sql.shuffle.partitions=3840
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${workingDir}/preparedInfo/targetOrcidAssoc
+ --allowedsemrels${allowedsemrels}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-PreparePhase1-Dataset
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${workingDir}/preparedInfo/targetOrcidAssoc
+ --allowedsemrels${allowedsemrels}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-PreparePhase1-ORP
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${workingDir}/preparedInfo/targetOrcidAssoc
+ --allowedsemrels${allowedsemrels}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-PreparePhase1-Software
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${workingDir}/preparedInfo/targetOrcidAssoc
+ --allowedsemrels${allowedsemrels}
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-PreparePhase2
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${workingDir}/preparedInfo/targetOrcidAssoc
+ --outputPath${workingDir}/preparedInfo/mergedOrcidAssoc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-Publication
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+ --conf spark.sql.shuffle.partitions=3840
+
+ --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc
+ --sourcePath${sourcePath}/publication
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/publication
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-Dataset
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+
+ --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc
+ --sourcePath${sourcePath}/dataset
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/dataset
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-ORP
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+
+ --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc
+ --sourcePath${sourcePath}/otherresearchproduct
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/otherresearchproduct
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+ yarn
+ cluster
+ ORCIDPropagation-Software
+ eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+ --conf spark.speculation=false
+ --conf spark.hadoop.mapreduce.map.speculative=false
+ --conf spark.hadoop.mapreduce.reduce.speculative=false
+
+ --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc
+ --sourcePath${sourcePath}/software
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/software
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json
new file mode 100644
index 000000000..a70dbd6a0
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json
@@ -0,0 +1,33 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+
+ {
+ "paramName":"asr",
+ "paramLongName":"allowedsemrels",
+ "paramDescription": "the allowed semantic relations for propagation. Split by ;",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName":"pu",
+ "paramLongName":"potentialUpdatePath",
+ "paramDescription": "the path of the potential updates ",
+ "paramRequired": true
+ },
+ {
+ "paramName":"al",
+ "paramLongName":"alreadyLinkedPath",
+ "paramDescription": "the path of the already linked project result_set",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json
new file mode 100644
index 000000000..7f44ba03c
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json
@@ -0,0 +1,44 @@
+[
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName":"sg",
+ "paramLongName":"saveGraph",
+ "paramDescription": "true if the new version of the graph must be saved",
+ "paramRequired": false
+ },
+ {
+ "paramName":"pu",
+ "paramLongName":"potentialUpdatePath",
+ "paramDescription": "the path of the potential updates ",
+ "paramRequired": true
+ },
+ {
+ "paramName":"al",
+ "paramLongName":"alreadyLinkedPath",
+ "paramDescription": "the path of the already linked project result_set",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "test",
+ "paramLongName": "isTest",
+ "paramDescription": "true if it is executing a test",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml
new file mode 100644
index 000000000..caf3c6050
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml
@@ -0,0 +1,63 @@
+
+
+ jobTracker
+ yarnRM
+
+
+
+ nameNode
+
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
+ spark2MaxExecutors
+ 50
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml
new file mode 100644
index 000000000..24e1d3b7f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml
@@ -0,0 +1,187 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ allowedsemrels
+ the allowed semantics
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/publication
+ ${nameNode}/${outputPath}/publication
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/dataset
+ ${nameNode}/${outputPath}/dataset
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/otherresearchproduct
+ ${nameNode}/${outputPath}/otherresearchproduct
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/software
+ ${nameNode}/${outputPath}/software
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ PrepareProjectResultsAssociation
+ eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/relation
+ --allowedsemrels${allowedsemrels}
+ --hive_metastore_uris${hive_metastore_uris}
+ --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ProjectToResultPropagation
+ eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --saveGraph${saveGraph}
+ --hive_metastore_uris${hive_metastore_uris}
+ --outputPath${outputPath}/relation
+ --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json
new file mode 100644
index 000000000..eebc1a0ca
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json
@@ -0,0 +1,51 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName":"sg",
+ "paramLongName":"saveGraph",
+ "paramDescription": "true if the new version of the graph must be saved",
+ "paramRequired": false
+ },
+ {
+ "paramName":"test",
+ "paramLongName":"isTest",
+ "paramDescription": "true if it is executing a test",
+ "paramRequired": false
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "p",
+ "paramLongName": "preparedInfoPath",
+ "paramDescription": "the path where prepared info have been stored",
+ "paramRequired": true
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json
new file mode 100644
index 000000000..8df509abf
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json
@@ -0,0 +1,33 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"ocm",
+ "paramLongName":"organizationtoresultcommunitymap",
+ "paramDescription": "the map for the association organization communities",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml
new file mode 100644
index 000000000..2744ea92b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml
@@ -0,0 +1,58 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
+ spark2MaxExecutors
+ 50
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml
new file mode 100644
index 000000000..d481cad05
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml
@@ -0,0 +1,239 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ organizationtoresultcommunitymap
+ organization community map
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Prepare-Community-Result-Organization
+ eu.dnetlib.dhp.resulttocommunityfromorganization.PrepareResultCommunitySet
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}/relation
+ --hive_metastore_uris${hive_metastore_uris}
+ --outputPath${workingDir}/preparedInfo/resultCommunityList
+ --organizationtoresultcommunitymap${organizationtoresultcommunitymap}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ community2resultfromorganization-Publication
+ eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList
+ --sourcePath${sourcePath}/publication
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/publication
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ community2resultfromorganization-Dataset
+ eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList
+ --sourcePath${sourcePath}/dataset
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/dataset
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ community2resultfromorganization-ORP
+ eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList
+ --sourcePath${sourcePath}/otherresearchproduct
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/otherresearchproduct
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ community2resultfromorganization-Software
+ eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList
+ --sourcePath${sourcePath}/software
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/software
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json
new file mode 100644
index 000000000..a40ce375e
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json
@@ -0,0 +1,52 @@
+[
+
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"sg",
+ "paramLongName":"saveGraph",
+ "paramDescription": "true if the new version of the graph must be saved",
+ "paramRequired": false
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "p",
+ "paramLongName": "preparedInfoPath",
+ "paramDescription": "the path where prepared info have been stored",
+ "paramRequired": true
+ },
+ {
+ "paramName":"test",
+ "paramLongName":"isTest",
+ "paramDescription": "true if it is executing a test",
+ "paramRequired": false
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json
new file mode 100644
index 000000000..3ba3c8e9c
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json
@@ -0,0 +1,20 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json
new file mode 100644
index 000000000..8c99da673
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json
@@ -0,0 +1,44 @@
+[
+ {
+ "paramName":"is",
+ "paramLongName":"isLookUpUrl",
+ "paramDescription": "URL of the isLookUp Service",
+ "paramRequired": true
+ },
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"as",
+ "paramLongName":"allowedsemrels",
+ "paramDescription": "the allowed semantic relations for propagation",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml
new file mode 100644
index 000000000..2744ea92b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml
@@ -0,0 +1,58 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
+ spark2MaxExecutors
+ 50
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml
new file mode 100644
index 000000000..81b51443c
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml
@@ -0,0 +1,366 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ allowedsemrels
+ the semantic relationships allowed for propagation
+
+
+ isLookUpUrl
+ the isLookup service endpoint
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ResultToCommunitySemRel-PreparePhase1-Publications
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${workingDir}/preparedInfo/targetCommunityAssoc
+ --allowedsemrels${allowedsemrels}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ResultToCommunitySemRel-PreparePhase1-Dataset
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${workingDir}/preparedInfo/targetCommunityAssoc
+ --allowedsemrels${allowedsemrels}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ResultToCommunitySemRel-PreparePhase1-ORP
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${workingDir}/preparedInfo/targetCommunityAssoc
+ --allowedsemrels${allowedsemrels}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ResultToCommunitySemRel-PreparePhase1-Software
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${workingDir}/preparedInfo/targetCommunityAssoc
+ --allowedsemrels${allowedsemrels}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ResultToCommunitySemRelPropagation-PreparePhase2
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${workingDir}/preparedInfo/targetCommunityAssoc
+ --outputPath${workingDir}/preparedInfo/mergedCommunityAssoc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Result2CommunitySemRelPropagation-Publication
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc
+ --sourcePath${sourcePath}/publication
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/publication
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Result2CommunitySemRelPropagation-Dataset
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc
+ --sourcePath${sourcePath}/dataset
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/dataset
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Result2CommunitySemRelPropagation-ORP
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc
+ --sourcePath${sourcePath}/otherresearchproduct
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/otherresearchproduct
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ Result2CommunitySemRelPropagation-Software
+ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc
+ --sourcePath${sourcePath}/software
+ --hive_metastore_uris${hive_metastore_uris}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/software
+ --saveGraph${saveGraph}
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json
new file mode 100644
index 000000000..c74496350
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json
@@ -0,0 +1,32 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName":"dop",
+ "paramLongName":"datasourceOrganizationPath",
+ "paramDescription": "path where to store/find association from datasource and organization",
+ "paramRequired": true
+ },
+ {
+ "paramName":"alp",
+ "paramLongName":"alreadyLinkedPath",
+ "paramDescription": "path where to store/find already linked results and organizations",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json
new file mode 100644
index 000000000..d2b076c82
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json
@@ -0,0 +1,56 @@
+[
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequential file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName":"sg",
+ "paramLongName":"saveGraph",
+ "paramDescription": "true if the new version of the graph must be saved",
+ "paramRequired": false
+ },
+ {
+ "paramName":"dop",
+ "paramLongName":"datasourceOrganizationPath",
+ "paramDescription": "path where to store/find association from datasource and organization",
+ "paramRequired": true
+ },
+ {
+ "paramName":"alp",
+ "paramLongName":"alreadyLinkedPath",
+ "paramDescription": "path where to store/find already linked results and organizations",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "test",
+ "paramLongName": "isTest",
+ "paramDescription": "true if it is running as a test",
+ "paramRequired": false
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml
new file mode 100644
index 000000000..2744ea92b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml
@@ -0,0 +1,58 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
+ spark2MaxExecutors
+ 50
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml
new file mode 100644
index 000000000..a1b7f4ad7
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml
@@ -0,0 +1,284 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ outputPath
+ sets the outputPath
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/publication
+ ${nameNode}/${outputPath}/publication
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/dataset
+ ${nameNode}/${outputPath}/dataset
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/otherresearchproduct
+ ${nameNode}/${outputPath}/otherresearchproduct
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/software
+ ${nameNode}/${outputPath}/software
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ PrepareResultOrganizationAssociation
+ eu.dnetlib.dhp.resulttoorganizationfrominstrepo.PrepareResultInstRepoAssociation
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}
+ --hive_metastore_uris${hive_metastore_uris}
+ --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ resultToOrganizationFromInstRepoPropagationForPublications
+ eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}/publication
+ --hive_metastore_uris${hive_metastore_uris}
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/relation
+ --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+ yarn
+ cluster
+ resultToOrganizationFromInstRepoPropagationForDataset
+ eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}/dataset
+ --hive_metastore_uris${hive_metastore_uris}
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/relation
+ --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+ yarn
+ cluster
+ resultToOrganizationFromInstRepoPropagationForORP
+ eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}/otherresearchproduct
+ --hive_metastore_uris${hive_metastore_uris}
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/relation
+ --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+ yarn
+ cluster
+ resultToOrganizationFromInstRepoPropagationForSoftware
+ eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob
+ dhp-enrichment-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
+
+ --sourcePath${sourcePath}/software
+ --hive_metastore_uris${hive_metastore_uris}
+ --saveGraph${saveGraph}
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/relation
+ --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization
+ --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java
new file mode 100644
index 000000000..72e0a63fa
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java
@@ -0,0 +1,772 @@
+
+package eu.dnetlib.dhp.bulktag;
+
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Software;
+
+public class BulkTagJobTest {
+ /** Jackson mapper used by the tests to deserialize each JSON output line of the job. */
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ /** Value passed to the job as -isLookUpUrl. NOTE(review): presumably never contacted when -isTest is true; confirm. */
+ public static final String MOCK_IS_LOOK_UP_URL = "BASEURL:8280/is/services/isLookUp";
+ /** JSON object mapping the names author, title, orcid, contributor and description to JSONPath expressions; passed as -pathMap. */
+ public static final String pathMap = "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}";
+ /** Spark session shared by all tests; created in beforeAll() and stopped in afterAll(). */
+ private static SparkSession spark;
+ /** Temporary directory used for the job output and as Spark/Hive warehouse location; deleted in afterAll(). */
+ private static Path workingDir;
+ /** Logger of this test class. */
+ private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class);
+ /** Content of the tagging_conf.xml classpath resource, read by the static initializer; passed to the job as -taggingConf. */
+ private static String taggingConf = "";
+
+ static {
+ try {
+ taggingConf = IOUtils
+ .toString(
+ BulkTagJobTest.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml"));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+  * Prepares the fixtures shared by every test: a fresh temporary working directory and a local
+  * Spark session whose SQL and Hive warehouse locations point inside that directory.
+  *
+  * @throws IOException if the temporary directory cannot be created
+  */
+ @BeforeAll
+ public static void beforeAll() throws IOException {
+     final String testName = BulkTagJobTest.class.getSimpleName();
+
+     workingDir = Files.createTempDirectory(testName);
+     log.info("using work dir {}", workingDir);
+
+     // SparkConf setters return the configuration itself, so the settings are applied fluently
+     final SparkConf sparkConf = new SparkConf()
+         .setAppName(testName)
+         .setMaster("local[*]")
+         .set("spark.driver.host", "localhost")
+         .set("hive.metastore.local", "true")
+         .set("spark.ui.enabled", "false")
+         .set("spark.sql.warehouse.dir", workingDir.toString())
+         .set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+     spark = SparkSession.builder().appName(testName).config(sparkConf).getOrCreate();
+ }
+
+ /**
+  * Releases the fixtures created in {@code beforeAll()}: stops the shared Spark session and then
+  * deletes the temporary working directory.
+  *
+  * The session is stopped before the directory is removed because the directory is configured as
+  * the session's warehouse location; the {@code finally} block makes sure the directory is still
+  * deleted when stopping the session fails.
+  *
+  * @throws IOException if the working directory cannot be deleted
+  */
+ @AfterAll
+ public static void afterAll() throws IOException {
+     try {
+         // null when beforeAll() failed before the session was created
+         if (spark != null) {
+             spark.stop();
+         }
+     } finally {
+         if (workingDir != null) {
+             FileUtils.deleteDirectory(workingDir.toFile());
+         }
+     }
+ }
+
+ /**
+  * Runs the bulk tagging job over the {@code no_updates} dataset sample and checks that the job
+  * writes 10 records, none of which carries a context whose datainfo has inferenceprovenance
+  * {@code bulktagging}.
+  *
+  * @throws Exception if the job or the verification of its output fails
+  */
+ @Test
+ public void noUpdatesTest() throws Exception {
+     SparkBulkTagJob
+         .main(
+             new String[] {
+                 "-isTest", Boolean.TRUE.toString(),
+                 "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                 "-sourcePath",
+                 getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates").getPath(),
+                 "-taggingConf", taggingConf,
+                 "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+                 "-outputPath", workingDir.toString() + "/dataset",
+                 "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                 "-pathMap", pathMap
+             });
+
+     final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+     // the job output is read back as text and every line is parsed as one JSON-serialized Dataset
+     JavaRDD<Dataset> tmp = sc
+         .textFile(workingDir.toString() + "/dataset")
+         .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+     Assertions.assertEquals(10, tmp.count());
+     org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
+         .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+     verificationDataset.createOrReplaceTempView("dataset");
+
+     // one row per (result, context) pair whose datainfo was produced by bulk tagging
+     String query = "select id, MyT.id community "
+         + "from dataset "
+         + "lateral view explode(context) c as MyT "
+         + "lateral view explode(MyT.datainfo) d as MyD "
+         + "where MyD.inferenceprovenance = 'bulktagging'";
+
+     Assertions.assertEquals(0, spark.sql(query).count());
+ }
+
+ @Test
+ public void bulktagBySubjectNoPreviousContextTest() throws Exception {
+ final String sourcePath = getClass()
+ .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext")
+ .getPath();
+ final String pathMap = BulkTagJobTest.pathMap;
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest", Boolean.TRUE.toString(),
+ "-isSparkSessionManaged", Boolean.FALSE.toString(),
+ "-sourcePath", sourcePath,
+ "-taggingConf", taggingConf,
+ "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+ "-outputPath", workingDir.toString() + "/dataset",
+ "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+ "-pathMap", pathMap
+ });
+
+ final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/dataset")
+ .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+ verificationDataset.createOrReplaceTempView("dataset");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from dataset "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ Assertions.assertEquals(5, spark.sql(query).count());
+
+ org.apache.spark.sql.Dataset