diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index c1d6e1b5b..c7cb11b08 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -83,6 +83,10 @@
com.jayway.jsonpath
json-path
+
+ org.postgresql
+ postgresql
+
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
similarity index 95%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java
rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
index 94f17aad5..cedc9bd4d 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common;
import java.io.Closeable;
import java.io.IOException;
@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
- private final Connection connection;
+ private Connection connection;
public DbClient(final String address, final String login, final String password) {
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
index cdde37fd4..7b0b9a1e2 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
@@ -1,4 +1,3 @@
-
package eu.dnetlib.dhp.schema.common;
import java.util.Map;
@@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class ModelSupport {
/** Defines the mapping between the actual entity type and the main entity type */
- private static final Map entityMapping = Maps.newHashMap();
+ private static Map entityMapping = Maps.newHashMap();
static {
entityMapping.put(EntityType.publication, MainEntityType.result);
@@ -53,6 +52,232 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class);
}
+ public static final Map entityIdPrefix = Maps.newHashMap();
+
+ static {
+ entityIdPrefix.put("datasource", "10");
+ entityIdPrefix.put("organization", "20");
+ entityIdPrefix.put("project", "40");
+ entityIdPrefix.put("result", "50");
+ }
+
+ public static final Map relationInverseMap = Maps.newHashMap();
+
+ static {
+ relationInverseMap
+ .put(
+ "personResult_authorship_isAuthorOf", new RelationInverse()
+ .setRelation("isAuthorOf")
+ .setInverse("hasAuthor")
+ .setRelType("personResult")
+ .setSubReltype("authorship"));
+ relationInverseMap
+ .put(
+ "personResult_authorship_hasAuthor", new RelationInverse()
+ .setInverse("isAuthorOf")
+ .setRelation("hasAuthor")
+ .setRelType("personResult")
+ .setSubReltype("authorship"));
+ relationInverseMap
+ .put(
+ "projectOrganization_participation_isParticipant", new RelationInverse()
+ .setRelation("isParticipant")
+ .setInverse("hasParticipant")
+ .setRelType("projectOrganization")
+ .setSubReltype("participation"));
+ relationInverseMap
+ .put(
+ "projectOrganization_participation_hasParticipant", new RelationInverse()
+ .setInverse("isParticipant")
+ .setRelation("hasParticipant")
+ .setRelType("projectOrganization")
+ .setSubReltype("participation"));
+ relationInverseMap
+ .put(
+ "resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
+ .setRelation("hasAuthorInstitution")
+ .setInverse("isAuthorInstitutionOf")
+ .setRelType("resultOrganization")
+ .setSubReltype("affiliation"));
+ relationInverseMap
+ .put(
+ "resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
+ .setInverse("hasAuthorInstitution")
+ .setRelation("isAuthorInstitutionOf")
+ .setRelType("resultOrganization")
+ .setSubReltype("affiliation"));
+ relationInverseMap
+ .put(
+ "organizationOrganization_dedup_merges", new RelationInverse()
+ .setRelation("merges")
+ .setInverse("isMergedIn")
+ .setRelType("organizationOrganization")
+ .setSubReltype("dedup"));
+ relationInverseMap
+ .put(
+ "organizationOrganization_dedup_isMergedIn", new RelationInverse()
+ .setInverse("merges")
+ .setRelation("isMergedIn")
+ .setRelType("organizationOrganization")
+ .setSubReltype("dedup"));
+ relationInverseMap
+ .put(
+ "organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
+ .setInverse("isSimilarTo")
+ .setRelation("isSimilarTo")
+ .setRelType("organizationOrganization")
+ .setSubReltype("dedupSimilarity"));
+
+ relationInverseMap
+ .put(
+ "resultProject_outcome_isProducedBy", new RelationInverse()
+ .setRelation("isProducedBy")
+ .setInverse("produces")
+ .setRelType("resultProject")
+ .setSubReltype("outcome"));
+ relationInverseMap
+ .put(
+ "resultProject_outcome_produces", new RelationInverse()
+ .setInverse("isProducedBy")
+ .setRelation("produces")
+ .setRelType("resultProject")
+ .setSubReltype("outcome"));
+ relationInverseMap
+ .put(
+ "projectPerson_contactPerson_isContact", new RelationInverse()
+ .setRelation("isContact")
+ .setInverse("hasContact")
+ .setRelType("projectPerson")
+ .setSubReltype("contactPerson"));
+ relationInverseMap
+ .put(
+ "projectPerson_contactPerson_hasContact", new RelationInverse()
+ .setInverse("isContact")
+ .setRelation("hasContact")
+ .setRelType("personPerson")
+ .setSubReltype("coAuthorship"));
+ relationInverseMap
+ .put(
+ "personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
+ .setInverse("isCoAuthorOf")
+ .setRelation("isCoAuthorOf")
+ .setRelType("personPerson")
+ .setSubReltype("coAuthorship"));
+ relationInverseMap
+ .put(
+ "personPerson_dedup_merges", new RelationInverse()
+ .setInverse("isMergedIn")
+ .setRelation("merges")
+ .setRelType("personPerson")
+ .setSubReltype("dedup"));
+ relationInverseMap
+ .put(
+ "personPerson_dedup_isMergedIn", new RelationInverse()
+ .setInverse("merges")
+ .setRelation("isMergedIn")
+ .setRelType("personPerson")
+ .setSubReltype("dedup"));
+ relationInverseMap
+ .put(
+ "personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
+ .setInverse("isSimilarTo")
+ .setRelation("isSimilarTo")
+ .setRelType("personPerson")
+ .setSubReltype("dedupSimilarity"));
+ relationInverseMap
+ .put(
+ "datasourceOrganization_provision_isProvidedBy", new RelationInverse()
+ .setInverse("provides")
+ .setRelation("isProvidedBy")
+ .setRelType("datasourceOrganization")
+ .setSubReltype("provision"));
+ relationInverseMap
+ .put(
+ "datasourceOrganization_provision_provides", new RelationInverse()
+ .setInverse("isProvidedBy")
+ .setRelation("provides")
+ .setRelType("datasourceOrganization")
+ .setSubReltype("provision"));
+ relationInverseMap
+ .put(
+ "resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
+ .setInverse("isAmongTopNSimilarDocuments")
+ .setRelation("hasAmongTopNSimilarDocuments")
+ .setRelType("resultResult")
+ .setSubReltype("similarity"));
+ relationInverseMap
+ .put(
+ "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
+ .setInverse("hasAmongTopNSimilarDocuments")
+ .setRelation("isAmongTopNSimilarDocuments")
+ .setRelType("resultResult")
+ .setSubReltype("similarity"));
+ relationInverseMap
+ .put(
+ "resultResult_relationship_isRelatedTo", new RelationInverse()
+ .setInverse("isRelatedTo")
+ .setRelation("isRelatedTo")
+ .setRelType("resultResult")
+ .setSubReltype("relationship"));
+ relationInverseMap
+ .put(
+ "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
+ .setInverse("hasAmongTopNSimilarDocuments")
+ .setRelation("isAmongTopNSimilarDocuments")
+ .setRelType("resultResult")
+ .setSubReltype("similarity"));
+ relationInverseMap
+ .put(
+ "resultResult_supplement_isSupplementTo", new RelationInverse()
+ .setInverse("isSupplementedBy")
+ .setRelation("isSupplementTo")
+ .setRelType("resultResult")
+ .setSubReltype("supplement"));
+ relationInverseMap
+ .put(
+ "resultResult_supplement_isSupplementedBy", new RelationInverse()
+ .setInverse("isSupplementTo")
+ .setRelation("isSupplementedBy")
+ .setRelType("resultResult")
+ .setSubReltype("supplement"));
+ relationInverseMap
+ .put(
+ "resultResult_part_isPartOf", new RelationInverse()
+ .setInverse("hasPart")
+ .setRelation("isPartOf")
+ .setRelType("resultResult")
+ .setSubReltype("part"));
+ relationInverseMap
+ .put(
+ "resultResult_part_hasPart", new RelationInverse()
+ .setInverse("isPartOf")
+ .setRelation("hasPart")
+ .setRelType("resultResult")
+ .setSubReltype("part"));
+ relationInverseMap
+ .put(
+ "resultResult_dedup_merges", new RelationInverse()
+ .setInverse("isMergedIn")
+ .setRelation("merges")
+ .setRelType("resultResult")
+ .setSubReltype("dedup"));
+ relationInverseMap
+ .put(
+ "resultResult_dedup_isMergedIn", new RelationInverse()
+ .setInverse("merges")
+ .setRelation("isMergedIn")
+ .setRelType("resultResult")
+ .setSubReltype("dedup"));
+ relationInverseMap
+ .put(
+ "resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
+ .setInverse("isSimilarTo")
+ .setRelation("isSimilarTo")
+ .setRelType("resultResult")
+ .setSubReltype("dedupSimilarity"));
+
+ }
+
private static final String schemeTemplate = "dnet:%s_%s_relations";
private ModelSupport() {
@@ -68,7 +293,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y
*/
public static Boolean isSubClass(
- X subClazzObject, Y superClazzObject) {
+ X subClazzObject, Y superClazzObject) {
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
}
@@ -82,7 +307,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y
*/
public static Boolean isSubClass(
- X subClazzObject, Class superClazz) {
+ X subClazzObject, Class superClazz) {
return isSubClass(subClazzObject.getClass(), superClazz);
}
@@ -96,7 +321,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y
*/
public static Boolean isSubClass(
- Class subClazz, Class superClazz) {
+ Class subClazz, Class superClazz) {
return superClazz.isAssignableFrom(subClazz);
}
@@ -108,32 +333,32 @@ public class ModelSupport {
*/
public static Class[] getOafModelClasses() {
return new Class[] {
- Author.class,
- Context.class,
- Country.class,
- DataInfo.class,
- Dataset.class,
- Datasource.class,
- ExternalReference.class,
- ExtraInfo.class,
- Field.class,
- GeoLocation.class,
- Instance.class,
- Journal.class,
- KeyValue.class,
- Oaf.class,
- OafEntity.class,
- OAIProvenance.class,
- Organization.class,
- OriginDescription.class,
- OtherResearchProduct.class,
- Project.class,
- Publication.class,
- Qualifier.class,
- Relation.class,
- Result.class,
- Software.class,
- StructuredProperty.class
+ Author.class,
+ Context.class,
+ Country.class,
+ DataInfo.class,
+ Dataset.class,
+ Datasource.class,
+ ExternalReference.class,
+ ExtraInfo.class,
+ Field.class,
+ GeoLocation.class,
+ Instance.class,
+ Journal.class,
+ KeyValue.class,
+ Oaf.class,
+ OafEntity.class,
+ OAIProvenance.class,
+ Organization.class,
+ OriginDescription.class,
+ OtherResearchProduct.class,
+ Project.class,
+ Publication.class,
+ Qualifier.class,
+ Relation.class,
+ Result.class,
+ Software.class,
+ StructuredProperty.class
};
}
@@ -147,10 +372,10 @@ public class ModelSupport {
public static String getScheme(final String sourceType, final String targetType) {
return String
- .format(
- schemeTemplate,
- entityMapping.get(EntityType.valueOf(sourceType)).name(),
- entityMapping.get(EntityType.valueOf(targetType)).name());
+ .format(
+ schemeTemplate,
+ entityMapping.get(EntityType.valueOf(sourceType)).name(),
+ entityMapping.get(EntityType.valueOf(targetType)).name());
}
public static Function idFn() {
@@ -165,38 +390,38 @@ public class ModelSupport {
private static String idFnForRelation(T t) {
Relation r = (Relation) t;
return Optional
- .ofNullable(r.getSource())
- .map(
- source -> Optional
- .ofNullable(r.getTarget())
- .map(
- target -> Optional
- .ofNullable(r.getRelType())
- .map(
- relType -> Optional
- .ofNullable(r.getSubRelType())
- .map(
- subRelType -> Optional
- .ofNullable(r.getRelClass())
- .map(
- relClass -> String
- .join(
- source,
- target,
- relType,
- subRelType,
- relClass))
- .orElse(
- String
- .join(
- source,
- target,
- relType,
- subRelType)))
- .orElse(String.join(source, target, relType)))
- .orElse(String.join(source, target)))
- .orElse(source))
- .orElse(null);
+ .ofNullable(r.getSource())
+ .map(
+ source -> Optional
+ .ofNullable(r.getTarget())
+ .map(
+ target -> Optional
+ .ofNullable(r.getRelType())
+ .map(
+ relType -> Optional
+ .ofNullable(r.getSubRelType())
+ .map(
+ subRelType -> Optional
+ .ofNullable(r.getRelClass())
+ .map(
+ relClass -> String
+ .join(
+ source,
+ target,
+ relType,
+ subRelType,
+ relClass))
+ .orElse(
+ String
+ .join(
+ source,
+ target,
+ relType,
+ subRelType)))
+ .orElse(String.join(source, target, relType)))
+ .orElse(String.join(source, target)))
+ .orElse(source))
+ .orElse(null);
}
private static String idFnForOafEntity(T t) {
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java
new file mode 100644
index 000000000..4757c637e
--- /dev/null
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java
@@ -0,0 +1,46 @@
+
+package eu.dnetlib.dhp.schema.common;
+
+public class RelationInverse {
+ private String relation;
+ private String inverse;
+ private String relType;
+ private String subReltype;
+
+ public String getRelType() {
+ return relType;
+ }
+
+ public RelationInverse setRelType(String relType) {
+ this.relType = relType;
+ return this;
+ }
+
+ public String getSubReltype() {
+ return subReltype;
+ }
+
+ public RelationInverse setSubReltype(String subReltype) {
+ this.subReltype = subReltype;
+ return this;
+ }
+
+ public String getRelation() {
+ return relation;
+ }
+
+ public RelationInverse setRelation(String relation) {
+ this.relation = relation;
+ return this;
+ }
+
+ public String getInverse() {
+ return inverse;
+ }
+
+ public RelationInverse setInverse(String inverse) {
+ this.inverse = inverse;
+ return this;
+ }
+
+}
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
index b9bd4c5f0..231fb1e60 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
@@ -2,8 +2,7 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
-import java.util.List;
-import java.util.Objects;
+import java.util.*;
public class Author implements Serializable {
@@ -86,4 +85,5 @@ public class Author implements Serializable {
public int hashCode() {
return Objects.hash(fullname, name, surname, rank, pid, affiliation);
}
+
}
diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml
new file mode 100644
index 000000000..37abc22f6
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/pom.xml
@@ -0,0 +1,36 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-blacklist
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
new file mode 100644
index 000000000..d5c2b518a
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
@@ -0,0 +1,100 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class PrepareMergedRelationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareMergedRelationJob.class);
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareMergedRelationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ String outputPath = parser.get("outputPath");
+ log.info("outputPath: {} ", outputPath);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ selectMergesRelations(
+ spark,
+ inputPath,
+ outputPath);
+ });
+ }
+
+ private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
+
+ Dataset relation = readRelations(spark, inputPath);
+
+ relation
+ .filter("relclass = 'merges' and datainfo.deletedbyinference=false")
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+// relation.createOrReplaceTempView("relation");
+//
+// spark
+// .sql(
+// "Select * from relation " +
+// "where relclass = 'merges' " +
+// "and datainfo.deletedbyinference = false")
+// .as(Encoders.bean(Relation.class))
+// .toJSON()
+// .write()
+// .mode(SaveMode.Overwrite)
+// .option("compression", "gzip")
+// .text(outputPath);
+ }
+
+ public static org.apache.spark.sql.Dataset readRelations(
+ SparkSession spark, String inputPath) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map(
+ (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class),
+ Encoders.bean(Relation.class));
+ }
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
new file mode 100644
index 000000000..704cab375
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
@@ -0,0 +1,142 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.function.Function;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.DbClient;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class ReadBlacklistFromDB implements Closeable {
+
+ private final DbClient dbClient;
+ private static final Log log = LogFactory.getLog(ReadBlacklistFromDB.class);
+ private final Configuration conf;
+ private final BufferedWriter writer;
+ private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private final static String query = "SELECT source_type, unnest(original_source_objects) as source, " +
+ "target_type, unnest(original_target_objects) as target, " +
+ "relationship FROM blacklist WHERE status = 'ACCEPTED'";
+
+ public static void main(final String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils
+ .toString(
+ ReadBlacklistFromDB.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/blacklist/blacklist_parameters.json")));
+
+ parser.parseArgument(args);
+
+ final String dbUrl = parser.get("postgresUrl");
+ final String dbUser = parser.get("postgresUser");
+ final String dbPassword = parser.get("postgresPassword");
+ final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
+ final String hdfsNameNode = parser.get("hdfsNameNode");
+
+ try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
+ dbPassword)) {
+
+ log.info("Processing blacklist...");
+ rbl.execute(query, rbl::processBlacklistEntry);
+
+ }
+ }
+
+ public void execute(final String sql, final Function> producer)
+ throws Exception {
+
+ final Consumer consumer = rs -> producer.apply(rs).forEach(r -> writeRelation(r));
+
+ dbClient.processResults(sql, consumer);
+ }
+
+ public List processBlacklistEntry(ResultSet rs) {
+ try {
+ Relation direct = new Relation();
+ Relation inverse = new Relation();
+
+ String source_prefix = ModelSupport.entityIdPrefix.get(rs.getString("source_type"));
+ String target_prefix = ModelSupport.entityIdPrefix.get(rs.getString("target_type"));
+
+ String source_direct = source_prefix + "|" + rs.getString("source");
+ direct.setSource(source_direct);
+ inverse.setTarget(source_direct);
+
+ String target_direct = target_prefix + "|" + rs.getString("target");
+ direct.setTarget(target_direct);
+ inverse.setSource(target_direct);
+
+ String encoding = rs.getString("relationship");
+ RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
+ direct.setRelClass(ri.getRelation());
+ inverse.setRelClass(ri.getInverse());
+ direct.setRelType(ri.getRelType());
+ inverse.setRelType(ri.getRelType());
+ direct.setSubRelType(ri.getSubReltype());
+ inverse.setSubRelType(ri.getSubReltype());
+
+ return Arrays.asList(direct, inverse);
+
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ dbClient.close();
+ writer.close();
+ }
+
+ public ReadBlacklistFromDB(
+ final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
+ throws Exception {
+
+ this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
+ this.conf = new Configuration();
+ this.conf.set("fs.defaultFS", hdfsNameNode);
+ FileSystem fileSystem = FileSystem.get(this.conf);
+ Path hdfsWritePath = new Path(hdfsPath);
+ FSDataOutputStream fsDataOutputStream = null;
+ if (fileSystem.exists(hdfsWritePath)) {
+ fsDataOutputStream = fileSystem.append(hdfsWritePath);
+ } else {
+ fsDataOutputStream = fileSystem.create(hdfsWritePath);
+ }
+
+ this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
+ }
+
+ protected void writeRelation(final Relation r) {
+ try {
+ writer.write(OBJECT_MAPPER.writeValueAsString(r));
+ writer.newLine();
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
new file mode 100644
index 000000000..c5104058c
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
@@ -0,0 +1,151 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Objects;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+public class SparkRemoveBlacklistedRelationJob {
+ private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkRemoveBlacklistedRelationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath {}: ", outputPath);
+
+ final String blacklistPath = parser.get("hdfsPath");
+ log.info("blacklistPath {}: ", blacklistPath);
+
+ final String mergesPath = parser.get("mergesPath");
+ log.info("mergesPath {}: ", mergesPath);
+
+ SparkConf conf = new SparkConf();
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ removeBlacklistedRelations(
+ spark,
+ blacklistPath,
+ inputPath,
+ outputPath,
+ mergesPath);
+ });
+
+ }
+
+ private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
+ String outputPath, String mergesPath) {
+ Dataset blackListed = readRelations(spark, blacklistPath + "/blacklist");
+ Dataset inputRelation = readRelations(spark, inputPath);
+ Dataset mergesRelation = readRelations(spark, mergesPath);
+
+ log.info("InputRelationCount: {}", inputRelation.count());
+
+ Dataset dedupSource = blackListed
+ .joinWith(
+ mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
+ "left_outer")
+ .map(c -> {
+ Optional
+ .ofNullable(c._2())
+ .ifPresent(mr -> c._1().setSource(mr.getSource()));
+ return c._1();
+ }, Encoders.bean(Relation.class));
+
+ Dataset dedupBL = dedupSource
+ .joinWith(
+ mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")),
+ "left_outer")
+ .map(c -> {
+ Optional
+ .ofNullable(c._2())
+ .ifPresent(mr -> c._1().setTarget(mr.getSource()));
+ return c._1();
+ }, Encoders.bean(Relation.class));
+
+ dedupBL
+ .write()
+ .mode(SaveMode.Overwrite)
+ .json(blacklistPath + "/deduped");
+
+
+ inputRelation
+ .joinWith(
+ dedupBL, (inputRelation
+ .col("source")
+ .equalTo(dedupBL.col("source"))
+ .and(
+ inputRelation
+ .col("target")
+ .equalTo(dedupBL.col("target")))),
+ "left_outer")
+ .map(c -> {
+ Relation ir = c._1();
+ Optional obl = Optional.ofNullable(c._2());
+ if (obl.isPresent()) {
+ if (ir.equals(obl.get())) {
+ return null;
+ }
+ }
+ return ir;
+
+ }, Encoders.bean(Relation.class))
+ .filter(Objects::nonNull)
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+
+ }
+
+
+ public static org.apache.spark.sql.Dataset readRelations(
+ SparkSession spark, String inputPath) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map(
+ (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class),
+ Encoders.bean(Relation.class));
+ }
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json
new file mode 100644
index 000000000..9a2eadaa7
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json
@@ -0,0 +1,32 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nn",
+ "paramLongName": "hdfsNameNode",
+ "paramDescription": "the name node on hdfs",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pgurl",
+ "paramLongName": "postgresUrl",
+ "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pguser",
+ "paramLongName": "postgresUser",
+ "paramDescription": "postgres user",
+ "paramRequired": false
+ },
+ {
+ "paramName": "pgpasswd",
+ "paramLongName": "postgresPassword",
+ "paramDescription": "postgres password",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json
new file mode 100644
index 000000000..4a3d21f4d
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json
@@ -0,0 +1,26 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the path to the graph used to remove the relations ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path where to store the temporary result ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": false
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml
new file mode 100644
index 000000000..fe82ae194
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml
@@ -0,0 +1,54 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
new file mode 100644
index 000000000..855cac65e
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
@@ -0,0 +1,98 @@
+
+
+
+ postgresURL
+ the url of the postgress server to query
+
+
+ postgresUser
+ the username to access the postgres db
+
+
+ postgresPassword
+ the postgres password
+
+
+ sourcePath
+ the source path
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB
+ --hdfsPath${workingDir}/blacklist
+ --hdfsNameNode${nameNode}
+ --postgresUrl${postgresURL}
+ --postgresUser${postgresUser}
+ --postgresPassword${postgresPassword}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ PrepareMergedRelation
+ eu.dnetlib.dhp.blacklist.PrepareMergedRelationJob
+ dhp-blacklist-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/relation
+ --outputPath${workingDir}/mergesRelation
+ --hive_metastore_uris${hive_metastore_uris}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ApplyBlacklist
+ eu.dnetlib.dhp.blacklist.SparkRemoveBlacklistedRelationJob
+ dhp-blacklist-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/relation
+ --outputPath${workingDir}/relation
+ --hdfsPath${workingDir}/blacklist
+ --mergesPath${workingDir}/mergesRelation
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json
new file mode 100644
index 000000000..91a87b8b5
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json
@@ -0,0 +1,33 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the path to the graph used to remove the relations ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path where to store the temporary result ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": false
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "mergesPath",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": true
+
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java
new file mode 100644
index 000000000..2d6b1061b
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java
@@ -0,0 +1,166 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class BlackListTest {
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static final ClassLoader cl = eu.dnetlib.dhp.blacklist.BlackListTest.class.getClassLoader();
+
+ private static SparkSession spark;
+
+ private static Path workingDir;
+ private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.blacklist.BlackListTest.class);
+
+ @BeforeAll
+ public static void beforeAll() throws IOException {
+ workingDir = Files.createTempDirectory(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName());
+ log.info("using work dir {}", workingDir);
+
+ SparkConf conf = new SparkConf();
+ conf.setAppName(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName());
+
+ conf.setMaster("local[*]");
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("spark.ui.enabled", "false");
+ conf.set("spark.sql.warehouse.dir", workingDir.toString());
+ conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+ spark = SparkSession
+ .builder()
+ .appName(BlackListTest.class.getSimpleName())
+ .config(conf)
+ .getOrCreate();
+ }
+
+ @AfterAll
+ public static void afterAll() throws IOException {
+ FileUtils.deleteDirectory(workingDir.toFile());
+ spark.stop();
+ }
+
+ /*
+ * String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String outputPath =
+ * parser.get("outputPath"); log.info("outputPath {}: ", outputPath); final String blacklistPath =
+ * parser.get("hdfsPath"); log.info("blacklistPath {}: ", blacklistPath); final String mergesPath =
+ * parser.get("mergesPath"); log.info("mergesPath {}: ", mergesPath);
+ */
+ @Test
+ public void noRemoveTest() throws Exception {
+ SparkRemoveBlacklistedRelationJob
+ .main(
+ new String[] {
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsNoRemoval").getPath(),
+ "-outputPath",
+ workingDir.toString() + "/relation",
+ "-hdfsPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
+ "-mergesPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(),
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ Assertions.assertEquals(13, tmp.count());
+
+ }
+
+ @Test
+ public void removeNoMergeMatchTest() throws Exception {
+ SparkRemoveBlacklistedRelationJob
+ .main(
+ new String[] {
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsOneRemoval").getPath(),
+ "-outputPath",
+ workingDir.toString() + "/relation",
+ "-hdfsPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
+ "-mergesPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(),
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ Assertions.assertEquals(12, tmp.count());
+
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));
+
+ Assertions
+ .assertEquals(
+ 0, verificationDataset
+ .filter(
+ "source = '40|corda__h2020::5161f53ab205d803c36b4c888fe7deef' and " +
+ "target = '20|dedup_wf_001::157af406bc653aa4d9749318b644de43'")
+ .count());
+
+ Assertions.assertEquals(0, verificationDataset.filter("relClass = 'hasParticipant'").count());
+ }
+
+ @Test
+ public void removeMergeMatchTest() throws Exception {
+ SparkRemoveBlacklistedRelationJob
+ .main(
+ new String[] {
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch").getPath(),
+ "-outputPath",
+ workingDir.toString() + "/relation",
+ "-hdfsPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
+ "-mergesPath",
+ getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRelOneMerge").getPath(),
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ Assertions.assertEquals(12, tmp.count());
+
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));
+
+ Assertions.assertEquals(12, verificationDataset.filter("relClass = 'isProvidedBy'").count());
+
+ }
+}
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist
new file mode 100644
index 000000000..ea95130af
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist
@@ -0,0 +1,20 @@
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"hasParticipant","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"isParticipant","source":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43","target":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::a727cc288016db7132ef9a799aa83350","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::a727cc288016db7132ef9a799aa83350"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::062cf091d5c7a7d730001c34177042e3","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::062cf091d5c7a7d730001c34177042e3"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::1b172ab34639e7935e2357119cf20830","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|od_______908::1b172ab34639e7935e2357119cf20830"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json
new file mode 100644
index 000000000..8f0d296d6
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json
@@ -0,0 +1,14 @@
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______177::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json
new file mode 100644
index 000000000..3d74ffa6e
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json
@@ -0,0 +1,14 @@
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json
new file mode 100644
index 000000000..761cba478
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProducedBy","relType":"resultProject","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"outcome","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json
new file mode 100644
index 000000000..a79d1d8eb
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::018cb61ed43c01704decc66183ce5d60","subRelType":"provision","target":"20|dedup_wf_001::b9fff055ce5efacecbe4ef918c127f86"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json
new file mode 100644
index 000000000..f809acfeb
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","subRelType":"participation","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/pom.xml b/dhp-workflows/dhp-bulktag/pom.xml
new file mode 100644
index 000000000..7c2afa0cc
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/pom.xml
@@ -0,0 +1,65 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-bulktag
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+ dom4j
+ dom4j
+
+
+ jaxen
+ jaxen
+
+
+ com.jayway.jsonpath
+ json-path
+
+
+ org.reflections
+ reflections
+ 0.9.11
+ compile
+
+
+ com.google.guava
+ guava
+ 23.3-jre
+
+
+ io.github.classgraph
+ classgraph
+ 4.8.71
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/project-default.properties b/dhp-workflows/dhp-bulktag/project-default.properties
new file mode 100644
index 000000000..84a56f19f
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/project-default.properties
@@ -0,0 +1,7 @@
+#sandboxName when not provided explicitly will be generated
+sandboxName=${sandboxName}
+sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName}
+workingDir=${sandboxDir}/working_dir
+oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
+oozieTopWfApplicationPath = ${oozie.wf.application.path}
+
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
new file mode 100644
index 000000000..e62b4b4fc
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
@@ -0,0 +1,120 @@
+
+package eu.dnetlib.dhp.bulktag;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.community.*;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class SparkBulkTagJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob.class);
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkBulkTagJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ Boolean isTest = Optional
+ .ofNullable(parser.get("isTest"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.FALSE);
+ log.info("isTest: {} ", isTest);
+
+ final String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class);
+ log.info("pathMap: {}", new Gson().toJson(protoMappingParams));
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ CommunityConfiguration cc;
+
+ String taggingConf = parser.get("taggingConf");
+
+ if (isTest) {
+ cc = CommunityConfigurationFactory.newInstance(taggingConf);
+ } else {
+ cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookUpUrl"));
+ }
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
+ });
+ }
+
+ private static void execBulkTag(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ ProtoMap protoMappingParams,
+ Class resultClazz,
+ CommunityConfiguration communityConfiguration) {
+
+ ResultTagger resultTagger = new ResultTagger();
+ readPath(spark, inputPath, resultClazz)
+ .map(
+ (MapFunction) value -> resultTagger
+ .enrichContextCriteria(
+ value, communityConfiguration, protoMappingParams),
+ Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ public static Dataset readPath(
+ SparkSession spark, String inputPath, Class clazz) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+ }
+
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java
new file mode 100644
index 000000000..a73ff4d3e
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.gson.Gson;
+
+/** Created by miriam on 01/08/2018. */
+public class Community implements Serializable {
+
+ private static final Log log = LogFactory.getLog(Community.class);
+
+ private String id;
+ private List subjects = new ArrayList<>();
+ private List datasources = new ArrayList<>();
+ private List zenodoCommunities = new ArrayList<>();
+
+ public String toJson() {
+ final Gson g = new Gson();
+ return g.toJson(this);
+ }
+
+ public boolean isValid() {
+ return !getSubjects().isEmpty()
+ || !getDatasources().isEmpty()
+ || !getZenodoCommunities().isEmpty();
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public List getSubjects() {
+ return subjects;
+ }
+
+ public void setSubjects(List subjects) {
+ this.subjects = subjects;
+ }
+
+ public List getDatasources() {
+ return datasources;
+ }
+
+ public void setDatasources(List datasources) {
+ this.datasources = datasources;
+ }
+
+ public List getZenodoCommunities() {
+ return zenodoCommunities;
+ }
+
+ public void setZenodoCommunities(List zenodoCommunities) {
+ this.zenodoCommunities = zenodoCommunities;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java
new file mode 100644
index 000000000..c5bbb66eb
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java
@@ -0,0 +1,196 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
+import eu.dnetlib.dhp.selectioncriteria.Selection;
+
+/** Created by miriam on 02/08/2018. */
+public class CommunityConfiguration implements Serializable {
+
+ private static final Log log = LogFactory.getLog(CommunityConfiguration.class);
+
+ private Map communities;
+
+ // map subject -> communityid
+ private Map>> subjectMap = new HashMap<>();
+ // map datasourceid -> communityid
+ private Map>> datasourceMap = new HashMap<>();
+ // map zenodocommunityid -> communityid
+ private Map>> zenodocommunityMap = new HashMap<>();
+
+ public Map>> getSubjectMap() {
+ return subjectMap;
+ }
+
+ public void setSubjectMap(Map>> subjectMap) {
+ this.subjectMap = subjectMap;
+ }
+
+ public Map>> getDatasourceMap() {
+ return datasourceMap;
+ }
+
+ public void setDatasourceMap(
+ Map>> datasourceMap) {
+ this.datasourceMap = datasourceMap;
+ }
+
+ public Map>> getZenodocommunityMap() {
+ return zenodocommunityMap;
+ }
+
+ public void setZenodocommunityMap(
+ Map>> zenodocommunityMap) {
+ this.zenodocommunityMap = zenodocommunityMap;
+ }
+
+ CommunityConfiguration(final Map communities) {
+ this.communities = communities;
+ init();
+ }
+
+ void init() {
+
+ if (subjectMap == null) {
+ subjectMap = Maps.newHashMap();
+ }
+ if (datasourceMap == null) {
+ datasourceMap = Maps.newHashMap();
+ }
+ if (zenodocommunityMap == null) {
+ zenodocommunityMap = Maps.newHashMap();
+ }
+
+ for (Community c : getCommunities().values()) {
+ // get subjects
+ final String id = c.getId();
+ for (String sbj : c.getSubjects()) {
+ Pair p = new Pair<>(id, new SelectionConstraints());
+ add(sbj.toLowerCase().trim(), p, subjectMap);
+ }
+ // get datasources
+ for (Datasource d : c.getDatasources()) {
+
+ add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
+ }
+ // get zenodo communities
+ for (ZenodoCommunity zc : c.getZenodoCommunities()) {
+ add(
+ zc.getZenodoCommunityId(),
+ new Pair<>(id, zc.getSelCriteria()),
+ zenodocommunityMap);
+ }
+ }
+ }
+
+ private void add(
+ String key,
+ Pair value,
+ Map>> map) {
+ List> values = map.get(key);
+
+ if (values == null) {
+ values = new ArrayList<>();
+ map.put(key, values);
+ }
+ values.add(value);
+ }
+
+ public List> getCommunityForSubject(String sbj) {
+ return subjectMap.get(sbj);
+ }
+
+ public List> getCommunityForDatasource(String dts) {
+ return datasourceMap.get(dts);
+ }
+
+ public List getCommunityForDatasource(
+ final String dts, final Map> param) {
+ List> lp = datasourceMap.get(dts);
+ if (lp == null)
+ return Lists.newArrayList();
+
+ return lp
+ .stream()
+ .map(
+ p -> {
+ if (p.getSnd() == null)
+ return p.getFst();
+ if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
+ return p.getFst();
+ else
+ return null;
+ })
+ .filter(st -> (st != null))
+ .collect(Collectors.toList());
+ }
+
+ public List> getCommunityForZenodoCommunity(String zc) {
+ return zenodocommunityMap.get(zc);
+ }
+
+ public List getCommunityForSubjectValue(String value) {
+
+ return getContextIds(subjectMap.get(value));
+ }
+
+ public List getCommunityForDatasourceValue(String value) {
+
+ return getContextIds(datasourceMap.get(value.toLowerCase()));
+ }
+
+ public List getCommunityForZenodoCommunityValue(String value) {
+
+ return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
+ }
+
+ private List getContextIds(List> list) {
+ if (list != null) {
+ return list.stream().map(p -> p.getFst()).collect(Collectors.toList());
+ }
+ return Lists.newArrayList();
+ }
+
+ public Map getCommunities() {
+ return communities;
+ }
+
+ public void setCommunities(Map communities) {
+ this.communities = communities;
+ }
+
+ public String toJson() {
+ GsonBuilder builder = new GsonBuilder();
+ builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
+ Gson gson = builder.create();
+
+ return gson.toJson(this);
+ }
+
+ public int size() {
+ return communities.keySet().size();
+ }
+
+ public Community getCommunityById(String id) {
+ return communities.get(id);
+ }
+
+ public List getCommunityList() {
+ return Lists.newLinkedList(communities.values());
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java
new file mode 100644
index 000000000..508f0663d
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java
@@ -0,0 +1,138 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
+import eu.dnetlib.dhp.selectioncriteria.Selection;
+import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory;
+
+/** Created by miriam on 03/08/2018. */
+public class CommunityConfigurationFactory {
+
+ private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);
+
+ private static VerbResolver resolver = VerbResolverFactory.newInstance();
+
+ public static CommunityConfiguration newInstance(final String xml) throws DocumentException {
+
+ log.debug(String.format("parsing community configuration from:\n%s", xml));
+
+ final Document doc = new SAXReader().read(new StringReader(xml));
+
+ final Map communities = Maps.newHashMap();
+
+ for (final Object o : doc.selectNodes("//community")) {
+
+ final Node node = (Node) o;
+
+ final Community community = parseCommunity(node);
+
+ if (community.isValid()) {
+ communities.put(community.getId(), community);
+ }
+ }
+
+ log.info(String.format("loaded %s community configuration profiles", communities.size()));
+ log.debug(String.format("loaded community configuration:\n%s", communities.toString()));
+
+ return new CommunityConfiguration(communities);
+ }
+
+ public static CommunityConfiguration fromJson(final String json) {
+ GsonBuilder builder = new GsonBuilder();
+ builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
+ Gson gson = builder.create();
+ final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class);
+ log.info(String.format("loaded %s community configuration profiles", conf.size()));
+ conf.init();
+ log.info("created inverse maps");
+
+ return conf;
+ }
+
+ private static Community parseCommunity(final Node node) {
+
+ final Community c = new Community();
+
+ c.setId(node.valueOf("./@id"));
+
+ log.info(String.format("community id: %s", c.getId()));
+
+ c.setSubjects(parseSubjects(node));
+ c.setDatasources(parseDatasources(node));
+ c.setZenodoCommunities(parseZenodoCommunities(node));
+ return c;
+ }
+
+ private static List parseSubjects(final Node node) {
+
+ final List subjects = Lists.newArrayList();
+
+ final List list = node.selectNodes("./subjects/subject");
+
+ for (Node n : list) {
+ log.debug("text of the node " + n.getText());
+ subjects.add(StringUtils.trim(n.getText()));
+ }
+ log.info("size of the subject list " + subjects.size());
+ return subjects;
+ }
+
+ private static List parseDatasources(final Node node) {
+ final List list = node.selectNodes("./datasources/datasource");
+ final List datasourceList = new ArrayList<>();
+ for (Node n : list) {
+ Datasource d = new Datasource();
+ d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
+ d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
+ datasourceList.add(d);
+ }
+ log.info("size of the datasource list " + datasourceList.size());
+ return datasourceList;
+ }
+
+ private static List parseZenodoCommunities(final Node node) {
+ final Node oacommunitynode = node.selectSingleNode("./oacommunity");
+ String oacommunity = null;
+ if (oacommunitynode != null) {
+ String tmp = oacommunitynode.getText();
+ if (StringUtils.isNotBlank(tmp))
+ oacommunity = tmp;
+ }
+
+ final List list = node.selectNodes("./zenodocommunities/zenodocommunity");
+ final List zenodoCommunityList = new ArrayList<>();
+ for (Node n : list) {
+ ZenodoCommunity zc = new ZenodoCommunity();
+ zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText());
+ zc.setSelCriteria(n.selectSingleNode("./selcriteria"));
+
+ zenodoCommunityList.add(zc);
+ }
+ if (oacommunity != null) {
+ ZenodoCommunity zc = new ZenodoCommunity();
+ zc.setZenodoCommunityId(oacommunity);
+ zenodoCommunityList.add(zc);
+ }
+ log.info("size of the zenodo community list " + zenodoCommunityList.size());
+ return zenodoCommunityList;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java
new file mode 100644
index 000000000..54f381d4a
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+
+import eu.dnetlib.dhp.selectioncriteria.Selection;
+import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+
+public class Constraint implements Serializable {
+ private String verb;
+ private String field;
+ private String value;
+ private Selection selection;
+
+ public Constraint() {
+ }
+
+ public String getVerb() {
+ return verb;
+ }
+
+ public void setVerb(String verb) {
+ this.verb = verb;
+ }
+
+ public String getField() {
+ return field;
+ }
+
+ public void setField(String field) {
+ this.field = field;
+ }
+
+ public String getValue() {
+ return value;
+ }
+
+ public void setValue(String value) {
+ this.value = value;
+ }
+
+ public void setSelection(Selection sel) {
+ selection = sel;
+ }
+
+ public void setSelection(VerbResolver resolver)
+ throws InvocationTargetException, NoSuchMethodException, InstantiationException,
+ IllegalAccessException {
+ selection = resolver.getSelectionCriteria(verb, value);
+ }
+
+ public boolean verifyCriteria(String metadata) {
+ return selection.apply(metadata);
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java
new file mode 100644
index 000000000..af095c513
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java
@@ -0,0 +1,74 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Type;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+
+import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+
+/** Created by miriam on 02/08/2018. */
+public class Constraints implements Serializable {
+ private static final Log log = LogFactory.getLog(Constraints.class);
+ // private ConstraintEncapsulator ce;
+ private List constraint;
+
+ public Constraints() {
+ }
+
+ public List getConstraint() {
+ return constraint;
+ }
+
+ public void setConstraint(List constraint) {
+ this.constraint = constraint;
+ }
+
+ public void setSc(String json) {
+ Type collectionType = new TypeToken>() {
+ }.getType();
+ constraint = new Gson().fromJson(json, collectionType);
+ }
+
+ void setSelection(VerbResolver resolver) {
+ for (Constraint st : constraint) {
+
+ try {
+ st.setSelection(resolver);
+ } catch (NoSuchMethodException e) {
+ log.error(e.getMessage());
+ } catch (IllegalAccessException e) {
+ log.error(e.getMessage());
+ } catch (InvocationTargetException e) {
+ log.error(e.getMessage());
+ } catch (InstantiationException e) {
+ log.error(e.getMessage());
+ }
+ }
+ }
+
+ // Constraint in and
+ public boolean verifyCriteria(final Map> param) {
+
+ for (Constraint sc : constraint) {
+ boolean verified = false;
+ for (String value : param.get(sc.getField())) {
+ if (sc.verifyCriteria(value.trim())) {
+ verified = true;
+ }
+ }
+ if (!verified)
+ return verified;
+ }
+ return true;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java
new file mode 100644
index 000000000..a3d343087
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java
@@ -0,0 +1,61 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Node;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+
+/** Created by miriam on 01/08/2018. */
+public class Datasource implements Serializable {
+ private static final Log log = LogFactory.getLog(Datasource.class);
+
+ private String openaireId;
+
+ private SelectionConstraints selectionConstraints;
+
+ public SelectionConstraints getSelCriteria() {
+ return selectionConstraints;
+ }
+
+ public SelectionConstraints getSelectionConstraints() {
+ return selectionConstraints;
+ }
+
+ public void setSelectionConstraints(SelectionConstraints selectionConstraints) {
+ this.selectionConstraints = selectionConstraints;
+ }
+
+ public void setSelCriteria(SelectionConstraints selCriteria) {
+ this.selectionConstraints = selCriteria;
+ }
+
+ public String getOpenaireId() {
+ return openaireId;
+ }
+
+ public void setOpenaireId(String openaireId) {
+ this.openaireId = openaireId;
+ }
+
+ private void setSelCriteria(String json, VerbResolver resolver) {
+ log.info("Selection constraints for datasource = " + json);
+ selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class);
+
+ selectionConstraints.setSelection(resolver);
+ }
+
+ public void setSelCriteria(Node n, VerbResolver resolver) {
+ try {
+ setSelCriteria(n.getText(), resolver);
+ } catch (Exception e) {
+ log.info("not set selection criteria... ");
+ selectionConstraints = null;
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java
new file mode 100644
index 000000000..01cd3ce22
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java
@@ -0,0 +1,39 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+
+import com.google.gson.Gson;
+
+/** Created by miriam on 03/08/2018. */
+public class Pair implements Serializable {
+ private A fst;
+ private B snd;
+
+ public A getFst() {
+ return fst;
+ }
+
+ public Pair setFst(A fst) {
+ this.fst = fst;
+ return this;
+ }
+
+ public B getSnd() {
+ return snd;
+ }
+
+ public Pair setSnd(B snd) {
+ this.snd = snd;
+ return this;
+ }
+
+ public Pair(A a, B b) {
+ fst = a;
+ snd = b;
+ }
+
+ public String toJson() {
+ return new Gson().toJson(this);
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java
new file mode 100644
index 000000000..d48dce2c6
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java
@@ -0,0 +1,12 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+import java.util.HashMap;
+
+public class ProtoMap extends HashMap implements Serializable {
+
+ public ProtoMap() {
+ super();
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java
new file mode 100644
index 000000000..2c18392c7
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.util.List;
+
+import org.dom4j.DocumentException;
+
+import com.google.common.base.Joiner;
+
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+public class QueryInformationSystem {
+ private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+ + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
+ + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
+ + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
+ + " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
+ + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] "
+ + " return "
+ + " "
+ + " { $x//CONFIGURATION/context/@id} "
+ + " "
+ + " {for $y in tokenize($subj,',') "
+ + " return "
+ + " {$y}} "
+ + " "
+ + " "
+ + " {for $d in $datasources "
+ + " where $d/param[./@name='enabled']/text()='true' "
+ + " return "
+ + " "
+ + " "
+ + " {$d//param[./@name='openaireId']/text()} "
+ + " "
+ + " "
+ + " {$d/param[./@name='selcriteria']/text()} "
+ + " "
+ + " } "
+ + " "
+ + " "
+ + " {for $zc in $communities "
+ + " return "
+ + " "
+ + " "
+ + " {$zc/param[./@name='zenodoid']/text()} "
+ + " "
+ + " "
+ + " {$zc/param[./@name='selcriteria']/text()} "
+ + " "
+ + " } "
+ + " "
+ + " ";
+
+ public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
+ throws ISLookUpException, DocumentException {
+ ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+ final List res = isLookUp.quickSearchProfile(XQUERY);
+
+ final String xmlConf = "" + Joiner.on(" ").join(res) + "";
+
+ return CommunityConfigurationFactory.newInstance(xmlConf);
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java
new file mode 100644
index 000000000..eb531c6b1
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java
@@ -0,0 +1,246 @@
+
+package eu.dnetlib.dhp.community;
+
+import static eu.dnetlib.dhp.community.TagginConstants.*;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.gson.Gson;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+
+/** Created by miriam on 02/08/2018. */
+public class ResultTagger implements Serializable {
+
+ private String trust = "0.8";
+
+ private boolean clearContext(Result result) {
+ int tmp = result.getContext().size();
+ List clist = result
+ .getContext()
+ .stream()
+ .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
+ .collect(Collectors.toList());
+ result.setContext(clist);
+ return (tmp != clist.size());
+ }
+
+ private Map> getParamMap(final Result result, Map params) {
+ Map> param = new HashMap<>();
+ String json = new Gson().toJson(result, Result.class);
+ DocumentContext jsonContext = JsonPath.parse(json);
+ if (params == null) {
+ params = new HashMap<>();
+ }
+ for (String key : params.keySet()) {
+ try {
+ param.put(key, jsonContext.read(params.get(key)));
+ } catch (com.jayway.jsonpath.PathNotFoundException e) {
+ param.put(key, new ArrayList<>());
+ // throw e;
+ }
+ }
+ return param;
+ }
+
+ public R enrichContextCriteria(
+ final R result, final CommunityConfiguration conf, final Map criteria) {
+
+ // }
+ // public Result enrichContextCriteria(final Result result, final CommunityConfiguration
+ // conf, final Map criteria) {
+ final Map> param = getParamMap(result, criteria);
+
+ // Verify if the entity is deletedbyinference. In case verify if to clean the context list
+ // from all the zenodo communities
+ if (result.getDataInfo().getDeletedbyinference()) {
+ clearContext(result);
+ return result;
+ }
+
+ // communities contains all the communities to be added as context for the result
+ final Set communities = new HashSet<>();
+
+ // tagging for Subject
+ final Set subjects = new HashSet<>();
+ Optional> oresultsubj = Optional.ofNullable(result.getSubject());
+ if (oresultsubj.isPresent()) {
+ oresultsubj
+ .get()
+ .stream()
+ .map(subject -> subject.getValue())
+ .filter(StringUtils::isNotBlank)
+ .map(String::toLowerCase)
+ .map(String::trim)
+ .collect(Collectors.toCollection(HashSet::new))
+ .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
+ }
+
+ communities.addAll(subjects);
+
+ // Tagging for datasource
+ final Set datasources = new HashSet<>();
+ final Set tmp = new HashSet<>();
+
+ Optional> oresultinstance = Optional.ofNullable(result.getInstance());
+ if (oresultinstance.isPresent()) {
+ for (Instance i : oresultinstance.get()) {
+ tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
+ tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
+ }
+
+ oresultinstance
+ .get()
+ .stream()
+ .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
+ .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
+ .map(s -> StringUtils.substringAfter(s, "|"))
+ .collect(Collectors.toCollection(HashSet::new))
+ .forEach(
+ dsId -> datasources
+ .addAll(
+ conf.getCommunityForDatasource(dsId, param)));
+ }
+
+ communities.addAll(datasources);
+
+ /* Tagging for Zenodo Communities */
+ final Set czenodo = new HashSet<>();
+
+ Optional> oresultcontext = Optional.ofNullable(result.getContext());
+ if (oresultcontext.isPresent()) {
+ oresultcontext
+ .get()
+ .stream()
+ .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
+ .collect(Collectors.toList())
+ .forEach(
+ c -> czenodo
+ .addAll(
+ conf
+ .getCommunityForZenodoCommunityValue(
+ c
+ .getId()
+ .substring(
+ c.getId().lastIndexOf("/") + 1)
+ .trim())));
+ }
+
+ communities.addAll(czenodo);
+
+ clearContext(result);
+
+ /* Verify if there is something to bulktag */
+ if (communities.isEmpty()) {
+ return result;
+ }
+
+ result
+ .getContext()
+ .stream()
+ .map(
+ c -> {
+ if (communities.contains(c.getId())) {
+ Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo());
+ List dataInfoList;
+ if (opt_dataInfoList.isPresent())
+ dataInfoList = opt_dataInfoList.get();
+ else {
+ dataInfoList = new ArrayList<>();
+ c.setDataInfo(dataInfoList);
+ }
+ if (subjects.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_SUBJECT,
+ CLASS_NAME_BULKTAG_SUBJECT));
+ if (datasources.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_DATASOURCE,
+ CLASS_NAME_BULKTAG_DATASOURCE));
+ if (czenodo.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_CZENODO,
+ CLASS_NAME_BULKTAG_ZENODO));
+ }
+ return c;
+ })
+ .collect(Collectors.toList());
+
+ communities
+ .removeAll(
+ result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));
+
+ if (communities.isEmpty())
+ return result;
+
+ List toaddcontext = communities
+ .stream()
+ .map(
+ c -> {
+ Context context = new Context();
+ context.setId(c);
+ List dataInfoList = new ArrayList<>();
+ if (subjects.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_SUBJECT,
+ CLASS_NAME_BULKTAG_SUBJECT));
+ if (datasources.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_DATASOURCE,
+ CLASS_NAME_BULKTAG_DATASOURCE));
+ if (czenodo.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_CZENODO,
+ CLASS_NAME_BULKTAG_ZENODO));
+ context.setDataInfo(dataInfoList);
+ return context;
+ })
+ .collect(Collectors.toList());
+
+ result.getContext().addAll(toaddcontext);
+ return result;
+ }
+
+ public static DataInfo getDataInfo(
+ String inference_provenance, String inference_class_id, String inference_class_name) {
+ DataInfo di = new DataInfo();
+ di.setInferred(true);
+ di.setInferenceprovenance(inference_provenance);
+ di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
+ return di;
+ }
+
+ public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
+ Qualifier pa = new Qualifier();
+ pa.setClassid(inference_class_id);
+ pa.setClassname(inference_class_name);
+ pa.setSchemeid(DNET_SCHEMA_ID);
+ pa.setSchemename(DNET_SCHEMA_NAME);
+ return pa;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java
new file mode 100644
index 000000000..802e2f5d6
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java
@@ -0,0 +1,51 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+import java.lang.reflect.Type;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+
+import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+
+public class SelectionConstraints implements Serializable {
+ private List criteria;
+
+ public SelectionConstraints() {
+ }
+
+ public List getCriteria() {
+ return criteria;
+ }
+
+ public void setCriteria(List criteria) {
+ this.criteria = criteria;
+ }
+
+ public void setSc(String json) {
+ Type collectionType = new TypeToken>() {
+ }.getType();
+ criteria = new Gson().fromJson(json, collectionType);
+ }
+
+ // Constraints in or
+ public boolean verifyCriteria(final Map> param) {
+ for (Constraints selc : criteria) {
+ if (selc.verifyCriteria(param)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public void setSelection(VerbResolver resolver) {
+
+ for (Constraints cs : criteria) {
+ cs.setSelection(resolver);
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java
new file mode 100644
index 000000000..92d37d089
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java
@@ -0,0 +1,23 @@
+
+package eu.dnetlib.dhp.community;
+
+public class TagginConstants {
+
+ public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";
+
+ public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
+ public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
+
+ public static final String CLASS_ID_SUBJECT = "community:subject";
+ public static final String CLASS_ID_DATASOURCE = "community:datasource";
+ public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
+
+ public static final String SCHEMA_ID = "dnet:provenanceActions";
+ public static final String COUNTER_GROUP = "Bulk Tagging";
+
+ public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
+
+ public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
+ public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
+ public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java
new file mode 100644
index 000000000..e1492f6a5
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java
@@ -0,0 +1,45 @@
+
+package eu.dnetlib.dhp.community;
+
+import java.io.Serializable;
+
+import org.dom4j.Node;
+
+import com.google.gson.Gson;
+
+/** Created by miriam on 01/08/2018. */
+public class ZenodoCommunity implements Serializable {
+
+ private String zenodoCommunityId;
+
+ private SelectionConstraints selCriteria;
+
+ public String getZenodoCommunityId() {
+ return zenodoCommunityId;
+ }
+
+ public void setZenodoCommunityId(String zenodoCommunityId) {
+ this.zenodoCommunityId = zenodoCommunityId;
+ }
+
+ public SelectionConstraints getSelCriteria() {
+ return selCriteria;
+ }
+
+ public void setSelCriteria(SelectionConstraints selCriteria) {
+ this.selCriteria = selCriteria;
+ }
+
+ private void setSelCriteria(String json) {
+ // Type collectionType = new TypeToken>(){}.getType();
+ selCriteria = new Gson().fromJson(json, SelectionConstraints.class);
+ }
+
+ public void setSelCriteria(Node n) {
+ if (n == null) {
+ selCriteria = null;
+ } else {
+ setSelCriteria(n.getText());
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java
new file mode 100644
index 000000000..a6ef2d908
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("contains")
+public class ContainsVerb implements Selection, Serializable {
+
+ private String param;
+
+ public ContainsVerb() {
+ }
+
+ public ContainsVerb(final String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return value.contains(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java
new file mode 100644
index 000000000..b8b0262e9
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("contains_ignorecase")
+public class ContainsVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public ContainsVerbIgnoreCase() {
+ }
+
+ public ContainsVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return value.toLowerCase().contains(param.toLowerCase());
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java
new file mode 100644
index 000000000..3f17a6bb3
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("equals")
+public class EqualVerb implements Selection, Serializable {
+
+ private String param;
+
+ public EqualVerb() {
+ }
+
+ public EqualVerb(final String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return value.equals(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java
new file mode 100644
index 000000000..934406859
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("equals_ignorecase")
+public class EqualVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public EqualVerbIgnoreCase() {
+ }
+
+ public EqualVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return value.equalsIgnoreCase(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java
new file mode 100644
index 000000000..9ef3bd60c
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.lang.reflect.Type;
+
+import com.google.gson.*;
+
+public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {
+
+ private static final String CLASSNAME = "CLASSNAME";
+ private static final String DATA = "DATA";
+
+ public Object deserialize(
+ JsonElement jsonElement,
+ Type type,
+ JsonDeserializationContext jsonDeserializationContext)
+ throws JsonParseException {
+
+ JsonObject jsonObject = jsonElement.getAsJsonObject();
+ JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME);
+ String className = prim.getAsString();
+ Class klass = getObjectClass(className);
+ return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass);
+ }
+
+ public JsonElement serialize(
+ Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) {
+ JsonObject jsonObject = new JsonObject();
+ jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName());
+ jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement));
+ return jsonObject;
+ }
+
+ /** **** Helper method to get the className of the object to be deserialized **** */
+ public Class getObjectClass(String className) {
+ try {
+ return Class.forName(className);
+ } catch (ClassNotFoundException e) {
+ // e.printStackTrace();
+ throw new JsonParseException(e.getMessage());
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java
new file mode 100644
index 000000000..eb83b256e
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_contains")
+public class NotContainsVerb implements Selection, Serializable {
+
+ private String param;
+
+ public NotContainsVerb() {
+ }
+
+ public NotContainsVerb(final String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return !value.contains(param);
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java
new file mode 100644
index 000000000..fab3efef3
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_contains_ignorecase")
+public class NotContainsVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public NotContainsVerbIgnoreCase() {
+ }
+
+ public NotContainsVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return !(value.toLowerCase().contains(param.toLowerCase()));
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java
new file mode 100644
index 000000000..2311c2987
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_equals")
+public class NotEqualVerb implements Selection, Serializable {
+
+ private String param;
+
+ public NotEqualVerb(final String param) {
+ this.param = param;
+ }
+
+ public NotEqualVerb() {
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return !value.equals(param);
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java
new file mode 100644
index 000000000..de2f682a5
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_equals_ignorecase")
+public class NotEqualVerbIgnoreCase implements Selection, Serializable {
+
+ private String param;
+
+ public NotEqualVerbIgnoreCase(final String param) {
+ this.param = param;
+ }
+
+ public NotEqualVerbIgnoreCase() {
+ }
+
+ public String getParam() {
+ return param;
+ }
+
+ public void setParam(String param) {
+ this.param = param;
+ }
+
+ @Override
+ public boolean apply(String value) {
+ return !value.equalsIgnoreCase(param);
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java
new file mode 100644
index 000000000..b488bda01
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java
@@ -0,0 +1,7 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+public interface Selection {
+
+ boolean apply(String value);
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java
new file mode 100644
index 000000000..d467f934f
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java
@@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+@interface VerbClass {
+
+ String value();
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java
new file mode 100644
index 000000000..6a8ceebc3
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import io.github.classgraph.ClassGraph;
+import io.github.classgraph.ClassInfo;
+import io.github.classgraph.ClassInfoList;
+import io.github.classgraph.ScanResult;
+
+public class VerbResolver implements Serializable {
+ private Map> map = null; // = new HashMap<>();
+ private final ClassGraph classgraph = new ClassGraph();
+
+ public VerbResolver() {
+
+ try (ScanResult scanResult = // Assign scanResult in try-with-resources
+ classgraph // Create a new ClassGraph instance
+ .verbose() // If you want to enable logging to stderr
+ .enableAllInfo() // Scan classes, methods, fields, annotations
+ .whitelistPackages(
+ "eu.dnetlib.dhp.selectioncriteria") // Scan com.xyz and subpackages
+ .scan()) { // Perform the scan and return a ScanResult
+
+ ClassInfoList routeClassInfoList = scanResult
+ .getClassesWithAnnotation(
+ "eu.dnetlib.dhp.selectioncriteria.VerbClass");
+
+ this.map = routeClassInfoList
+ .stream()
+ .collect(
+ Collectors
+ .toMap(
+ value -> (String) ((ClassInfo) value)
+ .getAnnotationInfo()
+ .get(0)
+ .getParameterValues()
+ .get(0)
+ .getValue(),
+ value -> (Class) ((ClassInfo) value).loadClass()));
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ public Selection getSelectionCriteria(String name, String param)
+ throws NoSuchMethodException, IllegalAccessException, InvocationTargetException,
+ InstantiationException {
+
+ // return Class.forName(tmp_map.get(name)).
+ return map.get(name).getDeclaredConstructor((String.class)).newInstance(param);
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java
new file mode 100644
index 000000000..58bf60d42
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java
@@ -0,0 +1,10 @@
+
+package eu.dnetlib.dhp.selectioncriteria;
+
+public class VerbResolverFactory {
+
+ public static VerbResolver newInstance() {
+
+ return new VerbResolver();
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json
new file mode 100644
index 000000000..a37d7d168
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json
@@ -0,0 +1,51 @@
+[
+ {
+ "paramName":"is",
+ "paramLongName":"isLookUpUrl",
+ "paramDescription": "URL of the isLookUp Service",
+ "paramRequired": true
+ },
+ {
+ "paramName":"s",
+ "paramLongName":"sourcePath",
+ "paramDescription": "the path of the sequencial file to read",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pm",
+ "paramLongName":"pathMap",
+ "paramDescription": "the json path associated to each selection field",
+ "paramRequired": true
+ },
+ {
+ "paramName":"tn",
+ "paramLongName":"resultTableName",
+ "paramDescription": "the name of the result table we are currently working on",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path used to store temporary output files",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ssm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "test",
+ "paramLongName": "isTest",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ },
+ {
+ "paramName": "tg",
+ "paramLongName": "taggingConf",
+ "paramDescription": "true if the spark session is managed, false otherwise",
+ "paramRequired": false
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml b/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml
new file mode 100644
index 000000000..fe82ae194
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml
@@ -0,0 +1,54 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml
new file mode 100644
index 000000000..524281bc9
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml
@@ -0,0 +1,216 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ isLookUpUrl
+ the isLookup service endpoint
+
+
+ pathMap
+ the json path associated to each selection field
+
+
+ outputPath
+ the output path
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/relation
+ ${nameNode}/${outputPath}/relation
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-publication
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-bulktag-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/publication
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication
+ --outputPath${outputPath}/publication
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-dataset
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-bulktag-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/dataset
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset
+ --outputPath${outputPath}/dataset
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-orp
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-bulktag-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/otherresearchproduct
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
+ --outputPath${outputPath}/otherresearchproduct
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ bulkTagging-software
+ eu.dnetlib.dhp.bulktag.SparkBulkTagJob
+ dhp-bulktag-${projectVersion}.jar
+
+ --num-executors=${sparkExecutorNumber}
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}/software
+ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software
+ --outputPath${outputPath}/software
+ --pathMap${pathMap}
+ --isLookUpUrl${isLookUpUrl}
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java
new file mode 100644
index 000000000..75ecb0298
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java
@@ -0,0 +1,853 @@
+
+package eu.dnetlib.dhp;
+
+import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.mortbay.util.IO;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Software;
+
+public class BulkTagJobTest {
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static final ClassLoader cl = eu.dnetlib.dhp.BulkTagJobTest.class.getClassLoader();
+
+ private static SparkSession spark;
+
+ private static Path workingDir;
+ private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.BulkTagJobTest.class);
+
+ private static String taggingConf = "";
+
+ static {
+ try {
+ taggingConf = IO
+ .toString(
+ BulkTagJobTest.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml"));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ @BeforeAll
+ public static void beforeAll() throws IOException {
+ workingDir = Files.createTempDirectory(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName());
+ log.info("using work dir {}", workingDir);
+
+ SparkConf conf = new SparkConf();
+ conf.setAppName(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName());
+
+ conf.setMaster("local[*]");
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("spark.ui.enabled", "false");
+ conf.set("spark.sql.warehouse.dir", workingDir.toString());
+ conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+ spark = SparkSession
+ .builder()
+ .appName(BulkTagJobTest.class.getSimpleName())
+ .config(conf)
+ .getOrCreate();
+ }
+
+ @AfterAll
+ public static void afterAll() throws IOException {
+ FileUtils.deleteDirectory(workingDir.toFile());
+ spark.stop();
+ }
+
+ @Test
+ public void noUpdatesTest() throws Exception {
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/sample/dataset/no_updates").getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Dataset",
+ "-outputPath",
+ workingDir.toString() + "/dataset",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ // "-preparedInfoPath",
+ // getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath()
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/dataset")
+ .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+ verificationDataset.createOrReplaceTempView("dataset");
+
+ String query = "select id, MyT.id community "
+ + "from dataset "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ Assertions.assertEquals(0, spark.sql(query).count());
+ }
+
+ @Test
+ public void bulktagBySubjectNoPreviousContextTest() throws Exception {
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass()
+ .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext")
+ .getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Dataset",
+ "-outputPath",
+ workingDir.toString() + "/dataset",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/dataset")
+ .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+ verificationDataset.createOrReplaceTempView("dataset");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from dataset "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ Assertions.assertEquals(5, spark.sql(query).count());
+
+ org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query);
+ Assertions
+ .assertEquals(
+ 5, idExplodeCommunity.filter("provenance = 'community:subject'").count());
+ Assertions
+ .assertEquals(
+ 5,
+ idExplodeCommunity.filter("name = 'Bulktagging for Community - Subject'").count());
+
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count());
+
+ Assertions
+ .assertEquals(
+ 1,
+ idExplodeCommunity
+ .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'")
+ .count());
+ Assertions
+ .assertEquals(
+ 1,
+ idExplodeCommunity
+ .filter(
+ "community = 'covid-19' and id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'")
+ .count());
+
+ Assertions
+ .assertEquals(
+ 2,
+ idExplodeCommunity
+ .filter("id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'")
+ .count());
+ Assertions
+ .assertEquals(
+ 2,
+ idExplodeCommunity
+ .filter(
+ "(community = 'covid-19' or community = 'aginfra') and id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'")
+ .count());
+
+ Assertions
+ .assertEquals(
+ 2,
+ idExplodeCommunity
+ .filter("id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'")
+ .count());
+ Assertions
+ .assertEquals(
+ 2,
+ idExplodeCommunity
+ .filter(
+ "(community = 'mes' or community = 'fam') and id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'")
+ .count());
+ }
+
+ @Test
+ public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception {
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass()
+ .getResource(
+ "/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance")
+ .getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Dataset",
+ "-outputPath",
+ workingDir.toString() + "/dataset",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/dataset")
+ .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+ verificationDataset.createOrReplaceTempView("dataset");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance "
+ + "from dataset "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyT.id = 'covid-19' ";
+
+ Assertions.assertEquals(3, spark.sql(query).count());
+
+ org.apache.spark.sql.Dataset communityContext = spark.sql(query);
+
+ Assertions
+ .assertEquals(
+ 2,
+ communityContext
+ .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'")
+ .count());
+ Assertions
+ .assertEquals(
+ 1,
+ communityContext
+ .filter(
+ "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'community:subject'")
+ .count());
+ Assertions
+ .assertEquals(
+ 1,
+ communityContext
+ .filter(
+ "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'propagation:community:productsthroughsemrel'")
+ .count());
+
+ query = "select id, MyT.id community, size(MyT.datainfo) datainfosize "
+ + "from dataset "
+ + "lateral view explode (context) as MyT "
+ + "where size(MyT.datainfo) > 0";
+
+ Assertions
+ .assertEquals(
+ 2,
+ spark
+ .sql(query)
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' a"
+ + "nd community = 'covid-19'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+ }
+
+ @Test
+ public void bulktagByDatasourceTest() throws Exception {
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass()
+ .getResource("/eu/dnetlib/dhp/sample/publication/update_datasource")
+ .getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Publication",
+ "-outputPath",
+ workingDir.toString() + "/publication",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/publication")
+ .map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Publication.class));
+
+ verificationDataset.createOrReplaceTempView("publication");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from publication "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query);
+
+ Assertions.assertEquals(5, idExplodeCommunity.count());
+ Assertions
+ .assertEquals(
+ 5, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
+ Assertions
+ .assertEquals(
+ 5,
+ idExplodeCommunity
+ .filter("name = 'Bulktagging for Community - Datasource'")
+ .count());
+
+ Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'fam'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count());
+
+ Assertions
+ .assertEquals(
+ 3,
+ idExplodeCommunity
+ .filter(
+ "community = 'fam' and (id = '50|ec_fp7health::000085c89f4b96dc2269bd37edb35306' "
+ + "or id = '50|ec_fp7health::000b9e61f83f5a4b0c35777b7bccdf38' "
+ + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')")
+ .count());
+
+ Assertions
+ .assertEquals(
+ 2,
+ idExplodeCommunity
+ .filter(
+ "community = 'aginfra' and (id = '50|ec_fp7health::000c8195edd542e4e64ebb32172cbf89' "
+ + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')")
+ .count());
+ }
+
+ @Test
+ public void bulktagByZenodoCommunityTest() throws Exception {
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass()
+ .getResource(
+ "/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity")
+ .getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
+ "-outputPath",
+ workingDir.toString() + "/orp",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/orp")
+ .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(OtherResearchProduct.class));
+
+ verificationDataset.createOrReplaceTempView("orp");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from orp "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query);
+ Assertions.assertEquals(8, idExplodeCommunity.count());
+
+ Assertions
+ .assertEquals(
+ 8, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count());
+ Assertions
+ .assertEquals(
+ 8,
+ idExplodeCommunity.filter("name = 'Bulktagging for Community - Zenodo'").count());
+
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'covid-19'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'beopen'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'mes'").count());
+
+ Assertions
+ .assertEquals(
+ 1,
+ idExplodeCommunity
+ .filter(
+ "id = '50|od______2017::0750a4d0782265873d669520f5e33c07' "
+ + "and community = 'covid-19'")
+ .count());
+ Assertions
+ .assertEquals(
+ 3,
+ idExplodeCommunity
+ .filter(
+ "id = '50|od______2017::1bd97baef19dbd2db3203b112bb83bc5' and "
+ + "(community = 'aginfra' or community = 'mes' or community = 'fam')")
+ .count());
+ Assertions
+ .assertEquals(
+ 1,
+ idExplodeCommunity
+ .filter(
+ "id = '50|od______2017::1e400f1747487fd15998735c41a55c72' "
+ + "and community = 'beopen'")
+ .count());
+ Assertions
+ .assertEquals(
+ 3,
+ idExplodeCommunity
+ .filter(
+ "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' and "
+ + "(community = 'beopen' or community = 'fam' or community = 'mes')")
+ .count());
+
+ query = "select id, MyT.id community, size(MyT.datainfo) datainfosize "
+ + "from orp "
+ + "lateral view explode (context) as MyT "
+ + "where size(MyT.datainfo) > 0";
+
+ Assertions
+ .assertEquals(
+ 2,
+ spark
+ .sql(query)
+ .select("datainfosize")
+ .where(
+ "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' a"
+ + "nd community = 'beopen'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+
+ // verify the zenodo community context is not present anymore in the records
+ query = "select id, MyT.id community "
+ + "from orp "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD ";
+
+ org.apache.spark.sql.Dataset tmp2 = spark.sql(query);
+
+ Assertions
+ .assertEquals(
+ 0,
+ tmp2
+ .select("community")
+ .where(tmp2.col("community").contains(ZENODO_COMMUNITY_INDICATOR))
+ .count());
+ }
+
+ @Test
+ public void bulktagBySubjectDatasourceTest() throws Exception {
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass()
+ .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject_datasource")
+ .getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Dataset",
+ "-outputPath",
+ workingDir.toString() + "/dataset",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/dataset")
+ .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+ verificationDataset.createOrReplaceTempView("dataset");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from dataset "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query);
+ Assertions.assertEquals(7, idExplodeCommunity.count());
+
+ Assertions
+ .assertEquals(
+ 5, idExplodeCommunity.filter("provenance = 'community:subject'").count());
+ Assertions
+ .assertEquals(
+ 2, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count());
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count());
+
+ query = "select id, MyT.id community, size(MyT.datainfo) datainfosize "
+ + "from dataset "
+ + "lateral view explode (context) as MyT "
+ + "where size(MyT.datainfo) > 0";
+
+ org.apache.spark.sql.Dataset tmp2 = spark.sql(query);
+
+ Assertions
+ .assertEquals(
+ 2,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and "
+ + "community = 'aginfra'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+
+ Assertions
+ .assertEquals(
+ 1,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and "
+ + "community = 'covid-19'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+
+ Assertions
+ .assertEquals(
+ 2,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and "
+ + "community = 'fam'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+ Assertions
+ .assertEquals(
+ 2,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and "
+ + "community = 'covid-19'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+
+ Assertions
+ .assertEquals(
+ 1,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and "
+ + "community = 'fam'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+ Assertions
+ .assertEquals(
+ 1,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and "
+ + "community = 'mes'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+ }
+
+ @Test
+ public void bulktagBySubjectDatasourceZenodoCommunityTest() throws Exception {
+
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass().getResource("/eu/dnetlib/dhp/sample/software/").getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Software",
+ "-outputPath",
+ workingDir.toString() + "/software",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/software")
+ .map(item -> OBJECT_MAPPER.readValue(item, Software.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Software.class));
+
+ verificationDataset.createOrReplaceTempView("software");
+
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from software "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query);
+ Assertions.assertEquals(10, idExplodeCommunity.count());
+
+ idExplodeCommunity.show(false);
+ Assertions
+ .assertEquals(
+ 3, idExplodeCommunity.filter("provenance = 'community:subject'").count());
+ Assertions
+ .assertEquals(
+ 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
+ Assertions
+ .assertEquals(
+ 4, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count());
+
+ Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'covid-19'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dh-ch'").count());
+ Assertions.assertEquals(4, idExplodeCommunity.filter("community = 'aginfra'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dariah'").count());
+ Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count());
+
+ Assertions
+ .assertEquals(
+ 2,
+ idExplodeCommunity
+ .filter(
+ "provenance = 'community:zenodocommunity' and "
+ + "id = '50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4' and ("
+ + "community = 'dh-ch' or community = 'dariah')")
+ .count());
+
+ query = "select id, MyT.id community, size(MyT.datainfo) datainfosize "
+ + "from software "
+ + "lateral view explode (context) as MyT "
+ + "where size(MyT.datainfo) > 0";
+
+ org.apache.spark.sql.Dataset tmp2 = spark.sql(query);
+
+ Assertions
+ .assertEquals(
+ 2,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______1582::501b25d420f808c8eddcd9b16e917f11' and "
+ + "community = 'covid-19'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+
+ Assertions
+ .assertEquals(
+ 3,
+ tmp2
+ .select("datainfosize")
+ .where(
+ "id = '50|od______1582::581621232a561b7e8b4952b18b8b0e56' and "
+ + "community = 'aginfra'")
+ .collectAsList()
+ .get(0)
+ .getInt(0));
+ }
+
+ @Test
+ public void bulktagDatasourcewithConstraintsTest() throws Exception {
+
+ SparkBulkTagJob
+ .main(
+ new String[] {
+ "-isTest",
+ Boolean.TRUE.toString(),
+ "-isSparkSessionManaged",
+ Boolean.FALSE.toString(),
+ "-sourcePath",
+ getClass()
+ .getResource(
+ "/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints")
+ .getPath(),
+ "-taggingConf",
+ taggingConf,
+ "-resultTableName",
+ "eu.dnetlib.dhp.schema.oaf.Dataset",
+ "-outputPath",
+ workingDir.toString() + "/dataset",
+ "-isLookUpUrl",
+ "http://beta.services.openaire.eu:8280/is/services/isLookUp",
+ "-pathMap",
+ "{ \"author\" : \"$['author'][*]['fullname']\","
+ + " \"title\" : \"$['title'][*]['value']\","
+ + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ + " \"contributor\" : \"$['contributor'][*]['value']\","
+ + " \"description\" : \"$['description'][*]['value']\"}"
+ });
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/dataset")
+ .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+ Assertions.assertEquals(10, tmp.count());
+ org.apache.spark.sql.Dataset verificationDataset = spark
+ .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+
+ verificationDataset.createOrReplaceTempView("dataset");
+ String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ + "from dataset "
+ + "lateral view explode(context) c as MyT "
+ + "lateral view explode(MyT.datainfo) d as MyD "
+ + "where MyD.inferenceprovenance = 'bulktagging'";
+
+ org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query);
+
+ idExplodeCommunity.show(false);
+ Assertions.assertEquals(3, idExplodeCommunity.count());
+
+ Assertions
+ .assertEquals(
+ 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java
new file mode 100644
index 000000000..3aae9ebee
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java
@@ -0,0 +1,166 @@
+
+package eu.dnetlib.dhp;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.dom4j.DocumentException;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.community.CommunityConfiguration;
+import eu.dnetlib.dhp.community.CommunityConfigurationFactory;
+import eu.dnetlib.dhp.community.Constraint;
+import eu.dnetlib.dhp.community.SelectionConstraints;
+import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+
+/** Created by miriam on 03/08/2018. */
+public class CommunityConfigurationFactoryTest {
+
+ private final VerbResolver resolver = new VerbResolver();
+
+ @Test
+ public void parseTest() throws DocumentException, IOException {
+ String xml = IOUtils
+ .toString(
+ getClass()
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml"));
+ final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml);
+ Assertions.assertEquals(5, cc.size());
+ cc
+ .getCommunityList()
+ .forEach(c -> Assertions.assertTrue(StringUtils.isNoneBlank(c.getId())));
+ }
+
+ @Test
+ public void applyVerb()
+ throws InvocationTargetException, IllegalAccessException, NoSuchMethodException,
+ InstantiationException {
+ Constraint sc = new Constraint();
+ sc.setVerb("not_contains");
+ sc.setField("contributor");
+ sc.setValue("DARIAH");
+ sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue()));
+ String metadata = "This work has been partially supported by DARIAH-EU infrastructure";
+ Assertions.assertFalse(sc.verifyCriteria(metadata));
+ }
+
+ @Test
+ public void loadSelCriteriaTest() throws DocumentException, IOException {
+ String xml = IOUtils
+ .toString(
+ getClass()
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml"));
+ final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml);
+ Map> param = new HashMap<>();
+ param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi")));
+ param
+ .put(
+ "description",
+ new ArrayList<>(
+ Collections
+ .singletonList(
+ "This work has been partially supported by DARIAH-EU infrastructure")));
+ param
+ .put(
+ "contributor",
+ new ArrayList<>(
+ Collections
+ .singletonList(
+ "Pallino ha aiutato a scrivere il paper. Pallino lavora per DARIAH")));
+ List comm = cc
+ .getCommunityForDatasource(
+ "openaire____::1cfdb2e14977f31a98e0118283401f32", param);
+ Assertions.assertEquals(1, comm.size());
+ Assertions.assertEquals("dariah", comm.get(0));
+ }
+
+ @Test
+ public void test4() throws DocumentException, IOException {
+ final CommunityConfiguration cc = CommunityConfigurationFactory
+ .fromJson(
+ IOUtils
+ .toString(
+ getClass()
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json")));
+ cc.toString();
+ }
+
+ @Test
+ public void test5() throws IOException, DocumentException {
+
+ // final CommunityConfiguration cc =
+ // CommunityConfigurationFactory.newInstance(IOUtils.toString(getClass().getResourceAsStream("test.xml")));
+ final CommunityConfiguration cc = CommunityConfigurationFactory
+ .fromJson(
+ IOUtils
+ .toString(
+ getClass()
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/community_configuration.json")));
+
+ System.out.println(cc.toJson());
+ }
+
+ @Test
+ public void test6() {
+ String json = "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}";
+
+ String step1 = "{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}";
+
+ Constraint c = new Gson().fromJson(step1, Constraint.class);
+ //
+ // String step2 =
+ // "{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}";
+ //
+ // ConstraintEncapsulator ce = new
+ // Gson().fromJson(step2,ConstraintEncapsulator.class);
+ //
+ //
+ // String step3 =
+ // "{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}";
+ //
+ // Constraints cons = new Gson().fromJson(step3,Constraints.class);
+ //
+ // String step4 =
+ // "{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}";
+ //
+ // ConstraintsList cl = new Gson().fromJson(step4,ConstraintsList.class);
+ //
+ // String step5 =
+ // "{\"cl\":{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}}";
+ SelectionConstraints sl = new Gson().fromJson(json, SelectionConstraints.class);
+ }
+
+ @Test
+ public void test7() throws IOException {
+ final CommunityConfiguration cc = CommunityConfigurationFactory
+ .fromJson(
+ IOUtils
+ .toString(
+ getClass()
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json")));
+
+ System.out.println(cc.toJson());
+ }
+
+ @Test
+ public void temporaneo() throws Exception {
+ String xml = IOUtils
+ .toString(
+ getClass()
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml"));
+ final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml);
+ System.out.println(cc.toJson());
+ }
+}
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.json b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.json
new file mode 100644
index 000000000..d21dc4ced
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.json
@@ -0,0 +1,694 @@
+{"communities": {
+ "clarin": {
+ "id": "clarin",
+ "subjects": [],
+ "datasources": [
+ {
+ "openaireId": "re3data_____::a507cdacc5bbcc08761c92185dee5cab"
+ }
+ ],
+ "zenodoCommunities": [
+
+ ]
+ },
+ "ee": {
+ "id": "ee",
+ "subjects": [
+ "SDG13 - Climate action",
+ "SDG8 - Decent work and economic\n\t\t\t\t\tgrowth",
+ "SDG15 - Life on land",
+ "SDG2 - Zero hunger",
+ "SDG17 - Partnerships for the\n\t\t\t\t\tgoals",
+ "SDG10 - Reduced inequalities",
+ "SDG5 - Gender equality",
+ "SDG12 - Responsible\n\t\t\t\t\tconsumption and production",
+ "SDG14 - Life below water",
+ "SDG6 - Clean water and\n\t\t\t\t\tsanitation",
+ "SDG11 - Sustainable cities and communities",
+ "SDG1 - No poverty",
+ "SDG3 -\n\t\t\t\t\tGood health and well being",
+ "SDG7 - Affordable and clean energy",
+ "SDG4 - Quality\n\t\t\t\t\teducation",
+ "SDG9 - Industry innovation and infrastructure",
+ "SDG16 - Peace justice\n\t\t\t\t\tand strong institutions"
+ ],
+ "datasources": [
+
+ ],
+ "zenodoCommunities": [
+
+ ]
+ },
+ "aginfra": {
+ "id": "aginfra",
+ "subjects": [
+ "animal production and health",
+ "fisheries and aquaculture",
+ "food safety and human nutrition",
+ "information management",
+ "food technology",
+ "agri-food education and extension",
+ "natural resources and environment",
+ "food system",
+ "engineering technology and Research",
+ "agriculture",
+ "food safety risk assessment",
+ "food security",
+ "farming practices and systems",
+ "plant production and protection",
+ "agri-food economics and policy",
+ "food distribution",
+ "forestry"
+ ],
+ "datasources": [
+ {
+ "openaireId": "opendoar____::1a551829d50f1400b0dab21fdd969c04"
+ },
+ {
+ "openaireId": "opendoar____::49af6c4e558a7569d80eee2e035e2bd7"
+ },
+ {
+ "openaireId": "opendoar____::0266e33d3f546cb5436a10798e657d97"
+ },
+ {
+ "openaireId": "opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06"
+ },
+ {
+ "openaireId": "opendoar____::41bfd20a38bb1b0bec75acf0845530a7"
+ },
+ {
+ "openaireId": "opendoar____::87ae6fb631f7c8a627e8e28785d9992d"
+ }
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "edenis"
+ },
+ {
+ "zenodoCommunityId": "efsa-pilot"
+ },
+ {
+ "zenodoCommunityId": "egene3"
+ },
+ {
+ "zenodoCommunityId": "efsa-kj"
+ },
+ {
+ "zenodoCommunityId": "euromixproject"
+ },
+ {
+ "zenodoCommunityId": "discardless"
+ },
+ {
+ "zenodoCommunityId": "sedinstcjfst"
+ },
+ {
+ "zenodoCommunityId": "afinet-kc"
+ },
+ {
+ "zenodoCommunityId": "2231-4784"
+ },
+ {
+ "zenodoCommunityId": "2231-0606"
+ },
+ {
+ "zenodoCommunityId": "solace"
+ },
+ {
+ "zenodoCommunityId": "pa17"
+ },
+ {
+ "zenodoCommunityId": "smartakis"
+ },
+ {
+ "zenodoCommunityId": "sedinstcjae"
+ },
+ {
+ "zenodoCommunityId": "phenology_camera"
+ },
+ {
+ "zenodoCommunityId": "aginfra"
+ },
+ {
+ "zenodoCommunityId": "erosa"
+ },
+ {
+ "zenodoCommunityId": "bigdatagrapes"
+ }
+ ]
+ },
+ "fam": {
+ "id": "fam",
+ "subjects": [
+ "Stock Assessment",
+ "pelagic",
+ "Fish farming",
+ "EMFF",
+ "Fisheries",
+ "Fishermen",
+ "maximum sustainable yield",
+ "trawler",
+ "Fishing vessel",
+ "Fisherman",
+ "Fishing gear",
+ "RFMO",
+ "Fish Aggregating Device",
+ "Bycatch",
+ "Fishery",
+ "common fisheries policy",
+ "Fishing fleet",
+ "Aquaculture"
+ ],
+ "datasources": [
+ {
+ "openaireId": "doajarticles::8cec81178926caaca531afbd8eb5d64c"
+ },
+ {
+ "openaireId": "doajarticles::0f7a7f30b5400615cae1829f3e743982"
+ },
+ {
+ "openaireId": "doajarticles::9740f7f5af3e506d2ad2c215cdccd51a"
+ },
+ {
+ "openaireId": "doajarticles::9f3fbaae044fa33cb7069b72935a3254"
+ },
+ {
+ "openaireId": "doajarticles::cb67f33eb9819f5c624ce0313957f6b3"
+ },
+ {
+ "openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906"
+ },
+ {
+ "openaireId": "doajarticles::554cde3be9e5c4588b4c4f9f503120cb"
+ },
+ {
+ "openaireId": "tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b"
+ },
+ {
+ "openaireId": "doajarticles::57c5d3837da943e93b28ec4db82ec7a5"
+ },
+ {
+ "openaireId": "doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1"
+ },
+ {
+ "openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906"
+ },
+ {
+ "openaireId": "doajarticles::dca64612dfe0963fffc119098a319957"
+ },
+ {
+ "openaireId": "doajarticles::dd70e44479f0ade25aa106aef3e87a0a"
+ }
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "discardless"
+ },
+ {
+ "zenodoCommunityId": "farfish2020"
+ },
+ {
+ "zenodoCommunityId": "facts"
+ },
+ {
+ "zenodoCommunityId": "climefish"
+ },
+ {
+ "zenodoCommunityId": "proeel"
+ },
+ {
+ "zenodoCommunityId": "primefish"
+ },
+ {
+ "zenodoCommunityId": "h2020_vicinaqua"
+ },
+ {
+ "zenodoCommunityId": "meece"
+ },
+ {
+ "zenodoCommunityId": "rlsadb"
+ }
+ ]
+ },
+ "instruct": {
+ "id": "instruct",
+ "subjects": [
+
+ ],
+ "datasources": [
+
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "instruct"
+ },
+ {
+ "zenodoCommunityId": "west-life"
+ }
+ ]
+ },
+ "mes": {
+ "id": "mes",
+ "subjects": [
+ "marine",
+ "ocean",
+ "fish",
+ "aqua",
+ "sea"
+ ],
+ "datasources": [
+
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "adriplan"
+ },
+ {
+ "zenodoCommunityId": "devotes-project"
+ },
+ {
+ "zenodoCommunityId": "euro-basin"
+ },
+ {
+ "zenodoCommunityId": "naclim"
+ },
+ {
+ "zenodoCommunityId": "discardless"
+ },
+ {
+ "zenodoCommunityId": "assisibf"
+ },
+ {
+ "zenodoCommunityId": "meece"
+ },
+ {
+ "zenodoCommunityId": "facts"
+ },
+ {
+ "zenodoCommunityId": "proeel"
+ },
+ {
+ "zenodoCommunityId": "aquatrace"
+ },
+ {
+ "zenodoCommunityId": "myfish"
+ },
+ {
+ "zenodoCommunityId": "atlas"
+ },
+ {
+ "zenodoCommunityId": "blue-actionh2020"
+ },
+ {
+ "zenodoCommunityId": "sponges"
+ },
+ {
+ "zenodoCommunityId": "merces_project"
+ },
+ {
+ "zenodoCommunityId": "bigdataocean"
+ },
+ {
+ "zenodoCommunityId": "columbus"
+ },
+ {
+ "zenodoCommunityId": "h2020-aquainvad-ed"
+ },
+ {
+ "zenodoCommunityId": "aquarius"
+ },
+ {
+ "zenodoCommunityId": "southern-ocean-observing-system"
+ },
+ {
+ "zenodoCommunityId": "eawag"
+ },
+ {
+ "zenodoCommunityId": "mossco"
+ },
+ {
+ "zenodoCommunityId": "onc"
+ },
+ {
+ "zenodoCommunityId": "oceanbiogeochemistry"
+ },
+ {
+ "zenodoCommunityId": "oceanliteracy"
+ },
+ {
+ "zenodoCommunityId": "openearth"
+ },
+ {
+ "zenodoCommunityId": "ocean"
+ },
+ {
+ "zenodoCommunityId": "calcifierraman"
+ },
+ {
+ "zenodoCommunityId": "bermudabream"
+ },
+ {
+ "zenodoCommunityId": "brcorp1"
+ },
+ {
+ "zenodoCommunityId": "mce"
+ },
+ {
+ "zenodoCommunityId": "biogeochem"
+ },
+ {
+ "zenodoCommunityId": "ecc2014"
+ },
+ {
+ "zenodoCommunityId": "fisheries"
+ },
+ {
+ "zenodoCommunityId": "sedinstcjfas"
+ },
+ {
+ "zenodoCommunityId": "narmada"
+ },
+ {
+ "zenodoCommunityId": "umr-entropie"
+ },
+ {
+ "zenodoCommunityId": "farfish2020"
+ },
+ {
+ "zenodoCommunityId": "primefish"
+ },
+ {
+ "zenodoCommunityId": "zf-ilcs"
+ },
+ {
+ "zenodoCommunityId": "climefish"
+ },
+ {
+ "zenodoCommunityId": "afrimed_eu"
+ },
+ {
+ "zenodoCommunityId": "spi-ace"
+ },
+ {
+ "zenodoCommunityId": "cice-consortium"
+ },
+ {
+ "zenodoCommunityId": "nemo-ocean"
+ },
+ {
+ "zenodoCommunityId": "mesopp-h2020"
+ },
+ {
+ "zenodoCommunityId": "marxiv"
+ }
+ ]
+ },
+ "ni": {
+ "id": "ni",
+ "subjects": [
+ "brain mapping",
+ "brain imaging",
+ "electroencephalography",
+ "arterial spin labelling",
+ "brain fingerprinting",
+ "brain",
+ "neuroimaging",
+ "Multimodal Brain Image Analysis",
+ "fMRI",
+ "neuroinformatics",
+ "fetal brain",
+ "brain ultrasonic imaging",
+ "topographic brain mapping",
+ "diffusion tensor imaging",
+ "computerized knowledge assessment",
+ "connectome mapping",
+ "brain magnetic resonance imaging",
+ "brain abnormalities"
+ ],
+ "datasources": [
+ {
+ "openaireId": "re3data_____::5b9bf9171d92df854cf3c520692e9122"
+ },
+ {
+ "openaireId": "doajarticles::c7d3de67dc77af72f6747157441252ec"
+ },
+ {
+ "openaireId": "re3data_____::8515794670370f49c1d176c399c714f5"
+ },
+ {
+ "openaireId": "doajarticles::d640648c84b10d425f96f11c3de468f3"
+ },
+ {
+ "openaireId": "doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"
+ },
+ {
+ "openaireId": "rest________::fb1a3d4523c95e63496e3bc7ba36244b"
+ }
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "neuroinformatics"
+ },
+ {
+ "zenodoCommunityId": "hbp"
+ },
+ {
+ "zenodoCommunityId": "from_neuroscience_to_machine_learning"
+ },
+ {
+ "zenodoCommunityId": "ci2c"
+ },
+ {
+ "zenodoCommunityId": "opensourcebrain"
+ },
+ {
+ "zenodoCommunityId": "brainspeak"
+ },
+ {
+ "zenodoCommunityId": "braincom"
+ },
+ {
+ "zenodoCommunityId": "nextgenvis"
+ },
+ {
+ "zenodoCommunityId": "meso-brain"
+ },
+ {
+ "zenodoCommunityId": "neuroplasticity-workshop"
+ },
+ {
+ "zenodoCommunityId": "bionics"
+ },
+ {
+ "zenodoCommunityId": "brainmattrain-676408"
+ },
+ {
+ "zenodoCommunityId": "repronim"
+ },
+ {
+ "zenodoCommunityId": "affectiveneuro"
+ },
+ {
+ "zenodoCommunityId": "con"
+ },
+ {
+ "zenodoCommunityId": "lab_neurol_sperim_irfmn_irccs_milano_it"
+ }
+ ]
+ },
+ "dariah": {
+ "id": "dariah",
+ "subjects": [
+
+ ],
+ "datasources": [
+ {
+ "openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a",
+ "sc": {
+ "cl": {
+ "criteria": [
+ {
+ "ce": {
+ "constraint": [
+ {
+ "verb": "contains",
+ "field": "contributor",
+ "value": "DARIAH"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "dimpo"
+ }
+ ]
+ },
+ "rda": {
+ "id": "rda",
+ "subjects": [
+
+ ],
+ "datasources": [
+
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "rda"
+ }
+ ]
+ },
+ "dh-ch": {
+ "id": "dh-ch",
+ "subjects": [
+ "modern art",
+ "metadata",
+ "monuments",
+ "sites",
+ "field walking",
+ "frescoes",
+ "excavation",
+ "ontologies",
+ "mapping",
+ "cities",
+ "temples",
+ "lithics",
+ "roads",
+ "digital cultural heritage",
+ "interoperability",
+ "archaeological reports",
+ "churches",
+ "standards",
+ "archaeological stratigraphy",
+ "buidings",
+ "digital humanities",
+ "survey",
+ "archaeological sites",
+ "CIDOC CRM",
+ "decorations",
+ "classic art",
+ "stratigraphy",
+ "digital archaeology",
+ "walls",
+ "data science",
+ "chapels",
+ "paintings",
+ "archaeology",
+ "fair data",
+ "mosaics",
+ "data visualization",
+ "burials",
+ "medieval art",
+ "castles",
+ "statues",
+ "natural language processing",
+ "inscriptions",
+ "vaults",
+ "open data",
+ "contemporary art",
+ "3D",
+ "pottery",
+ "site",
+ "metadata schema",
+ "architectural",
+ "vessels"
+ ],
+ "datasources": [
+ {
+ "openaireId": "re3data_____::9ebe127e5f3a0bf401875690f3bb6b81"
+ },
+ {
+ "openaireId": "doajarticles::c6cd4b532e12868c1d760a8d7cda6815"
+ },
+ {
+ "openaireId": "doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b"
+ },
+ {
+ "openaireId": "doajarticles::6eb31d13b12bc06bbac06aef63cf33c9"
+ },
+ {
+ "openaireId": "doajarticles::0da84e9dfdc8419576169e027baa8028"
+ },
+ {
+ "openaireId": "re3data_____::84e123776089ce3c7a33db98d9cd15a8"
+ },
+ {
+ "openaireId": "openaire____::c5502a43e76feab55dd00cf50f519125"
+ },
+ {
+ "openaireId": "re3data_____::a48f09c562b247a9919acfe195549b47"
+ },
+ {
+ "openaireId": "opendoar____::97275a23ca44226c9964043c8462be96"
+ }
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "storm"
+ },
+ {
+ "zenodoCommunityId": "crosscult"
+ },
+ {
+ "zenodoCommunityId": "wholodance_eu"
+ },
+ {
+ "zenodoCommunityId": "digcur2013"
+ },
+ {
+ "zenodoCommunityId": "gravitate"
+ },
+ {
+ "zenodoCommunityId": "dipp2014"
+ },
+ {
+ "zenodoCommunityId": "digitalhumanities"
+ },
+ {
+ "zenodoCommunityId": "dimpo"
+ },
+ {
+ "zenodoCommunityId": "adho"
+ },
+ {
+ "zenodoCommunityId": "chc"
+ },
+ {
+ "zenodoCommunityId": "wahr"
+ },
+ {
+ "zenodoCommunityId": "ibe"
+ },
+ {
+ "zenodoCommunityId": "ariadne"
+ },
+ {
+ "zenodoCommunityId": "parthenos-hub"
+ },
+ {
+ "zenodoCommunityId": "parthenos-training"
+ },
+ {
+ "zenodoCommunityId": "gandhara"
+ },
+ {
+ "zenodoCommunityId": "cmsouthasia"
+ },
+ {
+ "zenodoCommunityId": "nilgirihills"
+ },
+ {
+ "zenodoCommunityId": "shamsa_mustecio"
+ },
+ {
+ "zenodoCommunityId": "bodhgaya"
+ }
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml
new file mode 100644
index 000000000..8fec18593
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml
@@ -0,0 +1,176 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SDG13 - Climate action
+ SDG8 - Decent work and economic growth
+ SDG15 - Life on land
+ SDG2 - Zero hunger
+ SDG17 - Partnerships for the goals
+ SDG10 - Reduced inequalities
+ SDG5 - Gender equality
+ SDG12 - Responsible consumption and production
+ SDG14 - Life below water
+ SDG6 - Clean water and sanitation
+ SDG11 - Sustainable cities and communities
+ SDG1 - No poverty
+ SDG3 - Good health and well being
+ SDG7 - Affordable and clean energy
+ SDG4 - Quality education
+ SDG9 - Industry innovation and infrastructure
+ SDG16 - Peace justice and strong institutions
+
+
+
+
+ 123
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ brain mapping
+ brain imaging
+ electroencephalography
+ arterial spin labelling
+ brain fingerprinting
+ brain
+ neuroimaging
+ Multimodal Brain Image Analysis
+ fMRI
+ neuroinformatics
+ fetal brain
+ brain ultrasonic imaging
+ topographic brain mapping
+ diffusion tensor imaging
+ computerized knowledge assessment
+ connectome mapping
+ brain magnetic resonance imaging
+ brain abnormalities
+
+
+
+ re3data_____::5b9bf9171d92df854cf3c520692e9122
+
+
+
+ doajarticles::c7d3de67dc77af72f6747157441252ec
+
+
+
+ re3data_____::8515794670370f49c1d176c399c714f5
+
+
+
+ doajarticles::d640648c84b10d425f96f11c3de468f3
+
+
+
+ doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a
+
+
+
+
+
+
+
+ marine
+ ocean
+ fish
+ aqua
+ sea
+
+
+
+ re3data_____::9633d1e8c4309c833c2c442abeb0cfeb
+
+
+
+
+
+
+
+ animal production and health
+ fisheries and aquaculture
+ food safety and human nutrition
+ information management
+ food technology
+ agri-food education and extension
+ natural resources and environment
+ food system
+ engineering technology and Research
+ agriculture
+ food safety risk assessment
+ food security
+ farming practices and systems
+ plant production and protection
+ agri-food economics and policy
+ food distribution
+ forestry
+
+
+
+ opendoar____::1a551829d50f1400b0dab21fdd969c04
+
+
+
+ opendoar____::49af6c4e558a7569d80eee2e035e2bd7
+
+
+
+ opendoar____::0266e33d3f546cb5436a10798e657d97
+
+
+
+ opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06
+
+
+
+ opendoar____::41bfd20a38bb1b0bec75acf0845530a7
+
+
+
+ opendoar____::87ae6fb631f7c8a627e8e28785d9992d
+
+
+
+
+
+
+ oac_clarin
+
+
+
+ re3data_____::a507cdacc5bbcc08761c92185dee5cab
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json
new file mode 100644
index 000000000..6aa4275d6
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json
@@ -0,0 +1,37 @@
+{
+ "communities": {
+ "dariah": {
+ "id": "dariah",
+ "subjects": [
+
+ ],
+ "datasources": [
+ {
+ "openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a",
+ "sc": {
+ "cl": {
+ "criteria": [
+ {
+ "ce": {
+ "constraint": [
+ {
+ "verb": "contains",
+ "field": "contributor",
+ "value": "DARIAH"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ ],
+ "zenodoCommunities": [
+ {
+ "zenodoCommunityId": "dimpo"
+ }
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml
new file mode 100644
index 000000000..ad31e1763
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml
@@ -0,0 +1,193 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SDG13 - Climate action
+ SDG8 - Decent work and economic growth
+ SDG15 - Life on land
+ SDG2 - Zero hunger
+ SDG17 - Partnerships for the goals
+ SDG10 - Reduced inequalities
+ SDG5 - Gender equality
+ SDG12 - Responsible consumption and production
+ SDG14 - Life below water
+ SDG6 - Clean water and sanitation
+ SDG11 - Sustainable cities and communities
+ SDG1 - No poverty
+ SDG3 - Good health and well being
+ SDG7 - Affordable and clean energy
+ SDG4 - Quality education
+ SDG9 - Industry innovation and infrastructure
+ SDG16 - Peace justice and strong institutions
+
+
+
+
+ 123
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ brain mapping
+ brain imaging
+ electroencephalography
+ arterial spin labelling
+ brain fingerprinting
+ brain
+ neuroimaging
+ Multimodal Brain Image Analysis
+ fMRI
+ neuroinformatics
+ fetal brain
+ brain ultrasonic imaging
+ topographic brain mapping
+ diffusion tensor imaging
+ computerized knowledge assessment
+ connectome mapping
+ brain magnetic resonance imaging
+ brain abnormalities
+
+
+
+ re3data_____::5b9bf9171d92df854cf3c520692e9122
+
+
+
+ doajarticles::c7d3de67dc77af72f6747157441252ec
+
+
+
+ re3data_____::8515794670370f49c1d176c399c714f5
+
+
+
+ doajarticles::d640648c84b10d425f96f11c3de468f3
+
+
+
+ doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a
+
+
+
+
+
+
+
+ marine
+ ocean
+ fish
+ aqua
+ sea
+
+
+
+ re3data_____::9633d1e8c4309c833c2c442abeb0cfeb
+
+
+
+
+
+
+
+ animal production and health
+ fisheries and aquaculture
+ food safety and human nutrition
+ information management
+ food technology
+ agri-food education and extension
+ natural resources and environment
+ food system
+ engineering technology and Research
+ agriculture
+ food safety risk assessment
+ food security
+ farming practices and systems
+ plant production and protection
+ agri-food economics and policy
+ food distribution
+ forestry
+
+
+
+ opendoar____::1a551829d50f1400b0dab21fdd969c04
+
+
+
+ opendoar____::49af6c4e558a7569d80eee2e035e2bd7
+
+
+
+ opendoar____::0266e33d3f546cb5436a10798e657d97
+
+
+
+ opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06
+
+
+
+ opendoar____::41bfd20a38bb1b0bec75acf0845530a7
+
+
+
+ opendoar____::87ae6fb631f7c8a627e8e28785d9992d
+
+
+
+
+
+
+ oac_clarin
+
+
+
+ re3data_____::a507cdacc5bbcc08761c92185dee5cab
+
+
+
+
+
+
+ oaa_dariah
+
+
+
+ openaire____::1cfdb2e14977f31a98e0118283401f32
+ {"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
+
+
+
+
+
+ dimpo
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json
new file mode 100644
index 000000000..411a64fed
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json
@@ -0,0 +1,44 @@
+{"communities":
+ {"ee":
+ {"id":"ee",
+ "subjects":["SDG13 - Climate action","SDG8 - Decent work and economic growth","SDG15 - Life on land","SDG2 - Zero hunger","SDG17 - Partnerships for the goals","SDG10 - Reduced inequalities","SDG5 - Gender equality","SDG12 - Responsible consumption and production","SDG14 - Life below water","SDG6 - Clean water and sanitation","SDG11 - Sustainable cities and communities","SDG1 - No poverty","SDG3 - Good health and well being","SDG7 - Affordable and clean energy","SDG4 - Quality education","SDG9 - Industry innovation and infrastructure","SDG16 - Peace justice and strong institutions"],
+ "datasources":[],
+ "zenodoCommunities":[],
+ "organizationCommunity":[]
+ },
+ "instruct":
+ {"id":"instruct",
+ "subjects":[],
+ "datasources":[],
+ "zenodoCommunities":[{"zenodoCommunityId":"instruct"},{"zenodoCommunityId":"west-life"}],"organizationCommunity":[]},
+ "egi":{"id":"egi","subjects":[],"datasources":[],"zenodoCommunities":[{"zenodoCommunityId":"zenodo"}],"organizationCommunity":[]},
+ "covid-19":{"id":"covid-19","subjects":["COVID-19","SARS-CoV-2","2019-nCoV","Severe acute respiratory syndrome coronavirus 2","2019 novel coronavirus","coronavirus disease 2019","coronavirus disease-19","HCoV-19","mesh:COVID-19","mesh:C000657245"],
+ "datasources":[{"openaireId":"opendoar____::358aee4cc897452c00244351e4d91f69","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}
+ ]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"driver______::bee53aa31dc2cbb538c10c2b65fa5824","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"openaire____::437f4b072b1aa198adcbc35910ff3b98","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"openaire____::081b82f96300b6a6e3d282bad31cb6e2","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"openaire____::9e3be59865b2c1c335d32dae2fe7b254","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]}
+ ,{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"opendoar____::8b6dd7db9af49e67306feb59a8bdc52c","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"share_______::4719356ec8d7d55d3feb384ce879ad6c","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"share_______::bbd802baad85d1fd440f32a7a3a2c2b1","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"opendoar____::6f4922f45568161a8cdf4ad2299f6d23","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}}],"zenodoCommunities":[{"zenodoCommunityId":"chicago-covid-19"},{"zenodoCommunityId":"covid-19-senacyt-panama-sample"},{"zenodoCommunityId":"covid-19-tx-rct-stats-review"},{"zenodoCommunityId":"covid_19_senacyt_abc_panama"}],"organizationCommunity":[]},
+ "dariah":{"id":"dariah","subjects":[],"datasources":[{"openaireId":"opendoar____::7e7757b1e12abcb736ab9a754ffb617a","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}},{"openaireId":"opendoar____::96da2f590cd7246bbde0051047b0d6f7","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}}],"zenodoCommunities":[{"zenodoCommunityId":"dimpo"}],"organizationCommunity":[]},"rda":{"id":"rda","subjects":[],"datasources":[],"zenodoCommunities":[{"zenodoCommunityId":"rda"}],"organizationCommunity":[]},"clarin":{"id":"clarin","subjects":[],"datasources":[{"openaireId":"re3data_____::a507cdacc5bbcc08761c92185dee5cab"}],"zenodoCommunities":[],"organizationCommunity":[]},"aginfra":{"id":"aginfra","subjects":["animal production and health","fisheries and aquaculture","food safety and human nutrition","information management","food technology","agri-food education and extension","natural resources and environment","food system","engineering technology and Research","agriculture","food safety risk assessment","food security","farming practices and systems","plant production and protection","agri-food economics and policy","Agri-food","food distribution","forestry"],"datasources":[{"openaireId":"opendoar____::1a551829d50f1400b0dab21fdd969c04"},{"openaireId":"opendoar____::49af6c4e558a7569d80eee2e035e2bd7"},{"openaireId":"opendoar____::0266e33d3f546cb5436a10798e657d97"},{"openaireId":"opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06"},{"openaireId":"opendoar____::41bfd20a38bb1b0bec75acf0845530a7"},{"openaireId":"opendoar____::87ae6fb631f7c8a627e8e28785d9992d"}],"zenodoCommunities":[{"zenodoCommunityId":"edenis"},{"zenodoCommunityId":"efsa-pilot"},{"zenodoCommunityId":"egene3"},{"zenodoCommunityId":"efsa-kj"},{"zenodoCommunityId":"euromixproject"},{"zenodoCommunityId":"discardless"},{"zenodoCommunityId":"sedinstcjfst"},{"zenodoCommunityId":"afinet-kc"},{"zenodoCommunityId":"2231-4784"},{"zenodoCommunityId":"2231-0606"},{"zenodoCommunityId":"solace"},{"zenodoCommunityId":"pa17"},{"zenodoCommunityId":"smartakis"},{"zenodoCommunityId":"sedinstcjae"},{"zenodoCommunityId":"phenology_camera"},{"zenodoCommunityId":"aginfra"},{"zenodoCommunityId":"erosa"},{"zenodoCommunityId":"bigdatagrapes"}],"organizationCommunity":[]},"fam":{"id":"fam","subjects":["Stock Assessment","pelagic","Acoustic","Fish farming","Fisheries","Fishermen","maximum sustainable yield","trawler","Fishing vessel","Fisherman","Fishing gear","mackerel","RFMO","Fish Aggregating Device","Bycatch","Fishery","common fisheries policy","Fishing fleet","Aquaculture"],"datasources":[{"openaireId":"doajarticles::8cec81178926caaca531afbd8eb5d64c"},{"openaireId":"doajarticles::0f7a7f30b5400615cae1829f3e743982"},{"openaireId":"doajarticles::9740f7f5af3e506d2ad2c215cdccd51a"},{"openaireId":"doajarticles::9f3fbaae044fa33cb7069b72935a3254"},{"openaireId":"doajarticles::cb67f33eb9819f5c624ce0313957f6b3"},{"openaireId":"doajarticles::e21c97cbb7a209afc75703681c462906"},{"openaireId":"doajarticles::554cde3be9e5c4588b4c4f9f503120cb"},{"openaireId":"tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b"},{"openaireId":"doajarticles::57c5d3837da943e93b28ec4db82ec7a5"},{"openaireId":"doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1"},{"openaireId":"doajarticles::e21c97cbb7a209afc75703681c462906"},{"openaireId":"doajarticles::dca64612dfe0963fffc119098a319957"},{"openaireId":"doajarticles::dd70e44479f0ade25aa106aef3e87a0a"}],"zenodoCommunities":[{"zenodoCommunityId":"discardless"},{"zenodoCommunityId":"farfish2020"},{"zenodoCommunityId":"facts"},{"zenodoCommunityId":"climefish"},{"zenodoCommunityId":"proeel"},{"zenodoCommunityId":"primefish"},{"zenodoCommunityId":"h2020_vicinaqua"},{"zenodoCommunityId":"meece"},{"zenodoCommunityId":"rlsadb"},{"zenodoCommunityId":"iotc_ctoi"}],"organizationCommunity":[]},"beopen":{"id":"beopen","subjects":["Green Transport","City mobility systems","Vulnerable road users","Traffic engineering","Transport electrification","Mobility","Intermodal freight transport","Clean vehicle fleets","Intelligent mobility","Inflight refueling","District mobility systems","Navigation and control systems for optimised planning and routing","European Space Technology Platform","European Transport networks","Green cars","Inter-modality infrastructures","Advanced Take Off and Landing Ideas","Sustainable urban systems","port-area railway networks","Innovative forms of urban transport","Alliance for Logistics Innovation through Collaboration in Europe","Advisory Council for Aeronautics Research in Europe","Mobility services for people and goods","Guidance and traffic management","Passenger mobility","Smart mobility and services","transport innovation","high-speed railway","Vehicle design","Inland shipping","public transportation","aviation’s climate impact","Road transport","On-demand public transport","Personal Air Transport","Transport","transport vulnerability","Pipeline transport","European Association of Aviation Training and Education Organisations","Defrosting of railway infrastructure","Inclusive and affordable transport","River Information Services","jel:L92","Increased use of public transport","Seamless mobility","STRIA","trolleybus transport","Intelligent Transport System","Low-emission alternative energy for transport","Shared mobility for people and goods","Business model for urban mobility","Interoperability of transport systems","Cross-border train slot booking","Air transport","Transport pricing","Sustainable transport","European Rail Transport Research Advisory Council","Alternative aircraft configurations","Transport and Mobility","Railways applications","urban transport","Environmental impact of transport","urban freight delivery systems","Automated Road Transport","Alternative fuels in public transport","Active LIDAR-sensor for GHG-measurements","Autonomous logistics operations","Rational use of motorised transport","Network and traffic management systems","electrification of railway wagons","Single European Sky","Electrified road systems","transportation planning","Railway dynamics","Motorway of the Sea","smart railway communications","Maritime transport","Environmental- friendly transport","Combined transport","Connected automated driving technology","Innovative freight logistics services","automated and shared vehicles","Alternative Aircraft Systems","Land-use and transport interaction","Public transport system","Business plan for shared mobility","Shared mobility","Growing of mobility demand","European Road Transport Research Advisory Council","WATERBORNE ETP","Effective transport management system","Short Sea Shipping","air traffic management","Sea hubs and the motorways of the sea","Urban mobility solutions","Smart city planning","Maritime spatial planning","EUropean rail Research Network of Excellence","Transport governance","ENERGY CONSUMPTION BY THE TRANSPORT SECTOR","Integrated urban plan","inland waterway services","European Conference of Transport Research Institutes","air vehicles","E-freight","Automated Driving","Automated ships","pricing for cross-border passenger transport","Vehicle efficiency","Railway transport","Electric vehicles","Road traffic monitoring","Deep sea shipping","Circular economy in transport","Traffic congestion","air transport system","Urban logistics","Rail transport","OpenStreetMap","high speed rail","Transportation engineering","Intermodal travel information","Flight Data Recorders","Advanced driver assistance systems","long distance freight transport","Inland waterway transport","Smart mobility","Mobility integration","Personal Rapid Transit system","Safety measures \\u0026 requirements for roads","Green rail transport","Electrical","Vehicle manufacturing","Future Airport Layout","Rail technologies","European Intermodal Research Advisory Council","inland navigation","Automated urban vehicles","ECSS-standards","Traveller services","Polluting transport","Air Traffic Control","Cooperative and connected and automated transport","Innovative powertrains","Quality of transport system and services","door-to- door logistics chain","Inter-modal aspects of urban mobility","travel (and mobility)","Innovative freight delivery systems","urban freight delivery infrastructures"],"datasources":[{"openaireId":"doajarticles::1c5bdf8fca58937894ad1441cca99b76"},{"openaireId":"doajarticles::b37a634324a45c821687e6e80e6f53b4"},{"openaireId":"doajarticles::4bf64f2a104040e4e055cd9594b2d77c"},{"openaireId":"doajarticles::479ca537c12755d1868bbf02938a900c"},{"openaireId":"doajarticles::55f31df96a60e2309f45b7c265fcf7a2"},{"openaireId":"doajarticles::c52a09891a5301f9986ebbfe3761810c"},{"openaireId":"doajarticles::379807bc7f6c71a227ef1651462c414c"},{"openaireId":"doajarticles::36069db531a00b85a2e8fb301f4bdc19"},{"openaireId":"doajarticles::b6a898da311ded96fabf49c520b80d5d"},{"openaireId":"doajarticles::d0753d9180b35a271d8b4a31f449749f"},{"openaireId":"doajarticles::172050a92511838393a3fe237ae47e31"},{"openaireId":"doajarticles::301ed96c62abb160a3e29796efe5c95c"},{"openaireId":"doajarticles::0f4f805b3d842f2c7f1b077c3426fa59"},{"openaireId":"doajarticles::ba73728b84437b8d48ae287b867c7215"},{"openaireId":"doajarticles::86faef424d804309ccf45f692523aa48"},{"openaireId":"doajarticles::73bd758fa41671de70964c3ecba013af"},{"openaireId":"doajarticles::e661fc0bdb24af42b740a08f0ddc6cf4"},{"openaireId":"doajarticles::a6d3052047d5dbfbd43d95b4afb0f3d7"},{"openaireId":"doajarticles::ca61df07089acc53a1569bde6673d82a"},{"openaireId":"doajarticles::237dd6f1606600459d0297abd8ed9976"},{"openaireId":"doajarticles::fba6191177ede7c51ea1cdf58eae7f8b"}],"zenodoCommunities":[{"zenodoCommunityId":"jsdtl"},{"zenodoCommunityId":"utc-martrec"},{"zenodoCommunityId":"utc-uti"},{"zenodoCommunityId":"stp"},{"zenodoCommunityId":"c2smart"},{"zenodoCommunityId":"stride-utc"},{"zenodoCommunityId":"crowd4roads"},{"zenodoCommunityId":"lemo"},{"zenodoCommunityId":"imov3d"},{"zenodoCommunityId":"tra2018"},{"zenodoCommunityId":"optimum"},{"zenodoCommunityId":"stars"},{"zenodoCommunityId":"iecteim"},{"zenodoCommunityId":"iccpt2019"}],"organizationCommunity":[]},"science-innovation-policy":
+ {"id":"science-innovation-policy","subjects":["Sustainability-oriented science policy", "STI policies", "science—society relations",
+ "Science & Technology Policy", "Innovation policy", "science policy", "Policy and Law"],
+ "datasources":[{"openaireId":"doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d"},
+ {"openaireId":"doajarticles::ae4c7286c79590f19fdca670156ce816"},
+ {"openaireId":"doajarticles::0f664bce92ce953e0c7a92068c46bfb3"},
+ {"openaireId":"doajarticles::00017183dc4c858fb77541985323a4ef"},
+ {"openaireId":"doajarticles::93b306f458cce3d7aaaf58c0a725f4f9"},
+ {"openaireId":"doajarticles::9dbf8fbf3e9fe0fe1fc01e55fbd90bfc"},
+ {"openaireId":"doajarticles::a2bda8785c863279bba4b8f34827b4c9"},
+ {"openaireId":"doajarticles::019a1fcb42c3fea1c1b689df76330b58"},
+ {"openaireId":"doajarticles::0daa8281938831e9c82bfed8b55a2975"},
+ {"openaireId":"doajarticles::f67ad6d268162079b3abd51a24468744"},
+ {"openaireId":"doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d"},
+ {"openaireId":"doajarticles::ad114356e196a4a3d84dda59c720dacd"},
+ {"openaireId":"doajarticles::01e8a54fdecaaf354c67a2dd74ae7d4f"},
+ {"openaireId":"doajarticles::449305f096b10a9464449ff2d0e10e06"},
+ {"openaireId":"doajarticles::982c0c0ac378256254cce2fa6572bb6c"},
+ {"openaireId":"doajarticles::49d6ed47138884566ce93cf0ccb12c02"},
+ {"openaireId":"doajarticles::a98e820dbc2e8ee0fc84ab66f263267c"},
+ {"openaireId":"doajarticles::50b1ce37427b36368f8f0f1317e47f83"},
+ {"openaireId":"doajarticles::f0ec29b7450b2ac5d0ad45327eeb531a"},
+ {"openaireId":"doajarticles::d8d421d3b0349a7aaa93758b27a54e84"},
+ {"openaireId":"doajarticles::7ffc35ac5133da01d421ccf8af5b70bc"}
+ ],"zenodoCommunities":[{"zenodoCommunityId":"risis"}],"organizationCommunity":[]},"mes":{"id":"mes","subjects":["marine","ocean","fish","aqua","sea"],"datasources":[],"zenodoCommunities":[{"zenodoCommunityId":"adriplan"},{"zenodoCommunityId":"devotes-project"},{"zenodoCommunityId":"euro-basin"},{"zenodoCommunityId":"naclim"},{"zenodoCommunityId":"discardless"},{"zenodoCommunityId":"assisibf"},{"zenodoCommunityId":"meece"},{"zenodoCommunityId":"facts"},{"zenodoCommunityId":"proeel"},{"zenodoCommunityId":"aquatrace"},{"zenodoCommunityId":"myfish"},{"zenodoCommunityId":"atlas"},{"zenodoCommunityId":"blue-actionh2020"},{"zenodoCommunityId":"sponges"},{"zenodoCommunityId":"merces_project"},{"zenodoCommunityId":"bigdataocean"},{"zenodoCommunityId":"columbus"},{"zenodoCommunityId":"h2020-aquainvad-ed"},{"zenodoCommunityId":"aquarius"},{"zenodoCommunityId":"southern-ocean-observing-system"},{"zenodoCommunityId":"eawag"},{"zenodoCommunityId":"mossco"},{"zenodoCommunityId":"onc"},{"zenodoCommunityId":"oceanbiogeochemistry"},{"zenodoCommunityId":"oceanliteracy"},{"zenodoCommunityId":"openearth"},{"zenodoCommunityId":"ocean"},{"zenodoCommunityId":"calcifierraman"},{"zenodoCommunityId":"bermudabream"},{"zenodoCommunityId":"brcorp1"},{"zenodoCommunityId":"mce"},{"zenodoCommunityId":"biogeochem"},{"zenodoCommunityId":"ecc2014"},{"zenodoCommunityId":"fisheries"},{"zenodoCommunityId":"sedinstcjfas"},{"zenodoCommunityId":"narmada"},{"zenodoCommunityId":"umr-entropie"},{"zenodoCommunityId":"farfish2020"},{"zenodoCommunityId":"primefish"},{"zenodoCommunityId":"zf-ilcs"},{"zenodoCommunityId":"climefish"},{"zenodoCommunityId":"afrimed_eu"},{"zenodoCommunityId":"spi-ace"},{"zenodoCommunityId":"cice-consortium"},{"zenodoCommunityId":"nemo-ocean"},{"zenodoCommunityId":"mesopp-h2020"},{"zenodoCommunityId":"marxiv"}],"organizationCommunity":[]},"ni":{"id":"ni","subjects":["brain mapping","brain imaging","electroencephalography","arterial spin labelling","brain fingerprinting","brain","neuroimaging","Multimodal Brain Image Analysis","fMRI","neuroinformatics","fetal brain","brain ultrasonic imaging","topographic brain mapping","diffusion tensor imaging","computerized knowledge assessment","connectome mapping","brain magnetic resonance imaging","brain abnormalities"],"datasources":[{"openaireId":"re3data_____::5b9bf9171d92df854cf3c520692e9122"},{"openaireId":"doajarticles::c7d3de67dc77af72f6747157441252ec"},{"openaireId":"re3data_____::8515794670370f49c1d176c399c714f5"},{"openaireId":"doajarticles::d640648c84b10d425f96f11c3de468f3"},{"openaireId":"doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"},{"openaireId":"rest________::fb1a3d4523c95e63496e3bc7ba36244b"}],"zenodoCommunities":[{"zenodoCommunityId":"neuroinformatics"},{"zenodoCommunityId":"hbp"},{"zenodoCommunityId":"from_neuroscience_to_machine_learning"},{"zenodoCommunityId":"ci2c"},{"zenodoCommunityId":"opensourcebrain"},{"zenodoCommunityId":"brainspeak"},{"zenodoCommunityId":"braincom"},{"zenodoCommunityId":"nextgenvis"},{"zenodoCommunityId":"meso-brain"},{"zenodoCommunityId":"neuroplasticity-workshop"},{"zenodoCommunityId":"bionics"},{"zenodoCommunityId":"brainmattrain-676408"},{"zenodoCommunityId":"repronim"},{"zenodoCommunityId":"affectiveneuro"},{"zenodoCommunityId":"con"},{"zenodoCommunityId":"lab_neurol_sperim_irfmn_irccs_milano_it"}],"organizationCommunity":[]},"dh-ch":{"id":"dh-ch","subjects":["modern art","monuments","europeana data model","sites","field walking","frescoes","LIDO metadata schema","art history","excavation","Arts and Humanities General","cities","coins","temples","numismatics","lithics","roads","environmental archaeology","digital cultural heritage","archaeological reports","history","CRMba","churches","cultural heritage","archaeological stratigraphy","religious art","buidings","digital humanities","survey","archaeological sites","linguistic studies","bioarchaeology","architectural orders","palaeoanthropology","fine arts","europeana","CIDOC CRM","decorations","classic art","stratigraphy","digital archaeology","intangible cultural heritage","walls","humanities","chapels","CRMtex","Language and Literature","paintings","archaeology","fair data","mosaics","burials","architecture","medieval art","castles","CARARE metadata schema","statues","natural language processing","inscriptions","CRMsci","vaults","contemporary art","Arts and Humanities","CRMarchaeo","pottery","site","architectural","vessels"],"datasources":[{"openaireId":"re3data_____::9ebe127e5f3a0bf401875690f3bb6b81"},{"openaireId":"doajarticles::c6cd4b532e12868c1d760a8d7cda6815"},{"openaireId":"doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b"},{"openaireId":"doajarticles::6eb31d13b12bc06bbac06aef63cf33c9"},{"openaireId":"doajarticles::0da84e9dfdc8419576169e027baa8028"},{"openaireId":"re3data_____::84e123776089ce3c7a33db98d9cd15a8"},{"openaireId":"openaire____::c5502a43e76feab55dd00cf50f519125"},{"openaireId":"re3data_____::a48f09c562b247a9919acfe195549b47"},{"openaireId":"opendoar____::97275a23ca44226c9964043c8462be96"}],"zenodoCommunities":[{"zenodoCommunityId":"storm"},{"zenodoCommunityId":"crosscult"},{"zenodoCommunityId":"wholodance_eu"},{"zenodoCommunityId":"digcur2013"},{"zenodoCommunityId":"gravitate"},{"zenodoCommunityId":"dipp2014"},{"zenodoCommunityId":"digitalhumanities"},{"zenodoCommunityId":"dimpo"},{"zenodoCommunityId":"adho"},{"zenodoCommunityId":"chc"},{"zenodoCommunityId":"wahr"},{"zenodoCommunityId":"ibe"},{"zenodoCommunityId":"ariadne"},{"zenodoCommunityId":"parthenos-hub"},{"zenodoCommunityId":"parthenos-training"},{"zenodoCommunityId":"gandhara"},{"zenodoCommunityId":"cmsouthasia"},{"zenodoCommunityId":"nilgirihills"},{"zenodoCommunityId":"shamsa_mustecio"},{"zenodoCommunityId":"bodhgaya"}],"organizationCommunity":[]}}}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml
new file mode 100644
index 000000000..4f0d25f34
--- /dev/null
+++ b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml
@@ -0,0 +1,1390 @@
+
+
+
+
+
+
+ zenodo
+
+
+
+
+ Result: 2
+
+
+
+
+ Result: 3
+
+
+
+
+ Result: 4
+
+
+
+ re3data_____::a507cdacc5bbcc08761c92185dee5cab
+
+
+
+
+
+ Result: 5
+
+
+
+
+ rda
+
+
+
+
+ Result: 6
+
+ SDG13 - Climate action
+ SDG8 - Decent work and economic growth
+ SDG15 - Life on land
+ SDG2 - Zero hunger
+ SDG17 - Partnerships for the goals
+ SDG10 - Reduced inequalities
+ SDG5 - Gender equality
+ SDG12 - Responsible consumption and production
+ SDG14 - Life below water
+ SDG6 - Clean water and sanitation
+ SDG11 - Sustainable cities and communities
+ SDG1 - No poverty
+ SDG3 - Good health and well being
+ SDG7 - Affordable and clean energy
+ SDG4 - Quality education
+ SDG9 - Industry innovation and infrastructure
+ SDG16 - Peace justice and strong institutions
+
+
+
+
+ Result: 7
+
+ modern art
+ monuments
+ europeana data model
+ sites
+ field walking
+ frescoes
+ LIDO metadata schema
+ art history
+ excavation
+ Arts and Humanities General
+ cities
+ coins
+ temples
+ numismatics
+ lithics
+ roads
+ environmental archaeology
+ digital cultural heritage
+ archaeological reports
+ history
+ CRMba
+ churches
+ cultural heritage
+ archaeological stratigraphy
+ religious art
+ buidings
+ digital humanities
+ survey
+ archaeological sites
+ linguistic studies
+ bioarchaeology
+ architectural orders
+ palaeoanthropology
+ fine arts
+ europeana
+ CIDOC CRM
+ decorations
+ classic art
+ stratigraphy
+ digital archaeology
+ intangible cultural heritage
+ walls
+ humanities
+ chapels
+ CRMtex
+ Language and Literature
+ paintings
+ archaeology
+ fair data
+ mosaics
+ burials
+ architecture
+ medieval art
+ castles
+ CARARE metadata schema
+ statues
+ natural language processing
+ inscriptions
+ CRMsci
+ vaults
+ contemporary art
+ Arts and Humanities
+ CRMarchaeo
+ pottery
+ site
+ architectural
+ vessels
+
+
+
+ re3data_____::9ebe127e5f3a0bf401875690f3bb6b81
+
+
+
+ doajarticles::c6cd4b532e12868c1d760a8d7cda6815
+
+
+
+ doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b
+
+
+
+ doajarticles::6eb31d13b12bc06bbac06aef63cf33c9
+
+
+
+ doajarticles::0da84e9dfdc8419576169e027baa8028
+
+
+
+ re3data_____::84e123776089ce3c7a33db98d9cd15a8
+
+
+
+ openaire____::c5502a43e76feab55dd00cf50f519125
+
+
+
+ re3data_____::a48f09c562b247a9919acfe195549b47
+
+
+
+ opendoar____::97275a23ca44226c9964043c8462be96
+
+
+
+
+
+ storm
+
+
+
+ crosscult
+
+
+
+ wholodance_eu
+
+
+
+ digcur2013
+
+
+
+ gravitate
+
+
+
+ dipp2014
+
+
+
+ digitalhumanities
+
+
+
+ dimpo
+
+
+
+ adho
+
+
+
+ chc
+
+
+
+ wahr
+
+
+
+ ibe
+
+
+
+ ariadne
+
+
+
+ parthenos-hub
+
+
+
+ parthenos-training
+
+
+
+ gandhara
+
+
+
+ cmsouthasia
+
+
+
+ nilgirihills
+
+
+
+ shamsa_mustecio
+
+
+
+ bodhgaya
+
+
+
+
+ Result: 8
+
+ Stock Assessment
+ pelagic
+ Acoustic
+ Fish farming
+ Fisheries
+ Fishermen
+ maximum sustainable yield
+ trawler
+ Fishing vessel
+ Fisherman
+ Fishing gear
+ mackerel
+ RFMO
+ Fish Aggregating Device
+ Bycatch
+ Fishery
+ common fisheries policy
+ Fishing fleet
+ Aquaculture
+
+
+
+ doajarticles::8cec81178926caaca531afbd8eb5d64c
+
+
+
+ doajarticles::0f7a7f30b5400615cae1829f3e743982
+
+
+
+ doajarticles::9740f7f5af3e506d2ad2c215cdccd51a
+
+
+
+ doajarticles::9f3fbaae044fa33cb7069b72935a3254
+
+
+
+ doajarticles::cb67f33eb9819f5c624ce0313957f6b3
+
+
+
+ doajarticles::e21c97cbb7a209afc75703681c462906
+
+
+
+ doajarticles::554cde3be9e5c4588b4c4f9f503120cb
+
+
+
+ tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b
+
+
+
+ doajarticles::57c5d3837da943e93b28ec4db82ec7a5
+
+
+
+ doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1
+
+
+
+ doajarticles::e21c97cbb7a209afc75703681c462906
+
+
+
+ doajarticles::dca64612dfe0963fffc119098a319957
+
+
+
+ doajarticles::dd70e44479f0ade25aa106aef3e87a0a
+
+
+
+
+
+ discardless
+
+
+
+ farfish2020
+
+
+
+ facts
+
+
+
+ climefish
+
+
+
+ proeel
+
+
+
+ primefish
+
+
+
+ h2020_vicinaqua
+
+
+
+ meece
+
+
+
+ rlsadb
+
+
+
+ iotc_ctoi
+
+
+
+
+ Result: 9
+
+ brain mapping
+ brain imaging
+ electroencephalography
+ arterial spin labelling
+ brain fingerprinting
+ brain
+ neuroimaging
+ Multimodal Brain Image Analysis
+ fMRI
+ neuroinformatics
+ fetal brain
+ brain ultrasonic imaging
+ topographic brain mapping
+ diffusion tensor imaging
+ computerized knowledge assessment
+ connectome mapping
+ brain magnetic resonance imaging
+ brain abnormalities
+
+
+
+ re3data_____::5b9bf9171d92df854cf3c520692e9122
+
+
+
+ doajarticles::c7d3de67dc77af72f6747157441252ec
+
+
+
+ re3data_____::8515794670370f49c1d176c399c714f5
+
+
+
+ doajarticles::d640648c84b10d425f96f11c3de468f3
+
+
+
+ doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a
+
+
+
+ rest________::fb1a3d4523c95e63496e3bc7ba36244b
+
+
+
+
+
+ neuroinformatics
+
+
+
+ hbp
+
+
+
+ from_neuroscience_to_machine_learning
+
+
+
+ ci2c
+
+
+
+ opensourcebrain
+
+
+
+ brainspeak
+
+
+
+ braincom
+
+
+
+ nextgenvis
+
+
+
+ meso-brain
+
+
+
+ neuroplasticity-workshop
+
+
+
+ bionics
+
+
+
+ brainmattrain-676408
+
+
+
+ repronim
+
+
+
+ affectiveneuro
+
+
+
+ con
+
+
+
+ lab_neurol_sperim_irfmn_irccs_milano_it
+
+
+
+
+ Result: 10
+
+ marine
+ ocean
+ fish
+ aqua
+ sea
+
+
+
+
+ adriplan
+
+
+
+ devotes-project
+
+
+
+ euro-basin
+
+
+
+ naclim
+
+
+
+ discardless
+
+
+
+ assisibf
+
+
+
+ meece
+
+
+
+ facts
+
+
+
+ proeel
+
+
+
+ aquatrace
+
+
+
+ myfish
+
+
+
+ atlas
+
+
+
+ blue-actionh2020
+
+
+
+ sponges
+
+
+
+ merces_project
+
+
+
+ bigdataocean
+
+
+
+ columbus
+
+
+
+ h2020-aquainvad-ed
+
+
+
+ aquarius
+
+
+
+ southern-ocean-observing-system
+
+
+
+ eawag
+
+
+
+ mossco
+
+
+
+ onc
+
+
+
+ oceanbiogeochemistry
+
+
+
+ oceanliteracy
+
+
+
+ openearth
+
+
+
+ ocean
+
+
+
+ calcifierraman
+
+
+
+ bermudabream
+
+
+
+ brcorp1
+
+
+
+ mce
+
+
+
+ biogeochem
+
+
+
+ ecc2014
+
+
+
+ fisheries
+
+
+
+ sedinstcjfas
+
+
+
+ narmada
+
+
+
+ umr-entropie
+
+
+
+ farfish2020
+
+
+
+ primefish
+
+
+
+ zf-ilcs
+
+
+
+ climefish
+
+
+
+ afrimed_eu
+
+
+
+ spi-ace
+
+
+
+ cice-consortium
+
+
+
+ nemo-ocean
+
+
+
+ mesopp-h2020
+
+
+
+ marxiv
+
+
+
+
+ Result: 11
+
+
+
+
+ instruct
+
+
+
+ west-life
+
+
+
+
+ Result: 12
+
+
+
+
+ Result: 13
+
+ animal production and health
+ fisheries and aquaculture
+ food safety and human nutrition
+ information management
+ food technology
+ agri-food education and extension
+ natural resources and environment
+ food system
+ engineering technology and Research
+ agriculture
+ food safety risk assessment
+ food security
+ farming practices and systems
+ plant production and protection
+ agri-food economics and policy
+ Agri-food
+ food distribution
+ forestry
+
+
+
+ opendoar____::1a551829d50f1400b0dab21fdd969c04
+
+
+
+ opendoar____::49af6c4e558a7569d80eee2e035e2bd7
+
+
+
+ opendoar____::0266e33d3f546cb5436a10798e657d97
+
+
+
+ opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06
+
+
+
+ opendoar____::41bfd20a38bb1b0bec75acf0845530a7
+
+
+
+ opendoar____::87ae6fb631f7c8a627e8e28785d9992d
+
+
+
+
+
+ edenis
+
+
+
+ efsa-pilot
+
+
+
+ egene3
+
+
+
+ efsa-kj
+
+
+
+ euromixproject
+
+
+
+ discardless
+
+
+
+ sedinstcjfst
+
+
+
+ afinet-kc
+
+
+
+ 2231-4784
+
+
+
+ 2231-0606
+
+
+
+ solace
+
+
+
+ pa17
+
+
+
+ smartakis
+
+
+
+ sedinstcjae
+
+
+
+ phenology_camera
+
+
+
+ aginfra
+
+
+
+ erosa
+
+
+
+ bigdatagrapes
+
+
+
+
+ Result: 14
+
+
+
+ opendoar____::7e7757b1e12abcb736ab9a754ffb617a
+ {"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
+
+
+ opendoar____::96da2f590cd7246bbde0051047b0d6f7
+ {"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
+
+
+
+
+ dimpo
+
+
+
+
+ Result: 15
+
+
+
+
+ Result: 16
+
+
+
+
+ Result: 17
+
+ Green Transport
+ City mobility systems
+ Vulnerable road users
+ Traffic engineering
+ Transport electrification
+ Mobility
+ Intermodal freight transport
+ Clean vehicle fleets
+ Intelligent mobility
+ Inflight refueling
+ District mobility systems
+ Navigation and control systems for optimised planning and routing
+ European Space Technology Platform
+ European Transport networks
+ Green cars
+ Inter-modality infrastructures
+ Advanced Take Off and Landing Ideas
+ Sustainable urban systems
+ port-area railway networks
+ Innovative forms of urban transport
+ Alliance for Logistics Innovation through Collaboration in Europe
+ Advisory Council for Aeronautics Research in Europe
+ Mobility services for people and goods
+ Guidance and traffic management
+ Passenger mobility
+ Smart mobility and services
+ transport innovation
+ high-speed railway
+ Vehicle design
+ Inland shipping
+ public transportation
+ aviation’s climate impact
+ Road transport
+ On-demand public transport
+ Personal Air Transport
+ Transport
+ transport vulnerability
+ Pipeline transport
+ European Association of Aviation Training and Education Organisations
+ Defrosting of railway infrastructure
+ Inclusive and affordable transport
+ River Information Services
+ jel:L92
+ Increased use of public transport
+ Seamless mobility
+ STRIA
+ trolleybus transport
+ Intelligent Transport System
+ Low-emission alternative energy for transport
+ Shared mobility for people and goods
+ Business model for urban mobility
+ Interoperability of transport systems
+ Cross-border train slot booking
+ Air transport
+ Transport pricing
+ Sustainable transport
+ European Rail Transport Research Advisory Council
+ Alternative aircraft configurations
+ Transport and Mobility
+ Railways applications
+ urban transport
+ Environmental impact of transport
+ urban freight delivery systems
+ Automated Road Transport
+ Alternative fuels in public transport
+ Active LIDAR-sensor for GHG-measurements
+ Autonomous logistics operations
+ Rational use of motorised transport
+ Network and traffic management systems
+ electrification of railway wagons
+ Single European Sky
+ Electrified road systems
+ transportation planning
+ Railway dynamics
+ Motorway of the Sea
+ smart railway communications
+ Maritime transport
+ Environmental- friendly transport
+ Combined transport
+ Connected automated driving technology
+ Innovative freight logistics services
+ automated and shared vehicles
+ Alternative Aircraft Systems
+ Land-use and transport interaction
+ Public transport system
+ Business plan for shared mobility
+ Shared mobility
+ Growing of mobility demand
+ European Road Transport Research Advisory Council
+ WATERBORNE ETP
+ Effective transport management system
+ Short Sea Shipping
+ air traffic management
+ Sea hubs and the motorways of the sea
+ Urban mobility solutions
+ Smart city planning
+ Maritime spatial planning
+ EUropean rail Research Network of Excellence
+ Transport governance
+ ENERGY CONSUMPTION BY THE TRANSPORT SECTOR
+ Integrated urban plan
+ inland waterway services
+ European Conference of Transport Research Institutes
+ air vehicles
+ E-freight
+ Automated Driving
+ Automated ships
+ pricing for cross-border passenger transport
+ Vehicle efficiency
+ Railway transport
+ Electric vehicles
+ Road traffic monitoring
+ Deep sea shipping
+ Circular economy in transport
+ Traffic congestion
+ air transport system
+ Urban logistics
+ Rail transport
+ OpenStreetMap
+ high speed rail
+ Transportation engineering
+ Intermodal travel information
+ Flight Data Recorders
+ Advanced driver assistance systems
+ long distance freight transport
+ Inland waterway transport
+ Smart mobility
+ Mobility integration
+ Personal Rapid Transit system
+ Safety measures & requirements for roads
+ Green rail transport
+ Electrical
+ Vehicle manufacturing
+ Future Airport Layout
+ Rail technologies
+ European Intermodal Research Advisory Council
+ inland navigation
+ Automated urban vehicles
+ ECSS-standards
+ Traveller services
+ Polluting transport
+ Air Traffic Control
+ Cooperative and connected and automated transport
+ Innovative powertrains
+ Quality of transport system and services
+ door-to- door logistics chain
+ Inter-modal aspects of urban mobility
+ travel (and mobility)
+ Innovative freight delivery systems
+ urban freight delivery infrastructures
+
+
+
+ doajarticles::1c5bdf8fca58937894ad1441cca99b76
+
+
+
+ doajarticles::b37a634324a45c821687e6e80e6f53b4
+
+
+
+ doajarticles::4bf64f2a104040e4e055cd9594b2d77c
+
+
+
+ doajarticles::479ca537c12755d1868bbf02938a900c
+
+
+
+ doajarticles::55f31df96a60e2309f45b7c265fcf7a2
+
+
+
+ doajarticles::c52a09891a5301f9986ebbfe3761810c
+
+
+
+ doajarticles::379807bc7f6c71a227ef1651462c414c
+
+
+
+ doajarticles::36069db531a00b85a2e8fb301f4bdc19
+
+
+
+ doajarticles::b6a898da311ded96fabf49c520b80d5d
+
+
+
+ doajarticles::d0753d9180b35a271d8b4a31f449749f
+
+
+
+ doajarticles::172050a92511838393a3fe237ae47e31
+
+
+
+ doajarticles::301ed96c62abb160a3e29796efe5c95c
+
+
+
+ doajarticles::0f4f805b3d842f2c7f1b077c3426fa59
+
+
+
+ doajarticles::ba73728b84437b8d48ae287b867c7215
+
+
+
+ doajarticles::86faef424d804309ccf45f692523aa48
+
+
+
+ doajarticles::73bd758fa41671de70964c3ecba013af
+
+
+
+ doajarticles::e661fc0bdb24af42b740a08f0ddc6cf4
+
+
+
+ doajarticles::a6d3052047d5dbfbd43d95b4afb0f3d7
+
+
+
+ doajarticles::ca61df07089acc53a1569bde6673d82a
+
+
+
+ doajarticles::237dd6f1606600459d0297abd8ed9976
+
+
+
+ doajarticles::fba6191177ede7c51ea1cdf58eae7f8b
+
+
+
+
+
+ jsdtl
+
+
+
+ utc-martrec
+
+
+
+ utc-uti
+
+
+
+ stp
+
+
+
+ c2smart
+
+
+
+ stride-utc
+
+
+
+ crowd4roads
+
+
+
+ lemo
+
+
+
+ imov3d
+
+
+
+ tra2018
+
+
+
+ optimum
+
+
+
+ stars
+
+
+
+ iecteim
+
+
+
+ iccpt2019
+
+
+
+
+ Result: 18
+
+
+
+
+ Result: 19
+
+
+
+
+ Result: 20
+
+
+
+
+ Result: 21
+
+ Sustainability-oriented science policy
+ STI policies
+ science—society relations
+ Science & Technology Policy
+ Innovation policy
+ science policy
+ Policy and Law
+
+
+
+ doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d
+
+
+
+ doajarticles::ae4c7286c79590f19fdca670156ce816
+
+
+
+ doajarticles::0f664bce92ce953e0c7a92068c46bfb3
+
+
+
+ doajarticles::00017183dc4c858fb77541985323a4ef
+
+
+
+ doajarticles::93b306f458cce3d7aaaf58c0a725f4f9
+
+
+
+ doajarticles::9dbf8fbf3e9fe0fe1fc01e55fbd90bfc
+
+
+
+ doajarticles::a2bda8785c863279bba4b8f34827b4c9
+
+
+
+ doajarticles::019a1fcb42c3fea1c1b689df76330b58
+
+
+
+ doajarticles::0daa8281938831e9c82bfed8b55a2975
+
+
+
+ doajarticles::f67ad6d268162079b3abd51a24468744
+
+
+
+ doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d
+
+
+
+ doajarticles::ad114356e196a4a3d84dda59c720dacd
+
+
+
+ doajarticles::01e8a54fdecaaf354c67a2dd74ae7d4f
+
+
+
+ doajarticles::449305f096b10a9464449ff2d0e10e06
+
+
+
+ doajarticles::982c0c0ac378256254cce2fa6572bb6c
+
+
+
+ doajarticles::49d6ed47138884566ce93cf0ccb12c02
+
+
+
+ doajarticles::a98e820dbc2e8ee0fc84ab66f263267c
+
+
+
+ doajarticles::50b1ce37427b36368f8f0f1317e47f83
+
+
+
+ doajarticles::f0ec29b7450b2ac5d0ad45327eeb531a
+
+
+
+ doajarticles::d8d421d3b0349a7aaa93758b27a54e84
+
+
+
+ doajarticles::7ffc35ac5133da01d421ccf8af5b70bc
+
+
+
+
+
+ risis
+
+
+
+
+ Result: 22
+
+ COVID-19
+ Severe acute respiratory syndrome coronavirus 2
+ SARS-CoV-2
+ COVID19
+ 2019 novel coronavirus
+ coronavirus disease 2019
+ HCoV-19
+ mesh:C000657245
+ 2019-nCoV
+ coronavirus disease-19
+ mesh:COVID-19
+ COVID2019
+
+
+
+ opendoar____::358aee4cc897452c00244351e4d91f69
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ re3data_____::7b0ad08687b2c960d5aeef06f811d5e6
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ driver______::bee53aa31dc2cbb538c10c2b65fa5824
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ openaire____::437f4b072b1aa198adcbc35910ff3b98
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ openaire____::081b82f96300b6a6e3d282bad31cb6e2
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ openaire____::9e3be59865b2c1c335d32dae2fe7b254
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ opendoar____::8b6dd7db9af49e67306feb59a8bdc52c
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ share_______::4719356ec8d7d55d3feb384ce879ad6c
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ share_______::bbd802baad85d1fd440f32a7a3a2c2b1
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ opendoar____::6f4922f45568161a8cdf4ad2299f6d23
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
+ {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
+
+
+
+ re3data_____::7980778c78fb4cf0fab13ce2159030dc
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]}
+
+
+ re3data_____::978378def740bbf2bfb420de868c460b
+ {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]}
+
+
+
+
+ chicago-covid-19
+
+
+
+ covid-19-senacyt-panama-sample
+
+
+
+ covid-19-tx-rct-stats-review
+
+
+
+ covid_19_senacyt_abc_panama
+
+
+
+
+
+
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/no_updates/dataset_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/no_updates/dataset_10.json.gz
new file mode 100644
index 000000000..bd29d59ae
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/no_updates/dataset_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz
new file mode 100644
index 000000000..2eb33c5a4
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz
new file mode 100644
index 000000000..ee62cd791
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext/dataset_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext/dataset_10.json.gz
new file mode 100644
index 000000000..cf3c3aa7b
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext/dataset_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject_datasource/dataset_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject_datasource/dataset_10.json.gz
new file mode 100644
index 000000000..fdc76a04c
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject_datasource/dataset_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz
new file mode 100644
index 000000000..ea9e212bd
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/publication/update_datasource/publication_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/publication/update_datasource/publication_10.json.gz
new file mode 100644
index 000000000..99c4015e7
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/publication/update_datasource/publication_10.json.gz differ
diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/software/software_10.json.gz b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/software/software_10.json.gz
new file mode 100644
index 000000000..3dcadf41d
Binary files /dev/null and b/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/software/software_10.json.gz differ
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
index 503e4c504..739c7a462 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
index e96c41066..e5e348642 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@@ -30,8 +30,8 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
-import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
@@ -94,7 +94,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
log.info("Processing orgs...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
- log.info("Processing relations ds <-> orgs ...");
+ log.info("Processing relationsNoRemoval ds <-> orgs ...");
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
log.info("Processing projects <-> orgs ...");
@@ -370,6 +370,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final DataInfo info = dataInfo(
false, null, false, false,
+
qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9");
final List collectedFrom = listKeyValues(
@@ -460,7 +461,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust");
return dataInfo(
+
deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
+
}
private Qualifier prepareQualifierSplitting(final String s) {
@@ -516,6 +519,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
if (arr.length == 3) {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null;
+
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null;
if (issn != null || eissn != null || lissn != null) {
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz
index c2389b767..a5b8c8774 100644
Binary files a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz differ
diff --git a/dhp-workflows/dhp-propagation/pom.xml b/dhp-workflows/dhp-propagation/pom.xml
new file mode 100644
index 000000000..9492fa7c5
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/pom.xml
@@ -0,0 +1,43 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-propagation
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+ org.apache.spark
+ spark-hive_2.11
+ test
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java
new file mode 100644
index 000000000..8d2fede82
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java
@@ -0,0 +1,169 @@
+
+package eu.dnetlib.dhp;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class PropagationConstant {
+ public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional";
+
+ public static final String PROPAGATION_DATA_INFO_TYPE = "propagation";
+
+ public static final String TRUE = "true";
+
+ public static final String DNET_COUNTRY_SCHEMA = "dnet:countries";
+ public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
+ public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
+
+ public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
+ public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
+
+ public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo";
+ public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository";
+
+ public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel";
+ public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation";
+
+ public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel";
+ public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation";
+
+ public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization";
+ public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization";
+
+ public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
+ public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
+
+ public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy";
+
+ public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization";
+ public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation";
+ public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf";
+ public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution";
+
+ public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult";
+
+ public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject";
+ public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome";
+ public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy";
+ public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces";
+
+ public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges";
+
+ public static final String PROPAGATION_AUTHOR_PID = "ORCID";
+
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+ +
+ "from result r " +
+ "lateral view explode(instance) i as inst " +
+ "where r.datainfo.deletedbyinference=false";
+
+ public static Country getCountry(String classid, String classname) {
+ Country nc = new Country();
+ nc.setClassid(classid);
+ nc.setClassname(classname);
+ nc.setSchemename(DNET_COUNTRY_SCHEMA);
+ nc.setSchemeid(DNET_COUNTRY_SCHEMA);
+ nc
+ .setDataInfo(
+ getDataInfo(
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_COUNTRY_INSTREPO_CLASS_ID,
+ PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME));
+ return nc;
+ }
+
+ public static DataInfo getDataInfo(
+ String inference_provenance, String inference_class_id, String inference_class_name) {
+ DataInfo di = new DataInfo();
+ di.setInferred(true);
+ di.setDeletedbyinference(false);
+ di.setTrust("0.85");
+ di.setInferenceprovenance(inference_provenance);
+ di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
+ return di;
+ }
+
+ public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
+ Qualifier pa = new Qualifier();
+ pa.setClassid(inference_class_id);
+ pa.setClassname(inference_class_name);
+ pa.setSchemeid(DNET_SCHEMA_ID);
+ pa.setSchemename(DNET_SCHEMA_NAME);
+ return pa;
+ }
+
+ public static Relation getRelation(
+ String source,
+ String target,
+ String rel_class,
+ String rel_type,
+ String subrel_type,
+ String inference_provenance,
+ String inference_class_id,
+ String inference_class_name) {
+ Relation r = new Relation();
+ r.setSource(source);
+ r.setTarget(target);
+ r.setRelClass(rel_class);
+ r.setRelType(rel_type);
+ r.setSubRelType(subrel_type);
+ r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name));
+ return r;
+ }
+
+ public static String getConstraintList(String text, List constraints) {
+ String ret = " and (" + text + constraints.get(0) + "'";
+ for (int i = 1; i < constraints.size(); i++) {
+ ret += " OR " + text + constraints.get(i) + "'";
+ }
+ ret += ")";
+ return ret;
+ }
+
+ public static void removeOutputDir(SparkSession spark, String path) {
+ HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+ }
+
+ public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
+ return Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ }
+
+ public static Boolean isTest(ArgumentApplicationParser parser) {
+ return Optional
+ .ofNullable(parser.get("isTest"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.FALSE);
+ }
+
+ public static void createCfHbforResult(SparkSession spark) {
+ org.apache.spark.sql.Dataset cfhb = spark.sql(cfHbforResultQuery);
+ cfhb.createOrReplaceTempView("cfhb");
+ }
+
+ public static Dataset readPath(
+ SparkSession spark, String inputPath, Class clazz) {
+ return spark
+ .read()
+ .textFile(inputPath)
+ .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java
new file mode 100644
index 000000000..271cc6bb3
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+
+public class CountrySbs implements Serializable {
+ private String classid;
+ private String classname;
+
+ public String getClassid() {
+ return classid;
+ }
+
+ public void setClassid(String classid) {
+ this.classid = classid;
+ }
+
+ public String getClassname() {
+ return classname;
+ }
+
+ public void setClassname(String classname) {
+ this.classname = classname;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java
new file mode 100644
index 000000000..642192f73
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+
+public class DatasourceCountry implements Serializable {
+ private String dataSourceId;
+ private CountrySbs country;
+
+ public String getDataSourceId() {
+ return dataSourceId;
+ }
+
+ public void setDataSourceId(String dataSourceId) {
+ this.dataSourceId = dataSourceId;
+ }
+
+ public CountrySbs getCountry() {
+ return country;
+ }
+
+ public void setCountry(CountrySbs country) {
+ this.country = country;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java
new file mode 100644
index 000000000..e91a1e48a
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java
@@ -0,0 +1,121 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+/**
+ * For the association of the country to the datasource The association is computed only for datasource of specific type
+ * or having whitelisted ids The country is registered in the Organization associated to the Datasource, so the relation
+ * provides between Datasource and Organization is exploited to get the country for the datasource
+ */
+public class PrepareDatasourceCountryAssociation {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareDatasourceCountryAssociation.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath {}: ", outputPath);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ removeOutputDir(spark, outputPath);
+ prepareDatasourceCountryAssociation(
+ spark,
+ Arrays.asList(parser.get("whitelist").split(";")),
+ Arrays.asList(parser.get("allowedtypes").split(";")),
+ inputPath,
+ outputPath);
+ });
+ }
+
+ private static void prepareDatasourceCountryAssociation(
+ SparkSession spark,
+ List whitelist,
+ List allowedtypes,
+ String inputPath,
+ String outputPath) {
+ String whitelisted = "";
+ for (String i : whitelist) {
+ whitelisted += " OR id = '" + i + "'";
+ }
+
+ Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
+ Dataset relation = readPath(spark, inputPath + "/relation", Relation.class);
+ Dataset organization = readPath(spark, inputPath + "/organization", Organization.class);
+
+ datasource.createOrReplaceTempView("datasource");
+ relation.createOrReplaceTempView("relation");
+ organization.createOrReplaceTempView("organization");
+
+ String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
+ + "FROM ( SELECT id "
+ + " FROM datasource "
+ + " WHERE (datainfo.deletedbyinference = false "
+ + whitelisted
+ + ") "
+ + getConstraintList("datasourcetype.classid = '", allowedtypes)
+ + ") d "
+ + "JOIN ( SELECT source, target "
+ + " FROM relation "
+ + " WHERE relclass = '"
+ + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
+ + "' "
+ + " AND datainfo.deletedbyinference = false ) rel "
+ + "ON d.id = rel.source "
+ + "JOIN (SELECT id, country "
+ + " FROM organization "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND length(country.classid) > 0) o "
+ + "ON o.id = rel.target";
+
+ spark
+ .sql(query)
+ .as(Encoders.bean(DatasourceCountry.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputPath);
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
new file mode 100644
index 000000000..34b376413
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
@@ -0,0 +1,98 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class PrepareResultCountrySet {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
+
+ private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
+ + "FROM ( SELECT id, country "
+ + "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
+ + "UNION ALL "
+ + "SELECT id, country FROM datasource_country "
+ + "JOIN cfhb ON hb = dataSourceId ) tmp "
+ + "GROUP BY id";
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCountrySet.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String datasourcecountrypath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", datasourcecountrypath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ getPotentialResultToUpdate(
+ spark,
+ inputPath,
+ outputPath,
+ datasourcecountrypath,
+ resultClazz);
+ });
+ }
+
+ private static void getPotentialResultToUpdate(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ String datasourcecountrypath,
+ Class resultClazz) {
+
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ result.createOrReplaceTempView("result");
+ // log.info("number of results: {}", result.count());
+ createCfHbforResult(spark);
+
+ Dataset datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
+
+ datasource_country.createOrReplaceTempView("datasource_country");
+ // log.info("datasource_country number : {}", datasource_country.count());
+
+ spark
+ .sql(RESULT_COUNTRYSET_QUERY)
+ .as(Encoders.bean(ResultCountrySet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Append)
+ .json(outputPath);
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java
new file mode 100644
index 000000000..8c29424f2
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultCountrySet implements Serializable {
+ private String resultId;
+ private ArrayList countrySet;
+
+ public String getResultId() {
+ return resultId;
+ }
+
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+
+ public ArrayList getCountrySet() {
+ return countrySet;
+ }
+
+ public void setCountrySet(ArrayList countrySet) {
+ this.countrySet = countrySet;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
new file mode 100644
index 000000000..9dc17701b
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
@@ -0,0 +1,132 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Country;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import scala.Tuple2;
+
+public class SparkCountryPropagationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class);
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkCountryPropagationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String sourcePath = parser.get("sourcePath");
+ log.info("sourcePath: {}", sourcePath);
+
+ String preparedInfoPath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", preparedInfoPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> execPropagation(
+ spark,
+ sourcePath,
+ preparedInfoPath,
+ outputPath,
+ resultClazz,
+ saveGraph));
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String sourcePath,
+ String preparedInfoPath,
+ String outputPath,
+ Class resultClazz,
+ boolean saveGraph) {
+
+ if (saveGraph) {
+ // updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
+ log.info("Reading Graph table from: {}", sourcePath);
+ Dataset res = readPath(spark, sourcePath, resultClazz);
+
+ log.info("Reading prepared info: {}", preparedInfoPath);
+ Dataset prepared = spark
+ .read()
+ .json(preparedInfoPath)
+ .as(Encoders.bean(ResultCountrySet.class));
+
+ res
+ .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
+ .map(getCountryMergeFn(), Encoders.bean(resultClazz))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputPath);
+ }
+ }
+
+ private static MapFunction, R> getCountryMergeFn() {
+ return (MapFunction, R>) t -> {
+ Optional.ofNullable(t._2()).ifPresent(r -> {
+ t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
+ });
+ return t._1();
+ };
+ }
+
+ private static List merge(List c1, List c2) {
+ HashSet countries = c1
+ .stream()
+ .map(c -> c.getClassid())
+ .collect(Collectors.toCollection(HashSet::new));
+
+ return c2
+ .stream()
+ .filter(c -> !countries.contains(c.getClassid()))
+ .map(c -> getCountry(c.getClassid(), c.getClassname()))
+ .collect(Collectors.toList());
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java
new file mode 100644
index 000000000..a5fcab360
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+public class AutoritativeAuthor {
+
+ private String name;
+ private String surname;
+ private String fullname;
+ private String orcid;
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public String getSurname() {
+ return surname;
+ }
+
+ public void setSurname(String surname) {
+ this.surname = surname;
+ }
+
+ public String getFullname() {
+ return fullname;
+ }
+
+ public void setFullname(String fullname) {
+ this.fullname = fullname;
+ }
+
+ public String getOrcid() {
+ return orcid;
+ }
+
+ public void setOrcid(String orcid) {
+ this.orcid = orcid;
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
new file mode 100644
index 000000000..3e16b4b4b
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
@@ -0,0 +1,125 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class PrepareResultOrcidAssociationStep1 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);
+
+ public static void main(String[] args) throws Exception {
+ String jsonConf = IOUtils
+ .toString(
+ PrepareResultOrcidAssociationStep1.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+ log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+ final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
+ log.info("resultType: {}", resultType);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ String inputRelationPath = inputPath + "/relation";
+ log.info("inputRelationPath: {}", inputRelationPath);
+
+ String inputResultPath = inputPath + "/" + resultType;
+ log.info("inputResultPath: {}", inputResultPath);
+
+ String outputResultPath = outputPath + "/" + resultType;
+ log.info("outputResultPath: {}", outputResultPath);
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ prepareInfo(
+ spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
+ });
+ }
+
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputRelationPath,
+ String inputResultPath,
+ String outputResultPath,
+ Class resultClazz,
+ List allowedsemrel) {
+
+ Dataset relation = readPath(spark, inputRelationPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+
+ log.info("Reading Graph table from: {}", inputResultPath);
+ Dataset result = readPath(spark, inputResultPath, resultClazz);
+ result.createOrReplaceTempView("result");
+
+ String query = " select target resultId, author authorList"
+ + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
+ + " from ( "
+ + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
+ + " from result "
+ + " lateral view explode (author) a as MyT "
+ + " lateral view explode (MyT.pid) p as MyP "
+ + " where MyP.qualifier.classid = 'ORCID') tmp "
+ + " group by id) r_t "
+ + " join ("
+ + " select source, target "
+ + " from relation "
+ + " where datainfo.deletedbyinference = false "
+ + getConstraintList(" relclass = '", allowedsemrel)
+ + ") rel_rel "
+ + " on source = id";
+ spark
+ .sql(query)
+ .as(Encoders.bean(ResultOrcidList.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputResultPath);
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java
new file mode 100644
index 000000000..65d8811bc
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java
@@ -0,0 +1,97 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import scala.Tuple2;
+
+public class PrepareResultOrcidAssociationStep2 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class);
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultOrcidAssociationStep2.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ SparkConf conf = new SparkConf();
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ mergeInfo(spark, inputPath, outputPath);
+ });
+ }
+
+ private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {
+
+ Dataset resultOrcidAssoc = readPath(spark, inputPath + "/publication", ResultOrcidList.class)
+ .union(readPath(spark, inputPath + "/dataset", ResultOrcidList.class))
+ .union(readPath(spark, inputPath + "/otherresearchproduct", ResultOrcidList.class))
+ .union(readPath(spark, inputPath + "/software", ResultOrcidList.class));
+
+ resultOrcidAssoc
+ .toJavaRDD()
+ .mapToPair(r -> new Tuple2<>(r.getResultId(), r))
+ .reduceByKey(
+ (a, b) -> {
+ if (a == null) {
+ return b;
+ }
+ if (b == null) {
+ return a;
+ }
+ Set orcid_set = new HashSet<>();
+ a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
+ b
+ .getAuthorList()
+ .stream()
+ .forEach(
+ aa -> {
+ if (!orcid_set.contains(aa.getOrcid())) {
+ a.getAuthorList().add(aa);
+ orcid_set.add(aa.getOrcid());
+ }
+ });
+ return a;
+ })
+ .map(c -> c._2())
+ .map(r -> OBJECT_MAPPER.writeValueAsString(r))
+ .saveAsTextFile(outputPath, GzipCodec.class);
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java
new file mode 100644
index 000000000..54b415d1c
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java
@@ -0,0 +1,27 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+public class ResultOrcidList implements Serializable {
+ String resultId;
+ List authorList = new ArrayList<>();
+
+ public String getResultId() {
+ return resultId;
+ }
+
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+
+ public List getAuthorList() {
+ return authorList;
+ }
+
+ public void setAuthorList(List authorList) {
+ this.authorList = authorList;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java
new file mode 100644
index 000000000..ebb75a5a6
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java
@@ -0,0 +1,199 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import scala.Tuple2;
+
+public class SparkOrcidToResultFromSemRelJob {
+ private static final Logger log = LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob.class);
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkOrcidToResultFromSemRelJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ final String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String possibleUpdates = parser.get("possibleUpdatesPath");
+ log.info("possibleUpdatesPath: {}", possibleUpdates);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ if (saveGraph)
+ execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String possibleUpdatesPath,
+ String inputPath,
+ String outputPath,
+ Class resultClazz) {
+
+ // read possible updates (resultId and list of possible orcid to add
+ Dataset possible_updates = readPath(spark, possibleUpdatesPath, ResultOrcidList.class);
+ // read the result we have been considering
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ // make join result left_outer with possible updates
+
+ result
+ .joinWith(
+ possible_updates,
+ result.col("id").equalTo(possible_updates.col("resultId")),
+ "left_outer")
+ .map(authorEnrichFn(), Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ private static MapFunction, R> authorEnrichFn() {
+ return (MapFunction, R>) value -> {
+ R ret = value._1();
+ Optional rol = Optional.ofNullable(value._2());
+ if (rol.isPresent()) {
+ List toenrich_author = ret.getAuthor();
+ List autoritativeAuthors = rol.get().getAuthorList();
+ for (Author author : toenrich_author) {
+ if (!containsAllowedPid(author)) {
+ enrichAuthor(author, autoritativeAuthors);
+ }
+ }
+ }
+
+ return ret;
+ };
+ }
+
+ private static void enrichAuthor(Author a, List au) {
+ for (AutoritativeAuthor aa : au) {
+ if (enrichAuthor(aa, a)) {
+ return;
+ }
+ }
+ }
+
+ private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
+ boolean toaddpid = false;
+
+ if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) {
+ if (StringUtils.isNoneEmpty(author.getSurname())) {
+ if (autoritative_author
+ .getSurname()
+ .trim()
+ .equalsIgnoreCase(author.getSurname().trim())) {
+
+ // have the same surname. Check the name
+ if (StringUtils.isNoneEmpty(autoritative_author.getName())) {
+ if (StringUtils.isNoneEmpty(author.getName())) {
+ if (autoritative_author
+ .getName()
+ .trim()
+ .equalsIgnoreCase(author.getName().trim())) {
+ toaddpid = true;
+ }
+ // they could be differently written (i.e. only the initials of the name
+ // in one of the two
+ if (autoritative_author
+ .getName()
+ .trim()
+ .substring(0, 0)
+ .equalsIgnoreCase(author.getName().trim().substring(0, 0))) {
+ toaddpid = true;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (toaddpid) {
+ StructuredProperty p = new StructuredProperty();
+ p.setValue(autoritative_author.getOrcid());
+ p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID));
+ p
+ .setDataInfo(
+ getDataInfo(
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID,
+ PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME));
+
+ Optional> authorPid = Optional.ofNullable(author.getPid());
+ if (authorPid.isPresent()) {
+ authorPid.get().add(p);
+ } else {
+ author.setPid(Lists.newArrayList(p));
+ }
+
+ }
+ return toaddpid;
+ }
+
+ private static boolean containsAllowedPid(Author a) {
+ Optional> pids = Optional.ofNullable(a.getPid());
+ if (!pids.isPresent()) {
+ return false;
+ }
+ for (StructuredProperty pid : pids.get()) {
+ if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) {
+ return true;
+ }
+ }
+ return false;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java
new file mode 100644
index 000000000..05dcdc692
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java
@@ -0,0 +1,126 @@
+
+package eu.dnetlib.dhp.projecttoresult;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.PropagationConstant.getConstraintList;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class PrepareProjectResultsAssociation {
+ private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareProjectResultsAssociation.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String potentialUpdatePath = parser.get("potentialUpdatePath");
+ log.info("potentialUpdatePath {}: ", potentialUpdatePath);
+
+ String alreadyLinkedPath = parser.get("alreadyLinkedPath");
+ log.info("alreadyLinkedPath: {} ", alreadyLinkedPath);
+
+ final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+ log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ prepareResultProjProjectResults(
+ spark,
+ inputPath,
+ potentialUpdatePath,
+ alreadyLinkedPath,
+ allowedsemrel);
+ });
+ }
+
+ private static void prepareResultProjProjectResults(
+ SparkSession spark,
+ String inputPath,
+ String potentialUpdatePath,
+ String alreadyLinkedPath,
+ List allowedsemrel) {
+
+ Dataset relation = readPath(spark, inputPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+
+ String resproj_relation_query = "SELECT source, target "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND relClass = '"
+ + RELATION_RESULT_PROJECT_REL_CLASS
+ + "'";
+
+ Dataset resproj_relation = spark.sql(resproj_relation_query);
+ resproj_relation.createOrReplaceTempView("resproj_relation");
+
+ String potential_update_query = "SELECT resultId, collect_set(projectId) projectSet "
+ + "FROM ( "
+ + "SELECT r1.target resultId, r2.target projectId "
+ + " FROM (SELECT source, target "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + getConstraintList(" relClass = '", allowedsemrel)
+ + " ) r1"
+ + " JOIN resproj_relation r2 "
+ + " ON r1.source = r2.source "
+ + " ) tmp "
+ + "GROUP BY resultId ";
+
+ spark
+ .sql(potential_update_query)
+ .as(Encoders.bean(ResultProjectSet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(potentialUpdatePath);
+
+ String result_projectset_query = "SELECT source resultId, collect_set(target) projectSet "
+ + "FROM resproj_relation "
+ + "GROUP BY source";
+
+ spark
+ .sql(result_projectset_query)
+ .as(Encoders.bean(ResultProjectSet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(alreadyLinkedPath);
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java
new file mode 100644
index 000000000..1d5280874
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.projecttoresult;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultProjectSet implements Serializable {
+ private String resultId;
+ private ArrayList projectSet;
+
+ public String getResultId() {
+ return resultId;
+ }
+
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+
+ public ArrayList getProjectSet() {
+ return projectSet;
+ }
+
+ public void setProjectSet(ArrayList project) {
+ this.projectSet = project;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java
new file mode 100644
index 000000000..36694b3dd
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java
@@ -0,0 +1,147 @@
+
+package eu.dnetlib.dhp.projecttoresult;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+public class SparkResultToProjectThroughSemRelJob {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkResultToProjectThroughSemRelJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath {}: ", outputPath);
+
+ final String potentialUpdatePath = parser.get("potentialUpdatePath");
+ log.info("potentialUpdatePath {}: ", potentialUpdatePath);
+
+ final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
+ log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
+
+ final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
+ log.info("saveGraph: {}", saveGraph);
+
+ SparkConf conf = new SparkConf();
+
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ execPropagation(
+ spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String outputPath,
+ String alreadyLinkedPath,
+ String potentialUpdatePath,
+ Boolean saveGraph) {
+
+ Dataset toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
+ Dataset alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);
+
+ if (saveGraph) {
+ toaddrelations
+ .joinWith(
+ alreadyLinked,
+ toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")),
+ "left_outer")
+ .flatMap(mapRelationRn(), Encoders.bean(Relation.class))
+ .write()
+ .mode(SaveMode.Append)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+ }
+
+ private static FlatMapFunction, Relation> mapRelationRn() {
+ return (FlatMapFunction, Relation>) value -> {
+ List new_relations = new ArrayList<>();
+ ResultProjectSet potential_update = value._1();
+ Optional already_linked = Optional.ofNullable(value._2());
+ if (already_linked.isPresent()) {
+ already_linked
+ .get()
+ .getProjectSet()
+ .stream()
+ .forEach(
+ (p -> {
+ if (potential_update
+ .getProjectSet()
+ .contains(p)) {
+ potential_update.getProjectSet().remove(p);
+ }
+ }));
+ }
+ String resId = potential_update.getResultId();
+ potential_update
+ .getProjectSet()
+ .stream()
+ .forEach(
+ projectId -> {
+ new_relations
+ .add(
+ getRelation(
+ resId,
+ projectId,
+ RELATION_RESULT_PROJECT_REL_CLASS,
+ RELATION_RESULTPROJECT_REL_TYPE,
+ RELATION_RESULTPROJECT_SUBREL_TYPE,
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
+ new_relations
+ .add(
+ getRelation(
+ projectId,
+ resId,
+ RELATION_PROJECT_RESULT_REL_CLASS,
+ RELATION_RESULTPROJECT_REL_TYPE,
+ RELATION_RESULTPROJECT_SUBREL_TYPE,
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
+ PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
+ });
+ return new_relations.iterator();
+ };
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java
new file mode 100644
index 000000000..7d786058a
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java
@@ -0,0 +1,21 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+public class OrganizationMap extends HashMap> {
+
+ public OrganizationMap() {
+ super();
+ }
+
+ public List get(String key) {
+
+ if (super.get(key) == null) {
+ return new ArrayList<>();
+ }
+ return super.get(key);
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java
new file mode 100644
index 000000000..e2d4d5687
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java
@@ -0,0 +1,130 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+public class PrepareResultCommunitySet {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class);
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCommunitySet.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final OrganizationMap organizationMap = new Gson()
+ .fromJson(
+ parser.get("organizationtoresultcommunitymap"),
+ OrganizationMap.class);
+ log.info("organizationMap: {}", new Gson().toJson(organizationMap));
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ prepareInfo(spark, inputPath, outputPath, organizationMap);
+ });
+ }
+
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ OrganizationMap organizationMap) {
+
+ Dataset relation = readPath(spark, inputPath, Relation.class);
+ relation.createOrReplaceTempView("relation");
+
+ String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges "
+ + "FROM (SELECT source, target "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND relClass = '"
+ + RELATION_RESULT_ORGANIZATION_REL_CLASS
+ + "') result_organization "
+ + "LEFT JOIN (SELECT source, collect_set(target) org_set "
+ + " FROM relation "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND relClass = '"
+ + RELATION_REPRESENTATIVERESULT_RESULT_CLASS
+ + "' "
+ + " GROUP BY source) organization_organization "
+ + "ON result_organization.target = organization_organization.source ";
+
+ Dataset result_organizationset = spark
+ .sql(query)
+ .as(Encoders.bean(ResultOrganizations.class));
+
+ result_organizationset
+ .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
+ .filter(Objects::nonNull)
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ private static MapFunction mapResultCommunityFn(
+ OrganizationMap organizationMap) {
+ return (MapFunction) value -> {
+ String rId = value.getResultId();
+ Optional> orgs = Optional.ofNullable(value.getMerges());
+ String oTarget = value.getOrgId();
+ Set communitySet = new HashSet<>();
+ if (organizationMap.containsKey(oTarget)) {
+ communitySet.addAll(organizationMap.get(oTarget));
+ }
+ if (orgs.isPresent())
+ for (String oId : orgs.get()) {
+ if (organizationMap.containsKey(oId)) {
+ communitySet.addAll(organizationMap.get(oId));
+ }
+ }
+ if (communitySet.size() > 0) {
+ ResultCommunityList rcl = new ResultCommunityList();
+ rcl.setResultId(rId);
+ ArrayList communityList = new ArrayList<>();
+ communityList.addAll(communitySet);
+ rcl.setCommunityList(communityList);
+ return rcl;
+ }
+ return null;
+ };
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java
new file mode 100644
index 000000000..e3275745d
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultCommunityList implements Serializable {
+ private String resultId;
+ private ArrayList communityList;
+
+ public String getResultId() {
+ return resultId;
+ }
+
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+
+ public ArrayList getCommunityList() {
+ return communityList;
+ }
+
+ public void setCommunityList(ArrayList communityList) {
+ this.communityList = communityList;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java
new file mode 100644
index 000000000..3ea9d41d6
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java
@@ -0,0 +1,35 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultOrganizations implements Serializable {
+ private String resultId;
+ private String orgId;
+ private ArrayList merges;
+
+ public String getResultId() {
+ return resultId;
+ }
+
+ public void setResultId(String resultId) {
+ this.resultId = resultId;
+ }
+
+ public String getOrgId() {
+ return orgId;
+ }
+
+ public void setOrgId(String orgId) {
+ this.orgId = orgId;
+ }
+
+ public ArrayList getMerges() {
+ return merges;
+ }
+
+ public void setMerges(ArrayList merges) {
+ this.merges = merges;
+ }
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java
new file mode 100644
index 000000000..71275cc7f
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java
@@ -0,0 +1,137 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromorganization;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+import scala.Tuple2;
+
+public class SparkResultToCommunityFromOrganizationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob.class);
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkResultToCommunityFromOrganizationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String possibleupdatespath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", possibleupdatespath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ if (saveGraph)
+ execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ Class resultClazz,
+ String possibleUpdatesPath) {
+
+ Dataset possibleUpdates = readPath(spark, possibleUpdatesPath, ResultCommunityList.class);
+ Dataset result = readPath(spark, inputPath, resultClazz);
+
+ result
+ .joinWith(
+ possibleUpdates,
+ result.col("id").equalTo(possibleUpdates.col("resultId")),
+ "left_outer")
+ .map(resultCommunityFn(), Encoders.bean(resultClazz))
+ .write()
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath);
+ }
+
+ private static MapFunction, R> resultCommunityFn() {
+ return (MapFunction, R>) value -> {
+ R ret = value._1();
+ Optional rcl = Optional.ofNullable(value._2());
+ if (rcl.isPresent()) {
+ ArrayList communitySet = rcl.get().getCommunityList();
+ List contextList = ret
+ .getContext()
+ .stream()
+ .map(con -> con.getId())
+ .collect(Collectors.toList());
+ Result res = new Result();
+ res.setId(ret.getId());
+ List propagatedContexts = new ArrayList<>();
+ for (String cId : communitySet) {
+ if (!contextList.contains(cId)) {
+ Context newContext = new Context();
+ newContext.setId(cId);
+ newContext
+ .setDataInfo(
+ Arrays
+ .asList(
+ getDataInfo(
+ PROPAGATION_DATA_INFO_TYPE,
+ PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID,
+ PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME)));
+ propagatedContexts.add(newContext);
+ }
+ }
+ res.setContext(propagatedContexts);
+ ret.mergeFrom(res);
+ }
+ return ret;
+ };
+ }
+
+}
diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
new file mode 100644
index 000000000..4f5ac2552
--- /dev/null
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
@@ -0,0 +1,167 @@
+
+package eu.dnetlib.dhp.resulttocommunityfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+public class PrepareResultCommunitySetStep1 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class);
+
+ private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')"
+ + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']"
+ + " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'"
+ + " return $x//CONFIGURATION/context/@id/string()";
+
+ /**
+ * associates to each result the set of community contexts they are associated to; associates to each target of a
+ * relation with allowed semantics the set of community context it could possibly inherit from the source of the
+ * relation
+ */
+ // TODO
+ private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
+ + "from (select id, collect_set(co.id) community_context "
+ + " from result "
+ + " lateral view explode (context) c as co "
+ + " where datainfo.deletedbyinference = false %s group by id) p "
+ + " JOIN "
+ + " (select source, target from relation "
+ + " where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";
+
+ /**
+ * a dataset for example could be linked to more than one publication. For each publication linked to that dataset
+ * the previous query will produce a row: targetId set of community context the target could possibly inherit with
+ * the following query there will be a single row for each result linked to more than one result of the result type
+ * currently being used
+ */
+ // TODO
+ private static final String RESULT_COMMUNITY_LIST_QUERY = "select resultId , collect_set(co) communityList "
+ + "from result_context "
+ + "lateral view explode (community_context) c as co "
+ + "where length(co) > 0 "
+ + "group by resultId";
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCommunitySetStep1.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+ log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+ final String isLookupUrl = parser.get("isLookUpUrl");
+ log.info("isLookupUrl: {}", isLookupUrl);
+
+ final List communityIdList = getCommunityList(isLookupUrl);
+ log.info("communityIdList: {}", new Gson().toJson(communityIdList));
+
+ final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
+ log.info("resultType: {}", resultType);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ if (isTest(parser)) {
+ removeOutputDir(spark, outputPath);
+ }
+ prepareInfo(
+ spark,
+ inputPath,
+ outputPath,
+ allowedsemrel,
+ resultClazz,
+ resultType,
+ communityIdList);
+ });
+ }
+
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ List allowedsemrel,
+ Class