diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index c1d6e1b5b..c7cb11b08 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -83,6 +83,10 @@
com.jayway.jsonpath
json-path
+
+ org.postgresql
+ postgresql
+
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
similarity index 95%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java
rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
index 94f17aad5..cedc9bd4d 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common;
import java.io.Closeable;
import java.io.IOException;
@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
- private final Connection connection;
+ private Connection connection;
public DbClient(final String address, final String login, final String password) {
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
index accc06d12..e32dd10fa 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java
@@ -13,6 +13,7 @@ public class ModelConstants {
public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date";
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
+ public static final String DNET_COUNTRY_TYPE = "dnet:countries";
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
@@ -49,6 +50,13 @@ public class ModelConstants {
public static final String HAS_PARTICIPANT = "hasParticipant";
public static final String IS_PARTICIPANT = "isParticipant";
+ public static final String RESULT_ORGANIZATION = "resultOrganization";
+ public static final String AFFILIATION = "affiliation";
+ public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf";
+ public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution";
+
+ public static final String MERGES = "merges";
+
public static final String UNKNOWN = "UNKNOWN";
public static final String NOT_AVAILABLE = "not available";
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
index cdde37fd4..9ee7c2deb 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
@@ -1,10 +1,15 @@
package eu.dnetlib.dhp.schema.common;
+import static com.google.common.base.Preconditions.checkArgument;
+
import java.util.Map;
+import java.util.Objects;
import java.util.Optional;
import java.util.function.Function;
+import org.apache.commons.lang3.StringUtils;
+
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.*;
@@ -13,7 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class ModelSupport {
/** Defines the mapping between the actual entity type and the main entity type */
- private static final Map entityMapping = Maps.newHashMap();
+ private static Map entityMapping = Maps.newHashMap();
static {
entityMapping.put(EntityType.publication, MainEntityType.result);
@@ -53,6 +58,232 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class);
}
+ /** Two-digit identifier prefix associated with each entity type name. */
+ public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
+
+ static {
+     entityIdPrefix.put("datasource", "10");
+     entityIdPrefix.put("organization", "20");
+     entityIdPrefix.put("project", "40");
+     entityIdPrefix.put("result", "50");
+ }
+
+ /**
+  * Relation semantics indexed by the encoding {@code <relType>_<subRelType>_<relClass>}. Each entry carries the
+  * relation class, the class of its inverse, and the relType / subRelType shared by the two directions.
+  */
+ public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
+
+ static {
+     registerInverse("personResult", "authorship", "isAuthorOf", "hasAuthor");
+     registerInverse("personResult", "authorship", "hasAuthor", "isAuthorOf");
+     registerInverse("projectOrganization", "participation", "isParticipant", "hasParticipant");
+     registerInverse("projectOrganization", "participation", "hasParticipant", "isParticipant");
+     registerInverse("resultOrganization", "affiliation", "hasAuthorInstitution", "isAuthorInstitutionOf");
+     registerInverse("resultOrganization", "affiliation", "isAuthorInstitutionOf", "hasAuthorInstitution");
+     registerInverse("organizationOrganization", "dedup", "merges", "isMergedIn");
+     registerInverse("organizationOrganization", "dedup", "isMergedIn", "merges");
+     registerInverse("organizationOrganization", "dedupSimilarity", "isSimilarTo", "isSimilarTo");
+
+     registerInverse("resultProject", "outcome", "isProducedBy", "produces");
+     registerInverse("resultProject", "outcome", "produces", "isProducedBy");
+     registerInverse("projectPerson", "contactPerson", "isContact", "hasContact");
+     // relType / subRelType were previously registered as personPerson / coAuthorship (copy-paste slip):
+     // they must match the key and the isContact entry above
+     registerInverse("projectPerson", "contactPerson", "hasContact", "isContact");
+     // NOTE(review): the key spells "isCoauthorOf" while the relation class is "isCoAuthorOf"; the key is kept
+     // verbatim because lookups use the stored encoding - confirm which spelling the data actually uses
+     registerInverse(
+         "personPerson_coAuthorship_isCoauthorOf", "personPerson", "coAuthorship", "isCoAuthorOf", "isCoAuthorOf");
+     registerInverse("personPerson", "dedup", "merges", "isMergedIn");
+     registerInverse("personPerson", "dedup", "isMergedIn", "merges");
+     registerInverse("personPerson", "dedupSimilarity", "isSimilarTo", "isSimilarTo");
+     registerInverse("datasourceOrganization", "provision", "isProvidedBy", "provides");
+     registerInverse("datasourceOrganization", "provision", "provides", "isProvidedBy");
+     registerInverse("resultResult", "similarity", "hasAmongTopNSimilarDocuments", "isAmongTopNSimilarDocuments");
+     registerInverse("resultResult", "similarity", "isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
+     registerInverse("resultResult", "relationship", "isRelatedTo", "isRelatedTo");
+     registerInverse("resultResult", "supplement", "isSupplementTo", "isSupplementedBy");
+     registerInverse("resultResult", "supplement", "isSupplementedBy", "isSupplementTo");
+     registerInverse("resultResult", "part", "isPartOf", "hasPart");
+     registerInverse("resultResult", "part", "hasPart", "isPartOf");
+     registerInverse("resultResult", "dedup", "merges", "isMergedIn");
+     registerInverse("resultResult", "dedup", "isMergedIn", "merges");
+     registerInverse("resultResult", "dedupSimilarity", "isSimilarTo", "isSimilarTo");
+ }
+
+ /** Registers a relation under the key {@code relType_subRelType_relation}, so key and content cannot diverge. */
+ private static void registerInverse(String relType, String subRelType, String relation, String inverse) {
+     registerInverse(String.join("_", relType, subRelType, relation), relType, subRelType, relation, inverse);
+ }
+
+ /** Registers a relation under an explicitly provided key. */
+ private static void registerInverse(String key, String relType, String subRelType, String relation,
+     String inverse) {
+     relationInverseMap
+         .put(
+             key, new RelationInverse()
+                 .setRelation(relation)
+                 .setInverse(inverse)
+                 .setRelType(relType)
+                 .setSubReltype(subRelType));
+ }
+
private static final String schemeTemplate = "dnet:%s_%s_relations";
private ModelSupport() {
@@ -153,6 +384,21 @@ public class ModelSupport {
entityMapping.get(EntityType.valueOf(targetType)).name());
}
+ public static String tableIdentifier(String dbName, String tableName) {
+     // both parts are mandatory: null, empty and whitespace-only names are rejected
+     checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty");
+     checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty");
+
+     // qualified name in the form <db>.<table>
+     return dbName + "." + tableName;
+ }
+
+ public static String tableIdentifier(String dbName, Class clazz) {
+     checkArgument(clazz != null, "clazz is needed to derive the table name, thus cannot be null");
+
+     // the table name is the lower-cased simple name of the class
+     final String tableName = clazz.getSimpleName().toLowerCase();
+     return tableIdentifier(dbName, tableName);
+ }
+
public static Function idFn() {
return x -> {
if (isSubClass(x, Relation.class)) {
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java
new file mode 100644
index 000000000..4757c637e
--- /dev/null
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java
@@ -0,0 +1,46 @@
+
+package eu.dnetlib.dhp.schema.common;
+
+/**
+ * Describes one relation class together with the class of its inverse and the relType / subRelType it belongs to.
+ * Setters return {@code this} so that instances can be populated with chained calls.
+ */
+public class RelationInverse {
+
+    private String relation;
+    private String inverse;
+    private String relType;
+    private String subReltype;
+
+    /** @return the relation class */
+    public String getRelation() {
+        return this.relation;
+    }
+
+    /** Sets the relation class and returns this instance. */
+    public RelationInverse setRelation(String relation) {
+        this.relation = relation;
+        return this;
+    }
+
+    /** @return the class of the inverse relation */
+    public String getInverse() {
+        return this.inverse;
+    }
+
+    /** Sets the class of the inverse relation and returns this instance. */
+    public RelationInverse setInverse(String inverse) {
+        this.inverse = inverse;
+        return this;
+    }
+
+    /** @return the relation type */
+    public String getRelType() {
+        return this.relType;
+    }
+
+    /** Sets the relation type and returns this instance. */
+    public RelationInverse setRelType(String relType) {
+        this.relType = relType;
+        return this;
+    }
+
+    /** @return the relation sub-type */
+    public String getSubReltype() {
+        return this.subReltype;
+    }
+
+    /** Sets the relation sub-type and returns this instance. */
+    public RelationInverse setSubReltype(String subReltype) {
+        this.subReltype = subReltype;
+        return this;
+    }
+
+}
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
index b9bd4c5f0..231fb1e60 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java
@@ -2,8 +2,7 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
-import java.util.List;
-import java.util.Objects;
+import java.util.*;
public class Author implements Serializable {
@@ -86,4 +85,5 @@ public class Author implements Serializable {
public int hashCode() {
return Objects.hash(fullname, name, surname, rank, pid, affiliation);
}
+
}
diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java
index 90d573ac0..e55c0eb7b 100644
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java
@@ -523,7 +523,9 @@ public class ProtoConverter implements Serializable {
}
private static Context mapContext(ResultProtos.Result.Context context) {
-
+ if (context == null || StringUtils.isBlank(context.getId())) {
+ return null;
+ }
final Context entity = new Context();
entity.setId(context.getId());
entity
@@ -537,6 +539,10 @@ public class ProtoConverter implements Serializable {
}
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
+ if (kv == null || (StringUtils.isBlank(kv.getKey()) && StringUtils.isBlank(kv.getValue()))) { // conditional && instead of bitwise &; parentheses spell out the original grouping
+     return null;
+ }
+
final KeyValue keyValue = new KeyValue();
keyValue.setKey(kv.getKey());
keyValue.setValue(kv.getValue());
@@ -575,6 +581,10 @@ public class ProtoConverter implements Serializable {
}
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
+ if (sp == null || StringUtils.isBlank(sp.getValue())) { // must short-circuit: the non-conditional | evaluated sp.getValue() even when sp was null
+     return null;
+ }
+
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(sp.getValue());
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
@@ -611,6 +621,10 @@ public class ProtoConverter implements Serializable {
}
public static Field mapStringField(FieldTypeProtos.StringField s) {
+ if (s == null || StringUtils.isBlank(s.getValue())) {
+ return null;
+ }
+
final Field stringField = new Field<>();
stringField.setValue(s.getValue());
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
@@ -618,19 +632,16 @@ public class ProtoConverter implements Serializable {
}
public static Field mapBoolField(FieldTypeProtos.BoolField b) {
+ if (b == null) { // nothing to convert: a null input yields a null Field
+ return null;
+ }
+
final Field booleanField = new Field<>();
booleanField.setValue(b.getValue()); // copy the boolean value
booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); // data info is converted through mapDataInfo
return booleanField;
}
- public static Field mapIntField(FieldTypeProtos.IntField b) {
- final Field entity = new Field<>();
- entity.setValue(b.getValue());
- entity.setDataInfo(mapDataInfo(b.getDataInfo()));
- return entity;
- }
-
public static Journal mapJournal(FieldTypeProtos.Journal j) {
final Journal journal = new Journal();
journal.setConferencedate(j.getConferencedate());
diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml
new file mode 100644
index 000000000..37abc22f6
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/pom.xml
@@ -0,0 +1,36 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-blacklist
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
new file mode 100644
index 000000000..b4bcc509e
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java
@@ -0,0 +1,94 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Spark job that reads the relations stored under {@code sourcePath} and writes to {@code outputPath} only those
+ * with relclass {@code merges} that are not flagged as deleted by inference.
+ */
+public class PrepareMergedRelationJob {
+
+    private static final Logger log = LoggerFactory.getLogger(PrepareMergedRelationJob.class);
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /**
+     * Parses the arguments declared in {@code input_preparerelation_parameters.json}, clears the output directory
+     * and runs the selection inside the session provided by {@code runWithSparkHiveSession}.
+     *
+     * @param args command line arguments (sourcePath, outputPath, hive_metastore_uris, isSparkSessionManaged)
+     * @throws Exception if the parameters cannot be loaded/parsed or the job fails
+     */
+    public static void main(String[] args) throws Exception {
+
+        String jsonConfiguration = IOUtils
+            .toString(
+                PrepareMergedRelationJob.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json"));
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+        parser.parseArgument(args);
+
+        // the session is considered managed unless the caller explicitly says otherwise
+        Boolean isSparkSessionManaged = Optional
+            .ofNullable(parser.get("isSparkSessionManaged"))
+            .map(Boolean::valueOf)
+            .orElse(Boolean.TRUE);
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+        String inputPath = parser.get("sourcePath");
+        log.info("inputPath: {}", inputPath);
+
+        String outputPath = parser.get("outputPath");
+        log.info("outputPath: {} ", outputPath);
+
+        SparkConf conf = new SparkConf();
+        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+        runWithSparkHiveSession(
+            conf,
+            isSparkSessionManaged,
+            spark -> {
+                removeOutputDir(spark, outputPath);
+                selectMergesRelations(spark, inputPath, outputPath);
+            });
+    }
+
+    /** Keeps the 'merges' relations not deleted by inference and stores them as gzip-compressed JSON. */
+    private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
+
+        Dataset<Relation> relation = readRelations(spark, inputPath);
+
+        relation
+            .filter("relclass = 'merges' and datainfo.deletedbyinference=false")
+            .write()
+            .mode(SaveMode.Overwrite)
+            .option("compression", "gzip")
+            .json(outputPath);
+    }
+
+    /**
+     * Reads a text file holding one JSON-serialized {@link Relation} per line.
+     *
+     * @param spark the active session
+     * @param inputPath location of the text file(s)
+     * @return the relations parsed from the input
+     */
+    public static org.apache.spark.sql.Dataset<Relation> readRelations(
+        SparkSession spark, String inputPath) {
+        return spark
+            .read()
+            .textFile(inputPath)
+            .map(
+                (MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
+                Encoders.bean(Relation.class));
+    }
+
+    /** Removes {@code path} using the Hadoop configuration of the given session. */
+    private static void removeOutputDir(SparkSession spark, String path) {
+        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+    }
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
new file mode 100644
index 000000000..2caa66db4
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
@@ -0,0 +1,141 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.function.Function;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.DbClient;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Reads the accepted blacklist entries from the database and writes, for each of them, the direct and the inverse
+ * {@link Relation} as JSON lines to a file on HDFS.
+ */
+public class ReadBlacklistFromDB implements Closeable {
+
+    private static final Log log = LogFactory.getLog(ReadBlacklistFromDB.class);
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    private final static String query = "SELECT source_type, unnest(original_source_objects) as source, " +
+        "target_type, unnest(original_target_objects) as target, " +
+        "relationship FROM blacklist WHERE status = 'ACCEPTED'";
+
+    private final DbClient dbClient;
+    private final Configuration conf;
+    private final BufferedWriter writer;
+
+    /**
+     * Parses the arguments declared in {@code blacklist_parameters.json} and dumps the blacklist to
+     * {@code <hdfsPath>/blacklist}.
+     *
+     * @param args command line arguments
+     * @throws Exception if the parameters cannot be parsed or the export fails
+     */
+    public static void main(final String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+            IOUtils
+                .toString(
+                    ReadBlacklistFromDB.class
+                        .getResourceAsStream(
+                            "/eu/dnetlib/dhp/blacklist/blacklist_parameters.json")));
+
+        parser.parseArgument(args);
+
+        final String dbUrl = parser.get("postgresUrl");
+        final String dbUser = parser.get("postgresUser");
+        final String dbPassword = parser.get("postgresPassword");
+        final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
+        final String hdfsNameNode = parser.get("hdfsNameNode");
+
+        try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
+            dbPassword)) {
+
+            log.info("Processing blacklist...");
+            rbl.execute(query, rbl::processBlacklistEntry);
+
+        }
+    }
+
+    /**
+     * Runs {@code sql} and writes every relation the producer derives from each result row.
+     *
+     * @param sql the query to execute
+     * @param producer maps the current row of the result set to the relations to be written
+     * @throws Exception propagated from the database client
+     */
+    public void execute(final String sql, final Function<ResultSet, List<Relation>> producer) throws Exception {
+
+        final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(this::writeRelation);
+
+        dbClient.processResults(sql, consumer);
+    }
+
+    /**
+     * Builds the direct and the inverse relation described by the current blacklist row.
+     *
+     * @param rs result set positioned on the row to convert
+     * @return a two-element list: the direct relation followed by its inverse
+     * @throws RuntimeException wrapping any failure, including an unknown relationship encoding
+     */
+    public List<Relation> processBlacklistEntry(ResultSet rs) {
+        try {
+            Relation direct = new Relation();
+            Relation inverse = new Relation();
+
+            // NOTE(review): an unmapped source/target type yields a null prefix, i.e. ids starting with "null|"
+            String source_prefix = ModelSupport.entityIdPrefix.get(rs.getString("source_type"));
+            String target_prefix = ModelSupport.entityIdPrefix.get(rs.getString("target_type"));
+
+            String source_direct = source_prefix + "|" + rs.getString("source");
+            direct.setSource(source_direct);
+            inverse.setTarget(source_direct);
+
+            String target_direct = target_prefix + "|" + rs.getString("target");
+            direct.setTarget(target_direct);
+            inverse.setSource(target_direct);
+
+            String encoding = rs.getString("relationship");
+            RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
+            if (ri == null) {
+                // previously surfaced as a bare NullPointerException; still wrapped in a RuntimeException below
+                throw new IllegalArgumentException("unknown blacklist relationship encoding: " + encoding);
+            }
+            direct.setRelClass(ri.getRelation());
+            inverse.setRelClass(ri.getInverse());
+            direct.setRelType(ri.getRelType());
+            inverse.setRelType(ri.getRelType());
+            direct.setSubRelType(ri.getSubReltype());
+            inverse.setSubRelType(ri.getSubReltype());
+
+            return Arrays.asList(direct, inverse);
+
+        } catch (final Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Closes the database client and the HDFS writer; the writer is closed even if the client fails to close. */
+    @Override
+    public void close() throws IOException {
+        try {
+            dbClient.close();
+        } finally {
+            // closing the writer also flushes the buffered relations
+            writer.close();
+        }
+    }
+
+    /**
+     * Connects to the database and opens the HDFS file, appending to it when it already exists.
+     *
+     * @param hdfsPath path of the file to write
+     * @param hdfsNameNode value used for {@code fs.defaultFS}
+     * @param dbUrl JDBC url of the database
+     * @param dbUser database user
+     * @param dbPassword database password
+     * @throws Exception if the connection or the HDFS file cannot be opened
+     */
+    public ReadBlacklistFromDB(
+        final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
+        throws Exception {
+
+        this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
+        this.conf = new Configuration();
+
+        final BufferedWriter hdfsWriter;
+        try {
+            this.conf.set("fs.defaultFS", hdfsNameNode);
+            FileSystem fileSystem = FileSystem.get(this.conf);
+            Path hdfsWritePath = new Path(hdfsPath);
+            FSDataOutputStream fsDataOutputStream = fileSystem.exists(hdfsWritePath)
+                ? fileSystem.append(hdfsWritePath)
+                : fileSystem.create(hdfsWritePath);
+
+            hdfsWriter = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
+        } catch (final Exception e) {
+            // do not leak the already opened database connection when the HDFS side fails
+            try {
+                dbClient.close();
+            } catch (final Exception closeFailure) {
+                e.addSuppressed(closeFailure);
+            }
+            throw e;
+        }
+        this.writer = hdfsWriter;
+    }
+
+    /** Serializes the relation as JSON and appends it to the output as a single line. */
+    protected void writeRelation(final Relation r) {
+        try {
+            writer.write(OBJECT_MAPPER.writeValueAsString(r));
+            writer.newLine();
+        } catch (final Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
new file mode 100644
index 000000000..92289ec2d
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java
@@ -0,0 +1,152 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Objects;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+/**
+ * Spark job that removes from the input relations those matching a blacklisted relation, after the blacklisted
+ * source/target identifiers have been rewritten using the 'merges' relations.
+ */
+public class SparkRemoveBlacklistedRelationJob {
+    private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /**
+     * Parses the arguments declared in {@code sparkblacklist_parameters.json}, clears the output directory and
+     * runs the removal.
+     *
+     * @param args command line arguments (sourcePath, outputPath, hdfsPath, mergesPath, isSparkSessionManaged)
+     * @throws Exception if the parameters cannot be loaded/parsed or the job fails
+     */
+    public static void main(String[] args) throws Exception {
+
+        String jsonConfiguration = IOUtils
+            .toString(
+                SparkRemoveBlacklistedRelationJob.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json"));
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+        parser.parseArgument(args);
+
+        // the session is considered managed unless the caller explicitly says otherwise
+        Boolean isSparkSessionManaged = Optional
+            .ofNullable(parser.get("isSparkSessionManaged"))
+            .map(Boolean::valueOf)
+            .orElse(Boolean.TRUE);
+
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+        String inputPath = parser.get("sourcePath");
+        log.info("inputPath: {}", inputPath);
+
+        final String outputPath = parser.get("outputPath");
+        log.info("outputPath {}: ", outputPath);
+
+        final String blacklistPath = parser.get("hdfsPath");
+        log.info("blacklistPath {}: ", blacklistPath);
+
+        final String mergesPath = parser.get("mergesPath");
+        log.info("mergesPath {}: ", mergesPath);
+
+        SparkConf conf = new SparkConf();
+
+        runWithSparkSession(
+            conf,
+            isSparkSessionManaged,
+            spark -> {
+                removeOutputDir(spark, outputPath);
+                removeBlacklistedRelations(spark, blacklistPath, inputPath, outputPath, mergesPath);
+            });
+    }
+
+    /**
+     * Rewrites the blacklist through the merges relations, stores it under {@code <blacklistPath>/deduped} and
+     * writes to {@code outputPath} the input relations that are not equal to a blacklisted one.
+     */
+    private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
+        String outputPath, String mergesPath) {
+        Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
+        Dataset<Relation> inputRelation = readRelations(spark, inputPath);
+        Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
+
+        log.info("InputRelationCount: {}", inputRelation.count());
+
+        // when the blacklisted source is the target of a merges relation, replace it with that relation's source
+        Dataset<Relation> dedupSource = blackListed
+            .joinWith(
+                mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
+                "left_outer")
+            .map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
+                Relation blacklisted = c._1();
+                Relation merge = c._2();
+                if (merge != null) {
+                    blacklisted.setSource(merge.getSource());
+                }
+                return blacklisted;
+            }, Encoders.bean(Relation.class));
+
+        // same substitution for the blacklisted target
+        Dataset<Relation> dedupBL = dedupSource
+            .joinWith(
+                mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")),
+                "left_outer")
+            .map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
+                Relation blacklisted = c._1();
+                Relation merge = c._2();
+                if (merge != null) {
+                    blacklisted.setTarget(merge.getSource());
+                }
+                return blacklisted;
+            }, Encoders.bean(Relation.class));
+
+        dedupBL
+            .write()
+            .mode(SaveMode.Overwrite)
+            .json(blacklistPath + "/deduped");
+
+        // NOTE(review): the join matches on source/target only, so an input relation paired with several
+        // blacklist rows it is not equal to is emitted once per pairing - confirm this cannot happen
+        inputRelation
+            .joinWith(
+                dedupBL, (inputRelation
+                    .col("source")
+                    .equalTo(dedupBL.col("source"))
+                    .and(
+                        inputRelation
+                            .col("target")
+                            .equalTo(dedupBL.col("target")))),
+                "left_outer")
+            .map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
+                Relation ir = c._1();
+                Relation bl = c._2();
+                // drop the input relation only when it is equal to the blacklisted one it was paired with
+                return bl != null && ir.equals(bl) ? null : ir;
+            }, Encoders.bean(Relation.class))
+            .filter(Objects::nonNull)
+            .write()
+            .mode(SaveMode.Overwrite)
+            .option("compression", "gzip")
+            .json(outputPath);
+    }
+
+    /**
+     * Reads a text file holding one JSON-serialized {@link Relation} per line.
+     *
+     * @param spark the active session
+     * @param inputPath location of the text file(s)
+     * @return the relations parsed from the input
+     */
+    public static org.apache.spark.sql.Dataset<Relation> readRelations(
+        SparkSession spark, String inputPath) {
+        return spark
+            .read()
+            .textFile(inputPath)
+            .map(
+                (MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
+                Encoders.bean(Relation.class));
+    }
+
+    /** Removes {@code path} using the Hadoop configuration of the given session. */
+    private static void removeOutputDir(SparkSession spark, String path) {
+        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+    }
+
+}
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json
new file mode 100644
index 000000000..9a2eadaa7
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json
@@ -0,0 +1,32 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nn",
+ "paramLongName": "hdfsNameNode",
+ "paramDescription": "the name node on hdfs",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pgurl",
+ "paramLongName": "postgresUrl",
+ "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pguser",
+ "paramLongName": "postgresUser",
+ "paramDescription": "postgres user",
+ "paramRequired": false
+ },
+ {
+ "paramName": "pgpasswd",
+ "paramLongName": "postgresPassword",
+ "paramDescription": "postgres password",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json
new file mode 100644
index 000000000..4a3d21f4d
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json
@@ -0,0 +1,26 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the path to the graph used to remove the relations ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path where to store the temporary result ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": false
+ },
+ {
+ "paramName":"h",
+ "paramLongName":"hive_metastore_uris",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml
new file mode 100644
index 000000000..fe82ae194
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml
@@ -0,0 +1,54 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+
+
+ spark2SqlQueryExecutionListeners
+ com.cloudera.spark.lineage.NavigatorQueryListener
+
+
+ sparkExecutorNumber
+ 4
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ sparkDriverMemory
+ 15G
+
+
+ sparkExecutorMemory
+ 6G
+
+
+ sparkExecutorCores
+ 1
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
new file mode 100644
index 000000000..dd7827da4
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
@@ -0,0 +1,200 @@
+
+
+
+ postgresURL
+ the url of the postgres server to query
+
+
+ postgresUser
+ the username to access the postgres db
+
+
+ postgresPassword
+ the postgres password
+
+
+ sourcePath
+ the source path
+
+
+ outputPath
+ the graph output path
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+ oozie.action.sharelib.for.spark
+ ${oozieActionShareLibForSpark2}
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/publication
+ ${nameNode}/${outputPath}/publication
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/dataset
+ ${nameNode}/${outputPath}/dataset
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/otherresearchproduct
+ ${nameNode}/${outputPath}/otherresearchproduct
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/software
+ ${nameNode}/${outputPath}/software
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/organization
+ ${nameNode}/${outputPath}/organization
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/project
+ ${nameNode}/${outputPath}/project
+
+
+
+
+
+
+
+ ${nameNode}/${sourcePath}/datasource
+ ${nameNode}/${outputPath}/datasource
+
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB
+ --hdfsPath${workingDir}/blacklist
+ --hdfsNameNode${nameNode}
+ --postgresUrl${postgresURL}
+ --postgresUser${postgresUser}
+ --postgresPassword${postgresPassword}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ PrepareMergedRelation
+ eu.dnetlib.dhp.blacklist.PrepareMergedRelationJob
+ dhp-blacklist-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/relation
+ --outputPath${workingDir}/mergesRelation
+ --hive_metastore_uris${hive_metastore_uris}
+
+
+
+
+
+
+
+ yarn
+ cluster
+ ApplyBlacklist
+ eu.dnetlib.dhp.blacklist.SparkRemoveBlacklistedRelationJob
+ dhp-blacklist-${projectVersion}.jar
+
+ --executor-cores=${sparkExecutorCores}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+ --conf spark.sql.shuffle.partitions=3840
+
+ --sourcePath${sourcePath}/relation
+ --outputPath${outputPath}/relation
+ --hdfsPath${workingDir}/blacklist
+ --mergesPath${workingDir}/mergesRelation
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json
new file mode 100644
index 000000000..91a87b8b5
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json
@@ -0,0 +1,33 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where the sequential file is stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the path to the graph used to remove the relations ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "out",
+ "paramLongName": "outputPath",
+ "paramDescription": "the path where to store the temporary result ",
+ "paramRequired": true
+ },
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "true if the spark session is managed",
+ "paramRequired": false
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "mergesPath",
+ "paramDescription": "the path where the prepared merges relations are stored",
+ "paramRequired": true
+
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java
new file mode 100644
index 000000000..0487a5844
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java
@@ -0,0 +1,162 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Tests {@link SparkRemoveBlacklistedRelationJob} against the fixtures under
+ * {@code /eu/dnetlib/dhp/blacklist/} using a local Spark session.
+ * Every test writes to the same {@code relation} folder of the shared working directory, so the
+ * tests rely on the job replacing the output of a previous run
+ * (NOTE(review): confirm the job writes in overwrite mode).
+ */
+public class BlackListTest {
+
+    private static final Logger log = LoggerFactory.getLogger(BlackListTest.class);
+
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /** Classpath folder holding every fixture used by these tests. */
+    private static final String RESOURCE_ROOT = "/eu/dnetlib/dhp/blacklist/";
+
+    /** Session shared by all tests; created in {@link #beforeAll()} and stopped in {@link #afterAll()}. */
+    private static SparkSession spark;
+
+    /** Temporary directory hosting the Spark warehouse and the job output. */
+    private static Path workingDir;
+
+    /**
+     * Creates the temporary working directory and starts a local Spark session whose warehouse
+     * directories point inside it.
+     * @throws IOException if the temporary directory cannot be created
+     */
+    @BeforeAll
+    public static void beforeAll() throws IOException {
+        workingDir = Files.createTempDirectory(BlackListTest.class.getSimpleName());
+        log.info("using work dir {}", workingDir);
+
+        SparkConf conf = new SparkConf();
+        conf.setAppName(BlackListTest.class.getSimpleName());
+        conf.setMaster("local[*]");
+        conf.set("spark.driver.host", "localhost");
+        conf.set("hive.metastore.local", "true");
+        conf.set("spark.ui.enabled", "false");
+        conf.set("spark.sql.warehouse.dir", workingDir.toString());
+        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+        spark = SparkSession
+            .builder()
+            .appName(BlackListTest.class.getSimpleName())
+            .config(conf)
+            .getOrCreate();
+    }
+
+    /**
+     * Deletes the working directory and stops the Spark session; the session is stopped even when
+     * the deletion fails.
+     * @throws IOException if the working directory cannot be deleted
+     */
+    @AfterAll
+    public static void afterAll() throws IOException {
+        try {
+            FileUtils.deleteDirectory(workingDir.toFile());
+        } finally {
+            spark.stop();
+        }
+    }
+
+    /** Expects 13 relations in the output when the {@code relationsNoRemoval} fixture is processed. */
+    @Test
+    public void noRemoveTest() throws Exception {
+        final JavaRDD<Relation> tmp = applyBlacklist("relationsNoRemoval", "mergesRel");
+        Assertions.assertEquals(13, tmp.count());
+    }
+
+    /**
+     * Expects 12 relations in the output for the {@code relationsOneRemoval} fixture, none of them
+     * being the blacklisted project-to-organization {@code hasParticipant} relation.
+     */
+    @Test
+    public void removeNoMergeMatchTest() throws Exception {
+        final JavaRDD<Relation> tmp = applyBlacklist("relationsOneRemoval", "mergesRel");
+
+        Assertions.assertEquals(12, tmp.count());
+
+        final org.apache.spark.sql.Dataset<Relation> verificationDataset = spark
+            .createDataset(tmp.rdd(), Encoders.bean(Relation.class));
+
+        Assertions
+            .assertEquals(
+                0, verificationDataset
+                    .filter(
+                        "source = '40|corda__h2020::5161f53ab205d803c36b4c888fe7deef' and " +
+                            "target = '20|dedup_wf_001::157af406bc653aa4d9749318b644de43'")
+                    .count());
+
+        Assertions.assertEquals(0, verificationDataset.filter("relClass = 'hasParticipant'").count());
+    }
+
+    /**
+     * Expects 12 relations in the output for the {@code relationOneRemovalWithMatch} fixture when the
+     * {@code mergesRelOneMerge} merges are used, all of them having class {@code isProvidedBy}.
+     */
+    @Test
+    public void removeMergeMatchTest() throws Exception {
+        final JavaRDD<Relation> tmp = applyBlacklist("relationOneRemovalWithMatch", "mergesRelOneMerge");
+
+        Assertions.assertEquals(12, tmp.count());
+
+        final org.apache.spark.sql.Dataset<Relation> verificationDataset = spark
+            .createDataset(tmp.rdd(), Encoders.bean(Relation.class));
+
+        Assertions.assertEquals(12, verificationDataset.filter("relClass = 'isProvidedBy'").count());
+    }
+
+    /**
+     * Runs {@link SparkRemoveBlacklistedRelationJob} on the given fixtures and reads back its output.
+     * @param relationsDir fixture folder, relative to {@link #RESOURCE_ROOT}, passed as {@code -sourcePath}
+     * @param mergesDir fixture folder, relative to {@link #RESOURCE_ROOT}, passed as {@code -mergesPath}
+     * @return the relations parsed from the JSON lines the job wrote under {@code <workingDir>/relation}
+     * @throws Exception if the job fails
+     */
+    private JavaRDD<Relation> applyBlacklist(final String relationsDir, final String mergesDir) throws Exception {
+        final String outputPath = workingDir.toString() + "/relation";
+        SparkRemoveBlacklistedRelationJob
+            .main(
+                new String[] {
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", resourcePath(relationsDir),
+                    "-outputPath", outputPath,
+                    "-hdfsPath", resourcePath("blacklist"),
+                    "-mergesPath", resourcePath(mergesDir)
+                });
+
+        return new JavaSparkContext(spark.sparkContext())
+            .textFile(outputPath)
+            .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+    }
+
+    /** Resolves a fixture folder under {@link #RESOURCE_ROOT} to its path on the local file system. */
+    private String resourcePath(final String name) {
+        return getClass().getResource(RESOURCE_ROOT + name).getPath();
+    }
+}
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist
new file mode 100644
index 000000000..ea95130af
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist
@@ -0,0 +1,20 @@
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"hasParticipant","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"isParticipant","source":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43","target":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::a727cc288016db7132ef9a799aa83350","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::a727cc288016db7132ef9a799aa83350"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::062cf091d5c7a7d730001c34177042e3","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::062cf091d5c7a7d730001c34177042e3"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::1b172ab34639e7935e2357119cf20830","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|od_______908::1b172ab34639e7935e2357119cf20830"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json
new file mode 100644
index 000000000..8f0d296d6
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json
@@ -0,0 +1,14 @@
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______177::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json
new file mode 100644
index 000000000..3d74ffa6e
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json
@@ -0,0 +1,14 @@
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json
new file mode 100644
index 000000000..761cba478
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProducedBy","relType":"resultProject","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"outcome","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json
new file mode 100644
index 000000000..a79d1d8eb
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::018cb61ed43c01704decc66183ce5d60","subRelType":"provision","target":"20|dedup_wf_001::b9fff055ce5efacecbe4ef918c127f86"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json
new file mode 100644
index 000000000..f809acfeb
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json
@@ -0,0 +1,13 @@
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","subRelType":"participation","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
index 0694556b2..9e5d98644 100644
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java
@@ -29,31 +29,32 @@ public class EventFactory {
"yyyy-MM-dd"
};
- public static Event newBrokerEvent(final Result source, final Result target, final UpdateInfo> updateInfo) {
+ public static Event newBrokerEvent(final UpdateInfo> updateInfo) {
final long now = new Date().getTime();
final Event res = new Event();
- final Map map = createMapFromResult(target, source, updateInfo);
+ final Map map = createMapFromResult(updateInfo);
- final String payload = createPayload(target, updateInfo);
+ final String payload = createPayload(updateInfo);
final String eventId = calculateEventId(
- updateInfo.getTopic(), target.getOriginalId().get(0), updateInfo.getHighlightValueAsString());
+ updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
+ updateInfo.getHighlightValueAsString());
res.setEventId(eventId);
res.setProducerId(PRODUCER_ID);
res.setPayload(payload);
res.setMap(map);
- res.setTopic(updateInfo.getTopic());
+ res.setTopic(updateInfo.getTopicPath());
res.setCreationDate(now);
res.setExpiryDate(calculateExpiryDate(now));
res.setInstantMessage(false);
return res;
}
- private static String createPayload(final Result result, final UpdateInfo> updateInfo) {
+ private static String createPayload(final UpdateInfo> updateInfo) {
final OpenAireEventPayload payload = new OpenAireEventPayload();
// TODO
@@ -62,32 +63,34 @@ public class EventFactory {
return payload.toJSON();
}
- private static Map createMapFromResult(final Result oaf, final Result source,
- final UpdateInfo> updateInfo) {
+ private static Map createMapFromResult(final UpdateInfo> updateInfo) {
final Map map = new HashMap<>();
- final List collectedFrom = oaf.getCollectedfrom();
+ final Result source = updateInfo.getSource();
+ final Result target = updateInfo.getTarget();
+
+ final List collectedFrom = target.getCollectedfrom();
if (collectedFrom.size() == 1) {
map.put("target_datasource_id", collectedFrom.get(0).getKey());
map.put("target_datasource_name", collectedFrom.get(0).getValue());
}
- final List ids = oaf.getOriginalId();
+ final List ids = target.getOriginalId();
if (ids.size() > 0) {
map.put("target_publication_id", ids.get(0));
}
- final List titles = oaf.getTitle();
+ final List titles = target.getTitle();
if (titles.size() > 0) {
map.put("target_publication_title", titles.get(0));
}
- final long date = parseDateTolong(oaf.getDateofacceptance().getValue());
+ final long date = parseDateTolong(target.getDateofacceptance().getValue());
if (date > 0) {
map.put("target_dateofacceptance", date);
}
- final List subjects = oaf.getSubject();
+ final List subjects = target.getSubject();
if (subjects.size() > 0) {
map
.put(
@@ -95,7 +98,7 @@ public class EventFactory {
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
}
- final List authors = oaf.getAuthor();
+ final List authors = target.getAuthor();
if (authors.size() > 0) {
map
.put(
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java
new file mode 100644
index 000000000..29f6cbe3a
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java
@@ -0,0 +1,76 @@
+
+package eu.dnetlib.dhp.broker.model;
+
+/**
+ * Broker topic identifiers. Each constant carries the slash-separated path string
+ * (for example {@code ENRICH/MISSING/ABSTRACT}) associated with that topic.
+ */
+public enum Topic {
+
+	// ENRICHMENT MISSING
+	ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"),
+	ENRICH_MISSING_ABSTRACT("ENRICH/MISSING/ABSTRACT"),
+	ENRICH_MISSING_PUBLICATION_DATE("ENRICH/MISSING/PUBLICATION_DATE"),
+	ENRICH_MISSING_PID("ENRICH/MISSING/PID"),
+	ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"),
+	ENRICH_MISSING_SOFTWARE("ENRICH/MISSING/SOFTWARE"),
+	ENRICH_MISSING_SUBJECT_MESHEUROPMC("ENRICH/MISSING/SUBJECT/MESHEUROPMC"),
+	ENRICH_MISSING_SUBJECT_ARXIV("ENRICH/MISSING/SUBJECT/ARXIV"),
+	ENRICH_MISSING_SUBJECT_JEL("ENRICH/MISSING/SUBJECT/JEL"),
+	ENRICH_MISSING_SUBJECT_DDC("ENRICH/MISSING/SUBJECT/DDC"),
+	ENRICH_MISSING_SUBJECT_ACM("ENRICH/MISSING/SUBJECT/ACM"),
+	ENRICH_MISSING_SUBJECT_RVK("ENRICH/MISSING/SUBJECT/RVK"),
+	ENRICH_MISSING_AUTHOR_ORCID("ENRICH/MISSING/AUTHOR/ORCID"),
+
+	// ENRICHMENT MORE
+	ENRICH_MORE_PID("ENRICH/MORE/PID"),
+	ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"),
+	ENRICH_MORE_ABSTRACT("ENRICH/MORE/ABSTRACT"),
+	ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"),
+	ENRICH_MORE_PROJECT("ENRICH/MORE/PROJECT"),
+	ENRICH_MORE_SUBJECT_MESHEUROPMC("ENRICH/MORE/SUBJECT/MESHEUROPMC"),
+	ENRICH_MORE_SUBJECT_ARXIV("ENRICH/MORE/SUBJECT/ARXIV"),
+	ENRICH_MORE_SUBJECT_JEL("ENRICH/MORE/SUBJECT/JEL"),
+	ENRICH_MORE_SUBJECT_DDC("ENRICH/MORE/SUBJECT/DDC"),
+	ENRICH_MORE_SUBJECT_ACM("ENRICH/MORE/SUBJECT/ACM"),
+	ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"),
+
+	// ADDITION
+	ADD_BY_PROJECT("ADD/BY_PROJECT");
+
+	/**
+	 * Path string of this topic. Final: enum constants are shared singletons and must not be mutated.
+	 * Visibility is kept {@code protected} so existing same-package readers keep compiling.
+	 */
+	protected final String path;
+
+	/**
+	 * @param path the slash-separated path string identifying the topic
+	 */
+	Topic(final String path) {
+		this.path = path;
+	}
+
+	/**
+	 * @return the path string of this topic
+	 */
+	public String getPath() {
+		return this.path;
+	}
+
+	/**
+	 * Looks up the topic whose path equals the given string (exact, case-sensitive match).
+	 *
+	 * @param path the path string to resolve; may be {@code null}
+	 * @return the matching topic, or {@code null} when no constant has that path
+	 */
+	public static Topic fromPath(final String path) {
+		for (final Topic t : Topic.values()) {
+			if (t.getPath().equals(path)) {
+				return t;
+			}
+		}
+		return null;
+	}
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java
index 54d4ef36a..43ebd6dd8 100644
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java
@@ -14,21 +14,20 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.model.EventFactory;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAbstract;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAuthorOrcid;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingOpenAccess;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPid;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingProject;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPublicationDate;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingSubject;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid;
-import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid;
+import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject;
+import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
@@ -37,7 +36,16 @@ public class GenerateEventsApplication {
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
- private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private static final UpdateMatcher> enrichMissingAbstract = new EnrichMissingAbstract();
+ private static final UpdateMatcher> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
+ private static final UpdateMatcher> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
+ private static final UpdateMatcher> enrichMissingPid = new EnrichMissingPid();
+ private static final UpdateMatcher> enrichMissingProject = new EnrichMissingProject();
+ private static final UpdateMatcher> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
+ private static final UpdateMatcher> enrichMissingSubject = new EnrichMissingSubject();
+ private static final UpdateMatcher> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
+ private static final UpdateMatcher> enrichMorePid = new EnrichMorePid();
+ private static final UpdateMatcher> enrichMoreSubject = new EnrichMoreSubject();
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -76,37 +84,22 @@ public class GenerateEventsApplication {
}
private List generateEvents(final Result... children) {
- final List list = new ArrayList<>();
+ final List> list = new ArrayList<>();
- for (final Result source : children) {
- for (final Result target : children) {
- if (source != target) {
- list
- .addAll(
- findUpdates(source, target)
- .stream()
- .map(info -> EventFactory.newBrokerEvent(source, target, info))
- .collect(Collectors.toList()));
- }
- }
+ for (final Result target : children) {
+ list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMorePid.searchUpdatesForRecord(target, children));
+ list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children));
}
- return list;
- }
-
- private List> findUpdates(final Result source, final Result target) {
- final List> list = new ArrayList<>();
- list.addAll(EnrichMissingAbstract.findUpdates(source, target));
- list.addAll(EnrichMissingAuthorOrcid.findUpdates(source, target));
- list.addAll(EnrichMissingOpenAccess.findUpdates(source, target));
- list.addAll(EnrichMissingPid.findUpdates(source, target));
- list.addAll(EnrichMissingProject.findUpdates(source, target));
- list.addAll(EnrichMissingPublicationDate.findUpdates(source, target));
- list.addAll(EnrichMissingSubject.findUpdates(source, target));
- list.addAll(EnrichMoreOpenAccess.findUpdates(source, target));
- list.addAll(EnrichMorePid.findUpdates(source, target));
- list.addAll(EnrichMoreSubject.findUpdates(source, target));
- return list;
+ return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
}
}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java
new file mode 100644
index 000000000..43cf738f8
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java
@@ -0,0 +1,36 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMissingAbstract extends UpdateMatcher { // proposes a description for a target that has none
+
+ public EnrichMissingAbstract() {
+ super(false); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { // only the source has one
+ return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); // first description only
+ }
+ return new ArrayList<>(); // nothing to propose
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source,
+ final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MISSING_ABSTRACT,
+ highlightValue, source, target,
+ (p, s) -> p.getAbstracts().add(s), // appends the value to p's abstracts
+ s -> s); // NOTE(review): presumably backs getHighlightValueAsString - confirm in UpdateInfo
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java
new file mode 100644
index 000000000..beeccdbe8
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java
@@ -0,0 +1,34 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMissingAuthorOrcid extends UpdateMatcher> { // matcher for the ENRICH_MISSING_AUTHOR_ORCID topic
+
+ public EnrichMissingAuthorOrcid() {
+ super(true); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List>> findUpdates(final Result source, final Result target) {
+ // TODO: matching is not implemented yet, so no update is ever proposed.
+ return Arrays.asList();
+ }
+
+ @Override
+ public UpdateInfo> generateUpdateInfo(final Pair highlightValue,
+ final Result source, final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MISSING_AUTHOR_ORCID,
+ highlightValue, source, target,
+ (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), // creator text "<left> - ORCID: <right>"
+ pair -> pair.getLeft() + "::" + pair.getRight()); // string form "<left>::<right>"
+ }
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java
new file mode 100644
index 000000000..a4a2ea0c6
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java
@@ -0,0 +1,55 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import eu.dnetlib.broker.objects.Instance;
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
+import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMissingOpenAccess extends UpdateMatcher { // proposes open-access instances to a target that has none
+
+ public EnrichMissingOpenAccess() {
+ super(true); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ final long count = target // number of target instances whose access-right class id is OPEN_ACCESS
+ .getInstance()
+ .stream()
+ .map(i -> i.getAccessright().getClassid())
+ .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
+ .count();
+
+ if (count > 0) { // target already has an open-access instance
+ return Arrays.asList();
+ }
+
+ return source
+ .getInstance()
+ .stream()
+ .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
+ .map(ConversionUtils::oafInstanceToBrokerInstances) // one broker instance per URL of the OAF instance
+ .flatMap(s -> s)
+ .map(i -> generateUpdateInfo(i, source, target))
+ .collect(Collectors.toList());
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final Instance highlightValue,
+ final Result source,
+ final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MISSING_OA_VERSION,
+ highlightValue, source, target,
+ (p, i) -> p.getInstances().add(i), // appends the instance to p's instances
+ Instance::getUrl); // the instance URL is its string form
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java
new file mode 100644
index 000000000..a8df62541
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java
@@ -0,0 +1,45 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import eu.dnetlib.broker.objects.Pid;
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMissingPid extends UpdateMatcher { // proposes the source's pids to a target that has none
+
+ public EnrichMissingPid() {
+ super(true); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ final long count = target.getPid().size();
+
+ if (count > 0) { // target already has at least one pid
+ return Arrays.asList();
+ }
+
+ return source
+ .getPid()
+ .stream()
+ .map(ConversionUtils::oafPidToBrokerPid) // OAF StructuredProperty -> broker Pid
+ .map(i -> generateUpdateInfo(i, source, target))
+ .collect(Collectors.toList());
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MISSING_PID,
+ highlightValue, source, target,
+ (p, pid) -> p.getPids().add(pid), // appends the pid to p's pids
+ pid -> pid.getType() + "::" + pid.getValue()); // string form "<type>::<value>"
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java
new file mode 100644
index 000000000..b6e5b3b57
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java
@@ -0,0 +1,35 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.Arrays;
+import java.util.List;
+
+import eu.dnetlib.broker.objects.Project;
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMissingProject extends UpdateMatcher { // matcher for the ENRICH_MISSING_PROJECT topic
+
+ public EnrichMissingProject() {
+ super(true); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ // TODO: matching is not implemented yet, so no update is ever proposed.
+ return Arrays.asList();
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final Project highlightValue,
+ final Result source,
+ final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MISSING_PROJECT,
+ highlightValue, source, target,
+ (p, prj) -> p.getProjects().add(prj), // appends the project to p's projects
+ prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); // NOTE(review): no separator between funding program and code - confirm intended
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java
new file mode 100644
index 000000000..e9ec082c4
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java
@@ -0,0 +1,33 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.Arrays;
+import java.util.List;
+
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMissingPublicationDate extends UpdateMatcher { // matcher for the ENRICH_MISSING_PUBLICATION_DATE topic
+
+ public EnrichMissingPublicationDate() {
+ super(false); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ // TODO: matching is not implemented yet, so no update is ever proposed.
+ return Arrays.asList();
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source,
+ final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MISSING_PUBLICATION_DATE,
+ highlightValue, source, target,
+ (p, date) -> p.setPublicationdate(date), // overwrites p's publication date
+ s -> s); // the date string is its own string form
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java
new file mode 100644
index 000000000..79e9d469b
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java
@@ -0,0 +1,59 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+/**
+ * Proposes the subjects of a source record whose classification is not used by any subject of
+ * the target record. Each highlighted value is a (classification id, subject value) pair.
+ */
+public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> {
+
+	public EnrichMissingSubject() {
+		super(true);
+	}
+
+	@Override
+	protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
+		final Set<String> existingTypes = target
+			.getSubject()
+			.stream()
+			.map(StructuredProperty::getQualifier)
+			.map(Qualifier::getClassid)
+			.collect(Collectors.toSet());
+
+		// Candidates come from the source's subjects, the same field inspected on the target above.
+		return source
+			.getSubject()
+			.stream()
+			.filter(subject -> !existingTypes.contains(subject.getQualifier().getClassid()))
+			.map(ConversionUtils::oafSubjectToPair)
+			.map(i -> generateUpdateInfo(i, source, target))
+			.collect(Collectors.toList());
+	}
+
+	@Override
+	public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
+		final Result source,
+		final Result target) {
+
+		// NOTE(review): Topic.fromPath returns null when no Topic declares the composed path.
+		return new UpdateInfo<>(
+			Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
+			highlightValue, source, target,
+			(p, pair) -> p.getSubjects().add(pair.getRight()),
+			pair -> pair.getLeft() + "::" + pair.getRight());
+	}
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java
new file mode 100644
index 000000000..40c9b0500
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java
@@ -0,0 +1,53 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import eu.dnetlib.broker.objects.Instance;
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
+import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMoreOpenAccess extends UpdateMatcher { // proposes open-access instances whose URL the target lacks
+
+ public EnrichMoreOpenAccess() {
+ super(true); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ final Set urls = target // URLs of the target's open-access instances
+ .getInstance()
+ .stream()
+ .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
+ .map(i -> i.getUrl())
+ .flatMap(List::stream)
+ .collect(Collectors.toSet());
+
+ return source
+ .getInstance()
+ .stream()
+ .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
+ .map(ConversionUtils::oafInstanceToBrokerInstances) // one broker instance per URL of the OAF instance
+ .flatMap(s -> s)
+ .filter(i -> !urls.contains(i.getUrl())) // drop URLs the target already exposes
+ .map(i -> generateUpdateInfo(i, source, target))
+ .collect(Collectors.toList());
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final Instance highlightValue,
+ final Result source,
+ final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MORE_OA_VERSION,
+ highlightValue, source, target,
+ (p, i) -> p.getInstances().add(i), // appends the instance to p's instances
+ Instance::getUrl); // the instance URL is its string form
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java
new file mode 100644
index 000000000..0e7b7766a
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java
@@ -0,0 +1,46 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import eu.dnetlib.broker.objects.Pid;
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class EnrichMorePid extends UpdateMatcher { // proposes the source's pids that the target does not have
+
+ public EnrichMorePid() {
+ super(true); // passed as UpdateMatcher's multipleUpdate flag
+ }
+
+ @Override
+ protected List> findUpdates(final Result source, final Result target) {
+ final Set existingPids = target // "<classid>::<value>" keys of the target's pids
+ .getPid()
+ .stream()
+ .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
+ .collect(Collectors.toSet());
+
+ return source
+ .getPid()
+ .stream()
+ .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) // same key format as above
+ .map(ConversionUtils::oafPidToBrokerPid) // OAF StructuredProperty -> broker Pid
+ .map(i -> generateUpdateInfo(i, source, target))
+ .collect(Collectors.toList());
+ }
+
+ @Override
+ public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
+ return new UpdateInfo<>(
+ Topic.ENRICH_MORE_PID,
+ highlightValue, source, target,
+ (p, pid) -> p.getPids().add(pid), // appends the pid to p's pids
+ pid -> pid.getType() + "::" + pid.getValue()); // string form "<type>::<value>"
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java
new file mode 100644
index 000000000..e6374479b
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+/**
+ * Proposes the subjects of a source record that the target record does not already have,
+ * comparing them on classification id and value.
+ */
+public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> {
+
+	public EnrichMoreSubject() {
+		super(true);
+	}
+
+	@Override
+	protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
+		final Set<String> existingSubjects = target
+			.getSubject()
+			.stream()
+			.map(subject -> subject.getQualifier().getClassid() + "::" + subject.getValue())
+			.collect(Collectors.toSet());
+
+		// Candidates come from the source's subjects and are keyed exactly like the target's above.
+		return source
+			.getSubject()
+			.stream()
+			.filter(subject -> !existingSubjects.contains(subject.getQualifier().getClassid() + "::" + subject.getValue()))
+			.map(ConversionUtils::oafSubjectToPair)
+			.map(i -> generateUpdateInfo(i, source, target))
+			.collect(Collectors.toList());
+	}
+
+	@Override
+	public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
+		final Result source,
+		final Result target) {
+
+		// NOTE(review): Topic.fromPath returns null when no Topic declares the composed path.
+		return new UpdateInfo<>(
+			Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
+			highlightValue, source, target,
+			(p, pair) -> p.getSubjects().add(pair.getRight()),
+			pair -> pair.getLeft() + "::" + pair.getRight());
+	}
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java
new file mode 100644
index 000000000..b8b6132cd
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java
@@ -0,0 +1,84 @@
+
+package eu.dnetlib.dhp.broker.oa.matchers;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+/**
+ * Base class for matchers that compare a target record with other records and collect the
+ * updates the others can contribute to it.
+ *
+ * @param <T> type of the highlighted value carried by each {@link UpdateInfo}
+ */
+public abstract class UpdateMatcher<T> {
+
+	/** When false, {@link #searchUpdatesForRecord} returns at most one update. */
+	private final boolean multipleUpdate;
+
+	public UpdateMatcher(final boolean multipleUpdate) {
+		this.multipleUpdate = multipleUpdate;
+	}
+
+	/**
+	 * Collects the updates that {@code others} can contribute to {@code res}.
+	 * Updates are deduplicated on the MD5 of their highlight string, keeping the one with the
+	 * highest trust.
+	 *
+	 * @param res the target record; it is skipped (by identity) when it also appears in {@code others}
+	 * @param others the candidate source records
+	 * @return every distinct update when {@code multipleUpdate} is set, otherwise only the most
+	 *         trusted one; empty when nothing was found
+	 */
+	public Collection<UpdateInfo<T>> searchUpdatesForRecord(final Result res, final Result... others) {
+
+		final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
+
+		for (final Result source : others) {
+			if (source != res) {
+				for (final UpdateInfo<T> info : findUpdates(source, res)) {
+					final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
+					// Store a value the first time it is seen, or when it arrives with a higher trust.
+					if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
+						infoMap.put(s, info);
+					}
+				}
+			}
+		}
+
+		final Collection<UpdateInfo<T>> values = infoMap.values();
+
+		if (values.isEmpty() || multipleUpdate) {
+			return values;
+		} else {
+			// values is non-empty here, so the Optional returned by max() always holds a value.
+			final UpdateInfo<T> v = values
+				.stream()
+				.max((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
+				.get();
+			return Arrays.asList(v);
+		}
+	}
+
+	/** Returns the updates that {@code source} can contribute to {@code target}. */
+	protected abstract List<UpdateInfo<T>> findUpdates(Result source, Result target);
+
+	/** Wraps a highlighted value taken from {@code source} into an update for {@code target}. */
+	protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue, final Result source,
+		final Result target);
+
+	/** Returns true when the list is null, empty, or its first element has a blank value. */
+	protected static boolean isMissing(final List<Field<String>> list) {
+		return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
+	}
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java
new file mode 100644
index 000000000..d61d5bfb7
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java
@@ -0,0 +1,7 @@
+
+package eu.dnetlib.dhp.broker.oa.util;
+
+public class BrokerConstants { // constants shared by the broker matchers and ConversionUtils
+
+ public final static String OPEN_ACCESS = "OPEN"; // compared with access-right class ids; also set as the broker instance license
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
new file mode 100644
index 000000000..2e2ce202a
--- /dev/null
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java
@@ -0,0 +1,36 @@
+
+package eu.dnetlib.dhp.broker.oa.util;
+
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import eu.dnetlib.broker.objects.Instance;
+import eu.dnetlib.broker.objects.Pid;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class ConversionUtils { // static converters from OAF schema objects to broker objects
+
+ public static Stream oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) {
+ return i.getUrl().stream().map(url -> { // one broker Instance per URL of the OAF instance
+ final Instance r = new Instance();
+ r.setUrl(url);
+ r.setInstancetype(i.getInstancetype().getClassid());
+ r.setLicense(BrokerConstants.OPEN_ACCESS); // license is always OPEN_ACCESS, whatever the source instance's access right
+ r.setHostedby(i.getHostedby().getValue());
+ return r;
+ });
+ }
+
+ public static Pid oafPidToBrokerPid(final StructuredProperty sp) { // copies value and qualifier class id into a broker Pid
+ final Pid pid = new Pid();
+ pid.setValue(sp.getValue());
+ pid.setType(sp.getQualifier().getClassid());
+ return pid;
+ }
+
+ public static final Pair oafSubjectToPair(final StructuredProperty sp) {
+ return Pair.of(sp.getQualifier().getClassid(), sp.getValue()); // left = qualifier class id, right = value
+ }
+
+}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java
deleted file mode 100644
index 493d1f97c..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java
+++ /dev/null
@@ -1,31 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingAbstract extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMissingAbstract(final String highlightValue, final float trust) {
- super("ENRICH/MISSING/ABSTRACT", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getAbstracts().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java
deleted file mode 100644
index 6899c62a3..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java
+++ /dev/null
@@ -1,31 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingAuthorOrcid extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMissingAuthorOrcid(final String highlightValue, final float trust) {
- super("ENRICH/MISSING/AUTHOR/ORCID", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- // TODO
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java
deleted file mode 100644
index 9464130f3..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java
+++ /dev/null
@@ -1,32 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.Instance;
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingOpenAccess extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMissingOpenAccess(final Instance highlightValue, final float trust) {
- super("ENRICH/MISSING/OPENACCESS_VERSION", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getInstances().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue().getUrl();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java
deleted file mode 100644
index 293d4993f..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java
+++ /dev/null
@@ -1,32 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.broker.objects.Pid;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingPid extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMissingPid(final Pid highlightValue, final float trust) {
- super("ENRICH/MISSING/PID", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getPids().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue().getType() + "::" + getHighlightValue().getValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java
deleted file mode 100644
index a22c179a2..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java
+++ /dev/null
@@ -1,33 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.broker.objects.Project;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingProject extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMissingProject(final Project highlightValue, final float trust) {
- super("ENRICH/MISSING/PROJECT", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getProjects().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue().getFunder() + "::" + getHighlightValue().getFundingProgram()
- + getHighlightValue().getCode();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java
deleted file mode 100644
index 869dca264..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java
+++ /dev/null
@@ -1,31 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingPublicationDate extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMissingPublicationDate(final String highlightValue, final float trust) {
- super("ENRICH/MISSING/PUBLICATION_DATE", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().setPublicationdate(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java
deleted file mode 100644
index a2ed5d043..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java
+++ /dev/null
@@ -1,36 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMissingSubject extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // MESHEUROPMC
- // ARXIV
- // JEL
- // DDC
- // ACM
-
- return Arrays.asList();
- }
-
- private EnrichMissingSubject(final String subjectClassification, final String highlightValue, final float trust) {
- super("ENRICH/MISSING/SUBJECT/" + subjectClassification, highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getSubjects().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java
deleted file mode 100644
index 4f1e88d3d..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java
+++ /dev/null
@@ -1,32 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.Instance;
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMoreOpenAccess extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMoreOpenAccess(final Instance highlightValue, final float trust) {
- super("ENRICH/MORE/OPENACCESS_VERSION", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getInstances().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue().getUrl();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java
deleted file mode 100644
index ecf2cf310..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java
+++ /dev/null
@@ -1,32 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.broker.objects.Pid;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMorePid extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
- return Arrays.asList();
- }
-
- private EnrichMorePid(final Pid highlightValue, final float trust) {
- super("ENRICH/MORE/PID", highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getPids().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue().getType() + "::" + getHighlightValue().getValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java
deleted file mode 100644
index f29b86292..000000000
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java
+++ /dev/null
@@ -1,36 +0,0 @@
-
-package eu.dnetlib.dhp.broker.oa.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import eu.dnetlib.broker.objects.OpenAireEventPayload;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class EnrichMoreSubject extends UpdateInfo {
-
- public static List findUpdates(final Result source, final Result target) {
- // MESHEUROPMC
- // ARXIV
- // JEL
- // DDC
- // ACM
-
- return Arrays.asList();
- }
-
- private EnrichMoreSubject(final String subjectClassification, final String highlightValue, final float trust) {
- super("ENRICH/MORE/SUBJECT/" + subjectClassification, highlightValue, trust);
- }
-
- @Override
- public void compileHighlight(final OpenAireEventPayload payload) {
- payload.getHighlight().getSubjects().add(getHighlightValue());
- }
-
- @Override
- public String getHighlightValueAsString() {
- return getHighlightValue();
- }
-
-}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
index f7b6b69e9..5cc0d371d 100644
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
@@ -1,36 +1,77 @@
package eu.dnetlib.dhp.broker.oa.util;
+import java.util.function.BiConsumer;
+import java.util.function.Function;
+
import eu.dnetlib.broker.objects.OpenAireEventPayload;
+import eu.dnetlib.broker.objects.Publication;
+import eu.dnetlib.dhp.broker.model.Topic;
+import eu.dnetlib.dhp.schema.oaf.Result;
-public abstract class UpdateInfo {
+public final class UpdateInfo {
- private final String topic;
+ private final Topic topic;
private final T highlightValue;
+ private final Result source;
+
+ private final Result target;
+
+ private final BiConsumer compileHighlight;
+
+ private final Function highlightToString;
+
private final float trust;
- protected UpdateInfo(final String topic, final T highlightValue, final float trust) {
+ public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target,
+ final BiConsumer compileHighlight,
+ final Function highlightToString) {
this.topic = topic;
this.highlightValue = highlightValue;
- this.trust = trust;
+ this.source = source;
+ this.target = target;
+ this.compileHighlight = compileHighlight;
+ this.highlightToString = highlightToString;
+ this.trust = calculateTrust(source, target);
}
public T getHighlightValue() {
return highlightValue;
}
+ public Result getSource() {
+ return source;
+ }
+
+ public Result getTarget() {
+ return target;
+ }
+
+ /**
+  * Computes the trust value stored in this update at construction time.
+  * Placeholder implementation: neither argument is read and the constant 0.9 is returned.
+  *
+  * @param source first result handed to the constructor (currently unused)
+  * @param target second result handed to the constructor (currently unused)
+  * @return the constant 0.9f
+  */
+ private float calculateTrust(final Result source, final Result target) {
+ // TODO derive the trust from source and target instead of returning a constant
+ return 0.9f;
+ }
+
+ /**
+  * Returns the topic this update was created for.
+  * NOTE(review): the class is declared final, so protected grants nothing beyond package access here —
+  * confirm the intended visibility.
+  */
+ protected Topic getTopic() {
+ return topic;
+ }
+
+ /**
+  * Returns the path of the topic, exactly as reported by the topic's getPath().
+  */
+ public String getTopicPath() {
+ return topic.getPath();
+ }
+
public float getTrust() {
return trust;
}
- public String getTopic() {
- return topic;
+ public void compileHighlight(final OpenAireEventPayload payload) {
+ compileHighlight.accept(payload.getHighlight(), getHighlightValue());
}
- abstract public void compileHighlight(OpenAireEventPayload payload);
-
- abstract public String getHighlightValueAsString();
+ public String getHighlightValueAsString() {
+ return highlightToString.apply(getHighlightValue());
+ }
}
diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml
new file mode 100644
index 000000000..fe9833e3e
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/pom.xml
@@ -0,0 +1,64 @@
+
+
+
+ dhp-workflows
+ eu.dnetlib.dhp
+ 1.2.1-SNAPSHOT
+
+ 4.0.0
+
+ dhp-enrichment
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+ org.apache.spark
+ spark-hive_2.11
+ test
+
+
+
+ dom4j
+ dom4j
+
+
+ jaxen
+ jaxen
+
+
+ com.jayway.jsonpath
+ json-path
+
+
+
+ io.github.classgraph
+ classgraph
+ 4.8.71
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java
new file mode 100644
index 000000000..c8eb017c7
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java
@@ -0,0 +1,151 @@
+
+package eu.dnetlib.dhp;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class PropagationConstant {
+ public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional";
+
+ public static final String PROPAGATION_DATA_INFO_TYPE = "propagation";
+
+ public static final String TRUE = "true";
+
+ public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
+ public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
+
+ public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo";
+ public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository";
+
+ public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel";
+ public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation";
+
+ public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel";
+ public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation";
+
+ public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization";
+ public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization";
+
+ public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
+ public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
+
+ public static final String PROPAGATION_AUTHOR_PID = "ORCID";
+
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+ +
+ "from result r " +
+ "lateral view explode(instance) i as inst " +
+ "where r.datainfo.deletedbyinference=false";
+
+ /**
+  * Creates a {@link Country} qualifier in the dnet:countries scheme, marked as propagated from
+  * institutional repositories.
+  *
+  * @param classid value stored as the country class id
+  * @param classname value stored as the country class name
+  * @return a new Country carrying the country-propagation provenance
+  */
+ public static Country getCountry(String classid, String classname) {
+     final DataInfo provenance = getDataInfo(
+         PROPAGATION_DATA_INFO_TYPE,
+         PROPAGATION_COUNTRY_INSTREPO_CLASS_ID,
+         PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME);
+
+     final Country country = new Country();
+     country.setClassid(classid);
+     country.setClassname(classname);
+     country.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
+     country.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
+     country.setDataInfo(provenance);
+     return country;
+ }
+
+ /**
+  * Builds the {@link DataInfo} attached to elements produced by a propagation step: inferred,
+  * not deleted by inference, trust "0.85".
+  *
+  * @param inference_provenance value stored as the inference provenance
+  * @param inference_class_id class id of the provenance action
+  * @param inference_class_name class name of the provenance action
+  * @return a new DataInfo whose provenance action belongs to the dnet:provenanceActions scheme
+  */
+ public static DataInfo getDataInfo(
+     String inference_provenance, String inference_class_id, String inference_class_name) {
+     // getQualifier stamps every qualifier with the PID-types scheme; a provenance action belongs to
+     // the provenance-actions vocabulary, so the scheme is corrected here.
+     Qualifier provenanceAction = getQualifier(inference_class_id, inference_class_name);
+     provenanceAction.setSchemeid(ModelConstants.DNET_PROVENANCE_ACTIONS);
+     provenanceAction.setSchemename(ModelConstants.DNET_PROVENANCE_ACTIONS);
+
+     DataInfo di = new DataInfo();
+     di.setInferred(true);
+     di.setDeletedbyinference(false);
+     di.setTrust("0.85");
+     di.setInferenceprovenance(inference_provenance);
+     di.setProvenanceaction(provenanceAction);
+     return di;
+ }
+
+ /**
+  * Creates a {@link Qualifier} with the given class id and name; scheme id and scheme name are both
+  * set to the PID-types vocabulary.
+  *
+  * @param inference_class_id value stored as the class id
+  * @param inference_class_name value stored as the class name
+  * @return the new qualifier
+  */
+ public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
+     final Qualifier qualifier = new Qualifier();
+     qualifier.setSchemeid(ModelConstants.DNET_PID_TYPES);
+     qualifier.setSchemename(ModelConstants.DNET_PID_TYPES);
+     qualifier.setClassid(inference_class_id);
+     qualifier.setClassname(inference_class_name);
+     return qualifier;
+ }
+
+ /**
+  * Creates a {@link Relation} between two entities and tags it with propagation provenance.
+  *
+  * @param source identifier stored as the relation source
+  * @param target identifier stored as the relation target
+  * @param rel_class relation class
+  * @param rel_type relation type
+  * @param subrel_type relation sub-type
+  * @param inference_provenance provenance passed to {@code getDataInfo}
+  * @param inference_class_id provenance action class id passed to {@code getDataInfo}
+  * @param inference_class_name provenance action class name passed to {@code getDataInfo}
+  * @return the new relation
+  */
+ public static Relation getRelation(
+     String source,
+     String target,
+     String rel_class,
+     String rel_type,
+     String subrel_type,
+     String inference_provenance,
+     String inference_class_id,
+     String inference_class_name) {
+     final DataInfo provenance = getDataInfo(inference_provenance, inference_class_id, inference_class_name);
+
+     final Relation relation = new Relation();
+     relation.setRelType(rel_type);
+     relation.setSubRelType(subrel_type);
+     relation.setRelClass(rel_class);
+     relation.setSource(source);
+     relation.setTarget(target);
+     relation.setDataInfo(provenance);
+     return relation;
+ }
+
+ /**
+  * Builds an SQL fragment of the form {@code " and (<text><v1>' OR <text><v2>' ...)"}.
+  * Each value is appended to {@code text} and followed by a closing single quote, so {@code text}
+  * is expected to end with the opening quote (for example {@code "relClass = '"}).
+  *
+  * NOTE(review): values are concatenated without escaping; they must come from trusted
+  * configuration, never from user input.
+  *
+  * @param text prefix placed before every value
+  * @param constraints values to OR together; must contain at least one element
+  * @return the clause, starting with " and (" and ending with ")"
+  * @throws IllegalArgumentException if {@code constraints} is null or empty
+  */
+ public static String getConstraintList(String text, List<String> constraints) {
+     if (constraints == null || constraints.isEmpty()) {
+         // An empty list used to surface as an IndexOutOfBoundsException with no hint of the cause.
+         throw new IllegalArgumentException("no constraint values given for clause on: " + text);
+     }
+     final StringBuilder clause = new StringBuilder(" and (");
+     for (int i = 0; i < constraints.size(); i++) {
+         if (i > 0) {
+             clause.append(" OR ");
+         }
+         clause.append(text).append(constraints.get(i)).append("'");
+     }
+     return clause.append(")").toString();
+ }
+
+ /**
+  * Removes the given path through {@code HdfsSupport.remove}, using the Hadoop configuration of the
+  * session's Spark context.
+  *
+  * @param spark active Spark session
+  * @param path path to remove
+  */
+ public static void removeOutputDir(SparkSession spark, String path) {
+     final org.apache.spark.SparkContext context = spark.sparkContext();
+     HdfsSupport.remove(path, context.hadoopConfiguration());
+ }
+
+ /**
+  * Reads the "isSparkSessionManaged" argument.
+  *
+  * @param parser parsed job arguments
+  * @return the argument parsed with {@link Boolean#valueOf(String)}, or TRUE when it is absent
+  */
+ public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
+     final String flag = parser.get("isSparkSessionManaged");
+     if (flag == null) {
+         return Boolean.TRUE;
+     }
+     return Boolean.valueOf(flag);
+ }
+
+ /**
+  * Reads the "isTest" argument.
+  *
+  * @param parser parsed job arguments
+  * @return the argument parsed with {@link Boolean#valueOf(String)}, or FALSE when it is absent
+  */
+ public static Boolean isTest(ArgumentApplicationParser parser) {
+     final String flag = parser.get("isTest");
+     if (flag == null) {
+         return Boolean.FALSE;
+     }
+     return Boolean.valueOf(flag);
+ }
+
+ /**
+  * Runs the collectedfrom/hostedby query and registers its result as the temporary view "cfhb",
+  * replacing any view already registered under that name.
+  *
+  * @param spark active Spark session
+  */
+ public static void createCfHbforResult(SparkSession spark) {
+     spark.sql(cfHbforResultQuery).createOrReplaceTempView("cfhb");
+ }
+
+ /**
+  * Reads a text file of JSON records, one per line, and deserialises each line into {@code clazz}.
+  *
+  * @param spark active Spark session
+  * @param inputPath location of the text file(s)
+  * @param clazz bean class each line is mapped to
+  * @param <R> type of the records
+  * @return a dataset of {@code R} encoded with a bean encoder
+  */
+ public static <R> Dataset<R> readPath(
+     SparkSession spark, String inputPath, Class<R> clazz) {
+     // The explicit MapFunction<String, R> cast selects the Java overload of Dataset.map and gives
+     // the lambda argument its String type, which ObjectMapper.readValue needs.
+     return spark
+         .read()
+         .textFile(inputPath)
+         .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
new file mode 100644
index 000000000..1c65e8ade
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java
@@ -0,0 +1,122 @@
+
+package eu.dnetlib.dhp.bulktag;
+
+import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.bulktag.community.*;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class SparkBulkTagJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob.class);
+ public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ /**
+  * Entry point of the bulk tagging job: parses the arguments, loads the community configuration
+  * (from the "taggingConf" argument in test mode, from the lookup service otherwise), clears the
+  * output directory and runs the tagging on the requested result type.
+  *
+  * @param args command line arguments described by input_bulkTag_parameters.json
+  * @throws Exception if the arguments, the result class or the community configuration cannot be loaded
+  */
+ public static void main(String[] args) throws Exception {
+     // Decode the bundled parameter descriptor as UTF-8 instead of the platform default charset.
+     String jsonConfiguration = IOUtils
+         .toString(
+             SparkBulkTagJob.class
+                 .getResourceAsStream(
+                     "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json"),
+             java.nio.charset.StandardCharsets.UTF_8);
+
+     final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+     parser.parseArgument(args);
+
+     Boolean isSparkSessionManaged = Optional
+         .ofNullable(parser.get("isSparkSessionManaged"))
+         .map(Boolean::valueOf)
+         .orElse(Boolean.TRUE);
+     log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+     Boolean isTest = Optional
+         .ofNullable(parser.get("isTest"))
+         .map(Boolean::valueOf)
+         .orElse(Boolean.FALSE);
+     log.info("isTest: {} ", isTest);
+
+     final String inputPath = parser.get("sourcePath");
+     log.info("inputPath: {}", inputPath);
+
+     final String outputPath = parser.get("outputPath");
+     log.info("outputPath: {}", outputPath);
+
+     ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class);
+     log.info("pathMap: {}", new Gson().toJson(protoMappingParams));
+
+     final String resultClassName = parser.get("resultTableName");
+     log.info("resultTableName: {}", resultClassName);
+
+     final Boolean saveGraph = Optional
+         .ofNullable(parser.get("saveGraph"))
+         .map(Boolean::valueOf)
+         .orElse(Boolean.TRUE);
+     log.info("saveGraph: {}", saveGraph);
+
+     // The class name comes from the job arguments, so the cast cannot be checked at compile time.
+     @SuppressWarnings("unchecked")
+     Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
+
+     SparkConf conf = new SparkConf();
+     CommunityConfiguration cc;
+
+     String taggingConf = parser.get("taggingConf");
+
+     if (isTest) {
+         cc = CommunityConfigurationFactory.newInstance(taggingConf);
+     } else {
+         cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookUpUrl"));
+     }
+
+     runWithSparkSession(
+         conf,
+         isSparkSessionManaged,
+         spark -> {
+             removeOutputDir(spark, outputPath);
+             execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
+         });
+ }
+
+ /**
+  * Reads the results at {@code inputPath}, enriches each one through
+  * {@code ResultTagger.enrichContextCriteria} and writes the outcome to {@code outputPath} as
+  * gzip-compressed JSON, overwriting existing output.
+  *
+  * @param spark active Spark session
+  * @param inputPath location of the serialised results
+  * @param outputPath destination of the tagged results
+  * @param protoMappingParams path map handed to the tagger
+  * @param resultClazz concrete result class being processed
+  * @param communityConfiguration community configuration handed to the tagger
+  * @param <R> concrete result type
+  */
+ private static <R extends Result> void execBulkTag(
+     SparkSession spark,
+     String inputPath,
+     String outputPath,
+     ProtoMap protoMappingParams,
+     Class<R> resultClazz,
+     CommunityConfiguration communityConfiguration) {
+
+     ResultTagger resultTagger = new ResultTagger();
+     readPath(spark, inputPath, resultClazz)
+         .map(
+             (MapFunction<R, R>) value -> resultTagger
+                 .enrichContextCriteria(
+                     value, communityConfiguration, protoMappingParams),
+             Encoders.bean(resultClazz))
+         .write()
+         .mode(SaveMode.Overwrite)
+         .option("compression", "gzip")
+         .json(outputPath);
+ }
+
+ /**
+  * Reads a text file of JSON records, one per line, and deserialises each line into {@code clazz}.
+  *
+  * @param spark active Spark session
+  * @param inputPath location of the text file(s)
+  * @param clazz bean class each line is mapped to
+  * @param <R> type of the records
+  * @return a dataset of {@code R} encoded with a bean encoder
+  */
+ public static <R> Dataset<R> readPath(
+     SparkSession spark, String inputPath, Class<R> clazz) {
+     // The explicit MapFunction<String, R> cast selects the Java overload of Dataset.map and gives
+     // the lambda argument its String type, which ObjectMapper.readValue needs.
+     return spark
+         .read()
+         .textFile(inputPath)
+         .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java
new file mode 100644
index 000000000..0f45d3beb
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.gson.Gson;
+
+/**
+ * Configuration of a single community: its identifier and the subjects, providers and Zenodo
+ * communities associated with it. Created by miriam on 01/08/2018.
+ */
+public class Community implements Serializable {
+
+    private static final Log log = LogFactory.getLog(Community.class);
+
+    private String id;
+    private List<String> subjects = new ArrayList<>();
+    private List<Provider> providers = new ArrayList<>();
+    private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
+
+    /** Serialises this community to JSON with a default Gson instance. */
+    public String toJson() {
+        final Gson g = new Gson();
+        return g.toJson(this);
+    }
+
+    /**
+     * A community is valid when at least one of its subject, provider or Zenodo community lists is
+     * non-empty.
+     */
+    public boolean isValid() {
+        return !getSubjects().isEmpty()
+            || !getProviders().isEmpty()
+            || !getZenodoCommunities().isEmpty();
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public List<String> getSubjects() {
+        return subjects;
+    }
+
+    public void setSubjects(List<String> subjects) {
+        this.subjects = subjects;
+    }
+
+    public List<Provider> getProviders() {
+        return providers;
+    }
+
+    public void setProviders(List<Provider> providers) {
+        this.providers = providers;
+    }
+
+    public List<ZenodoCommunity> getZenodoCommunities() {
+        return zenodoCommunities;
+    }
+
+    public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
+        this.zenodoCommunities = zenodoCommunities;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java
new file mode 100644
index 000000000..29ddde15f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java
@@ -0,0 +1,196 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+
+/**
+ * Holds the configured communities together with three inverse lookup maps (subject, datasource id
+ * and Zenodo community id, each pointing to the communities that declare it and to the selection
+ * constraints attached to the declaration). Created by miriam on 02/08/2018.
+ */
+public class CommunityConfiguration implements Serializable {
+
+    private static final Log log = LogFactory.getLog(CommunityConfiguration.class);
+
+    private Map<String, Community> communities;
+
+    // map subject -> communityid
+    private Map<String, List<Pair<String, SelectionConstraints>>> subjectMap = new HashMap<>();
+    // map datasourceid -> communityid
+    private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>();
+    // map zenodocommunityid -> communityid
+    private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap = new HashMap<>();
+
+    public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() {
+        return subjectMap;
+    }
+
+    public void setSubjectMap(Map<String, List<Pair<String, SelectionConstraints>>> subjectMap) {
+        this.subjectMap = subjectMap;
+    }
+
+    public Map<String, List<Pair<String, SelectionConstraints>>> getDatasourceMap() {
+        return datasourceMap;
+    }
+
+    public void setDatasourceMap(
+        Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap) {
+        this.datasourceMap = datasourceMap;
+    }
+
+    public Map<String, List<Pair<String, SelectionConstraints>>> getZenodocommunityMap() {
+        return zenodocommunityMap;
+    }
+
+    public void setZenodocommunityMap(
+        Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap) {
+        this.zenodocommunityMap = zenodocommunityMap;
+    }
+
+    /**
+     * Creates the configuration and immediately builds the inverse maps.
+     *
+     * @param communities communities indexed by id
+     */
+    CommunityConfiguration(final Map<String, Community> communities) {
+        this.communities = communities;
+        init();
+    }
+
+    /**
+     * Populates the inverse maps from the configured communities. Entries are appended, so calling
+     * this on maps that already hold data adds to them.
+     */
+    void init() {
+
+        // The maps may be null here: presumably when the instance was materialised by Gson without
+        // running the field initialisers (the factory's fromJson calls init() afterwards).
+        if (subjectMap == null) {
+            subjectMap = Maps.newHashMap();
+        }
+        if (datasourceMap == null) {
+            datasourceMap = Maps.newHashMap();
+        }
+        if (zenodocommunityMap == null) {
+            zenodocommunityMap = Maps.newHashMap();
+        }
+
+        for (Community c : getCommunities().values()) {
+            final String id = c.getId();
+            // subjects are indexed lower-cased and trimmed, with an empty constraint set
+            for (String sbj : c.getSubjects()) {
+                Pair<String, SelectionConstraints> p = new Pair<>(id, new SelectionConstraints());
+                add(sbj.toLowerCase().trim(), p, subjectMap);
+            }
+            // datasources are indexed by their OpenAIRE id, as given
+            for (Provider d : c.getProviders()) {
+                add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
+            }
+            // zenodo communities are indexed by their Zenodo id, as given
+            for (ZenodoCommunity zc : c.getZenodoCommunities()) {
+                add(
+                    zc.getZenodoCommunityId(),
+                    new Pair<>(id, zc.getSelCriteria()),
+                    zenodocommunityMap);
+            }
+        }
+    }
+
+    /** Appends {@code value} to the list stored under {@code key}, creating the list when missing. */
+    private void add(
+        String key,
+        Pair<String, SelectionConstraints> value,
+        Map<String, List<Pair<String, SelectionConstraints>>> map) {
+        map.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
+    }
+
+    /** Returns the (community id, constraints) pairs for the subject, or null when none is registered. */
+    public List<Pair<String, SelectionConstraints>> getCommunityForSubject(String sbj) {
+        return subjectMap.get(sbj);
+    }
+
+    /** Returns the (community id, constraints) pairs for the datasource, or null when none is registered. */
+    public List<Pair<String, SelectionConstraints>> getCommunityForDatasource(String dts) {
+        return datasourceMap.get(dts);
+    }
+
+    /**
+     * Returns the ids of the communities associated with the datasource whose constraints are
+     * either absent or verified by {@code param}.
+     *
+     * @param dts datasource id
+     * @param param values handed to the constraints' verifyCriteria
+     * @return the matching community ids; empty when the datasource is unknown
+     */
+    public List<String> getCommunityForDatasource(
+        final String dts, final Map<String, List<String>> param) {
+        List<Pair<String, SelectionConstraints>> lp = datasourceMap.get(dts);
+        if (lp == null)
+            return Lists.newArrayList();
+
+        return lp
+            .stream()
+            .filter(p -> p.getSnd() == null || p.getSnd().verifyCriteria(param))
+            .map(p -> p.getFst())
+            .filter(st -> (st != null))
+            .collect(Collectors.toList());
+    }
+
+    /** Returns the (community id, constraints) pairs for the Zenodo community, or null when none is registered. */
+    public List<Pair<String, SelectionConstraints>> getCommunityForZenodoCommunity(String zc) {
+        return zenodocommunityMap.get(zc);
+    }
+
+    /**
+     * Returns the ids of the communities registered for the subject.
+     * NOTE(review): init() stores subjects lower-cased and trimmed while this lookup uses the value
+     * as given — confirm that callers normalise it first.
+     */
+    public List<String> getCommunityForSubjectValue(String value) {
+
+        return getContextIds(subjectMap.get(value));
+    }
+
+    /**
+     * Returns the ids of the communities registered for the datasource.
+     * NOTE(review): the lookup key is lower-cased while init() stores datasource ids as given —
+     * confirm that configured ids are always lower case.
+     */
+    public List<String> getCommunityForDatasourceValue(String value) {
+
+        return getContextIds(datasourceMap.get(value.toLowerCase()));
+    }
+
+    /**
+     * Returns the ids of the communities registered for the Zenodo community.
+     * NOTE(review): the lookup key is lower-cased while init() stores Zenodo ids as given —
+     * confirm that configured ids are always lower case.
+     */
+    public List<String> getCommunityForZenodoCommunityValue(String value) {
+
+        return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
+    }
+
+    /** Extracts the community ids from the pairs; a null list yields an empty result. */
+    private List<String> getContextIds(List<Pair<String, SelectionConstraints>> list) {
+        if (list != null) {
+            return list.stream().map(p -> p.getFst()).collect(Collectors.toList());
+        }
+        return Lists.newArrayList();
+    }
+
+    public Map<String, Community> getCommunities() {
+        return communities;
+    }
+
+    public void setCommunities(Map<String, Community> communities) {
+        this.communities = communities;
+    }
+
+    /** Serialises the configuration to JSON, using InterfaceAdapter for Selection values. */
+    public String toJson() {
+        GsonBuilder builder = new GsonBuilder();
+        builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
+        Gson gson = builder.create();
+
+        return gson.toJson(this);
+    }
+
+    /** Number of configured communities. */
+    public int size() {
+        return communities.size();
+    }
+
+    public Community getCommunityById(String id) {
+        return communities.get(id);
+    }
+
+    /** Returns a new list holding the configured communities. */
+    public List<Community> getCommunityList() {
+        return Lists.newLinkedList(communities.values());
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java
new file mode 100644
index 000000000..607315f3f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java
@@ -0,0 +1,138 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
+
+/**
+ * Builds {@link CommunityConfiguration} instances from their XML or JSON representation.
+ * Created by miriam on 03/08/2018.
+ */
+public class CommunityConfigurationFactory {
+
+    private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);
+
+    private static final VerbResolver resolver = VerbResolverFactory.newInstance();
+
+    /**
+     * Parses the XML configuration: every {@code community} element becomes a {@link Community};
+     * communities that are not valid (no subjects, providers or Zenodo communities) are dropped.
+     *
+     * NOTE(review): the reader is used with its default settings; if the XML can come from an
+     * untrusted source, DOCTYPE/external entity processing should be disabled.
+     *
+     * @param xml community configuration document
+     * @return the configuration holding the valid communities
+     * @throws DocumentException if the XML cannot be parsed
+     */
+    public static CommunityConfiguration newInstance(final String xml) throws DocumentException {
+
+        log.debug(String.format("parsing community configuration from:\n%s", xml));
+
+        final Document doc = new SAXReader().read(new StringReader(xml));
+
+        final Map<String, Community> communities = Maps.newHashMap();
+
+        for (final Object o : doc.selectNodes("//community")) {
+
+            final Node node = (Node) o;
+
+            final Community community = parseCommunity(node);
+
+            if (community.isValid()) {
+                communities.put(community.getId(), community);
+            }
+        }
+
+        log.info(String.format("loaded %s community configuration profiles", communities.size()));
+        log.debug(String.format("loaded community configuration:\n%s", communities.toString()));
+
+        return new CommunityConfiguration(communities);
+    }
+
+    /**
+     * Deserialises a configuration from JSON (Selection values go through InterfaceAdapter) and then
+     * builds its inverse maps by calling {@code init()}.
+     *
+     * @param json JSON representation of the configuration
+     * @return the configuration
+     */
+    public static CommunityConfiguration fromJson(final String json) {
+        GsonBuilder builder = new GsonBuilder();
+        builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
+        Gson gson = builder.create();
+        final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class);
+        log.info(String.format("loaded %s community configuration profiles", conf.size()));
+        conf.init();
+        log.info("created inverse maps");
+
+        return conf;
+    }
+
+    /** Builds a community from a {@code community} element: id attribute, subjects, datasources, Zenodo communities. */
+    private static Community parseCommunity(final Node node) {
+
+        final Community c = new Community();
+
+        c.setId(node.valueOf("./@id"));
+
+        log.info(String.format("community id: %s", c.getId()));
+
+        c.setSubjects(parseSubjects(node));
+        c.setProviders(parseDatasources(node));
+        c.setZenodoCommunities(parseZenodoCommunities(node));
+        return c;
+    }
+
+    /** Collects the trimmed text of every {@code subjects/subject} child. */
+    private static List<String> parseSubjects(final Node node) {
+
+        final List<String> subjects = Lists.newArrayList();
+
+        final List<Node> list = node.selectNodes("./subjects/subject");
+
+        for (Node n : list) {
+            log.debug("text of the node " + n.getText());
+            subjects.add(StringUtils.trim(n.getText()));
+        }
+        log.info("size of the subject list " + subjects.size());
+        return subjects;
+    }
+
+    /**
+     * Builds a provider for every {@code datasources/datasource} child, from its {@code openaireId}
+     * and {@code selcriteria} children. A datasource without {@code openaireId} fails with a
+     * NullPointerException.
+     */
+    private static List<Provider> parseDatasources(final Node node) {
+        final List<Node> list = node.selectNodes("./datasources/datasource");
+        final List<Provider> providerList = new ArrayList<>();
+        for (Node n : list) {
+            Provider d = new Provider();
+            d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
+            d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
+            providerList.add(d);
+        }
+        log.info("size of the datasource list " + providerList.size());
+        return providerList;
+    }
+
+    /**
+     * Builds a Zenodo community for every {@code zenodocommunities/zenodocommunity} child and, when
+     * the {@code oacommunity} element has non-blank text, appends one more entry for it (without
+     * selection criteria).
+     */
+    private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
+        final Node oacommunitynode = node.selectSingleNode("./oacommunity");
+        String oacommunity = null;
+        if (oacommunitynode != null) {
+            String tmp = oacommunitynode.getText();
+            if (StringUtils.isNotBlank(tmp))
+                oacommunity = tmp;
+        }
+
+        final List<Node> list = node.selectNodes("./zenodocommunities/zenodocommunity");
+        final List<ZenodoCommunity> zenodoCommunityList = new ArrayList<>();
+        for (Node n : list) {
+            ZenodoCommunity zc = new ZenodoCommunity();
+            zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText());
+            zc.setSelCriteria(n.selectSingleNode("./selcriteria"));
+
+            zenodoCommunityList.add(zc);
+        }
+        if (oacommunity != null) {
+            ZenodoCommunity zc = new ZenodoCommunity();
+            zc.setZenodoCommunityId(oacommunity);
+            zenodoCommunityList.add(zc);
+        }
+        log.info("size of the zenodo community list " + zenodoCommunityList.size());
+        return zenodoCommunityList;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
new file mode 100644
index 000000000..e0856ae8f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+/**
+ * A single selection constraint: a verb, the name of the field it is evaluated on and the reference value.
+ *
+ * The verb and value are turned into an executable {@link Selection} either explicitly through
+ * {@link #setSelection(Selection)} or by asking a {@link VerbResolver}. {@link #verifyCriteria(String)} must only be
+ * called once a selection has been set.
+ */
+public class Constraint implements Serializable {
+	private String verb;
+	private String field;
+	private String value;
+	private Selection selection;
+
+	public Constraint() {
+	}
+
+	/** @return the name of the verb to apply */
+	public String getVerb() {
+		return this.verb;
+	}
+
+	public void setVerb(String verb) {
+		this.verb = verb;
+	}
+
+	/** @return the name of the field the constraint is evaluated on */
+	public String getField() {
+		return this.field;
+	}
+
+	public void setField(String field) {
+		this.field = field;
+	}
+
+	/** @return the reference value handed to the verb */
+	public String getValue() {
+		return this.value;
+	}
+
+	public void setValue(String value) {
+		this.value = value;
+	}
+
+	/** Installs an already built selection. */
+	public void setSelection(Selection sel) {
+		this.selection = sel;
+	}
+
+	/**
+	 * Builds the selection for the current verb and value through the given resolver.
+	 *
+	 * @param resolver the resolver used to instantiate the selection
+	 */
+	public void setSelection(VerbResolver resolver)
+		throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException {
+		final Selection resolved = resolver.getSelectionCriteria(this.verb, this.value);
+		this.selection = resolved;
+	}
+
+	/**
+	 * Applies the selection to one metadata value.
+	 *
+	 * @param metadata the value to check
+	 * @return the outcome of the selection
+	 */
+	public boolean verifyCriteria(String metadata) {
+		return this.selection.apply(metadata);
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java
new file mode 100644
index 000000000..b56dfaaa3
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java
@@ -0,0 +1,74 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Type;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+/**
+ * A group of {@link Constraint}s that must all be satisfied (logical AND).
+ *
+ * Created by miriam on 02/08/2018.
+ */
+public class Constraints implements Serializable {
+ private static final Log log = LogFactory.getLog(Constraints.class);
+ // private ConstraintEncapsulator ce;
+ // the constraints of this group; null until set or deserialized
+ private List constraint;
+
+ public Constraints() {
+ }
+
+ public List getConstraint() {
+ return constraint;
+ }
+
+ public void setConstraint(List constraint) {
+ this.constraint = constraint;
+ }
+
+ /**
+ * Replaces the constraints with the ones deserialized from the given JSON.
+ *
+ * @param json the JSON representation of the collection of constraints
+ */
+ public void setSc(String json) {
+ Type collectionType = new TypeToken>() {
+ }.getType();
+ constraint = new Gson().fromJson(json, collectionType);
+ }
+
+ /**
+ * Asks every constraint to build its selection through the given resolver.
+ *
+ * Reflection failures are logged and skipped, so the constraint concerned is left without a selection.
+ * NOTE(review): only the exception message is logged (the stack trace is lost), and such a constraint will
+ * fail later in verifyCriteria - confirm this best-effort behaviour is intended.
+ */
+ void setSelection(VerbResolver resolver) {
+ for (Constraint st : constraint) {
+
+ try {
+ st.setSelection(resolver);
+ } catch (NoSuchMethodException e) {
+ log.error(e.getMessage());
+ } catch (IllegalAccessException e) {
+ log.error(e.getMessage());
+ } catch (InvocationTargetException e) {
+ log.error(e.getMessage());
+ } catch (InstantiationException e) {
+ log.error(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Checks the constraints in AND: each constraint must be verified by at least one of the values found in
+ * {@code param} under the constraint's field name.
+ *
+ * @param param the values of the result, keyed by field name
+ * @return true when every constraint is verified by at least one value (also when there is no constraint)
+ */
+ public boolean verifyCriteria(final Map> param) {
+
+ for (Constraint sc : constraint) {
+ boolean verified = false;
+ // NOTE(review): param.get returns null for a field that is not in the map, which would throw here;
+ // the loop also keeps scanning after the first matching value
+ for (String value : param.get(sc.getField())) {
+ if (sc.verifyCriteria(value.trim())) {
+ verified = true;
+ }
+ }
+ // one unverified constraint is enough to fail the whole group
+ if (!verified)
+ return verified;
+ }
+ return true;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java
new file mode 100644
index 000000000..50e1836fa
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java
@@ -0,0 +1,39 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+
+import com.google.gson.Gson;
+
+/**
+ * Mutable holder of two values, with fluent setters and a JSON rendering.
+ *
+ * Created by miriam on 03/08/2018.
+ */
+public class Pair implements Serializable {
+ private A fst;
+ private B snd;
+
+ /** @return the first element */
+ public A getFst() {
+ return fst;
+ }
+
+ /** Sets the first element and returns this pair, so that calls can be chained. */
+ public Pair setFst(A fst) {
+ this.fst = fst;
+ return this;
+ }
+
+ /** @return the second element */
+ public B getSnd() {
+ return snd;
+ }
+
+ /** Sets the second element and returns this pair, so that calls can be chained. */
+ public Pair setSnd(B snd) {
+ this.snd = snd;
+ return this;
+ }
+
+ /**
+ * @param a the first element
+ * @param b the second element
+ */
+ public Pair(A a, B b) {
+ fst = a;
+ snd = b;
+ }
+
+ /** @return the Gson serialization of this pair */
+ public String toJson() {
+ return new Gson().toJson(this);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java
new file mode 100644
index 000000000..fd7481719
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java
@@ -0,0 +1,12 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.util.HashMap;
+
+/**
+ * A {@link HashMap} under a dedicated type name; it adds no state or behaviour of its own.
+ *
+ * NOTE(review): presumably the target type when the field-to-path mapping is deserialized from JSON - confirm
+ * against the callers.
+ */
+public class ProtoMap extends HashMap implements Serializable {
+
+ /** Creates an empty map. */
+ public ProtoMap() {
+ super();
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java
new file mode 100644
index 000000000..b9c37f4dc
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java
@@ -0,0 +1,61 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Node;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+/**
+ * A datasource associated with a community: its OpenAIRE identifier plus the optional selection constraints a
+ * result must satisfy.
+ *
+ * Created by miriam on 01/08/2018.
+ */
+public class Provider implements Serializable {
+	private static final Log log = LogFactory.getLog(Provider.class);
+
+	private String openaireId;
+
+	private SelectionConstraints selectionConstraints;
+
+	/** @return the selection constraints, or null when none are set (same value as {@link #getSelectionConstraints()}) */
+	public SelectionConstraints getSelCriteria() {
+		return selectionConstraints;
+	}
+
+	/** @return the selection constraints, or null when none are set */
+	public SelectionConstraints getSelectionConstraints() {
+		return selectionConstraints;
+	}
+
+	public void setSelectionConstraints(SelectionConstraints selectionConstraints) {
+		this.selectionConstraints = selectionConstraints;
+	}
+
+	public void setSelCriteria(SelectionConstraints selCriteria) {
+		this.selectionConstraints = selCriteria;
+	}
+
+	public String getOpenaireId() {
+		return openaireId;
+	}
+
+	public void setOpenaireId(String openaireId) {
+		this.openaireId = openaireId;
+	}
+
+	/**
+	 * Parses the JSON selection constraints and binds their verbs through the resolver.
+	 *
+	 * The field is assigned only once the constraints are fully initialised, so a failure never leaves a
+	 * half-configured value behind.
+	 */
+	private void setSelCriteria(String json, VerbResolver resolver) {
+		log.info("Selection constraints for datasource = " + json);
+		final SelectionConstraints parsed = new Gson().fromJson(json, SelectionConstraints.class);
+		if (parsed == null) {
+			// Gson returns null for null or empty input: there are simply no constraints
+			log.info("not set selection criteria... ");
+			selectionConstraints = null;
+			return;
+		}
+		parsed.setSelection(resolver);
+		selectionConstraints = parsed;
+	}
+
+	/**
+	 * Reads the selection constraints from the text of the given node.
+	 *
+	 * This is best effort: when the node is missing, empty, or holds constraints that cannot be parsed or
+	 * resolved, the constraints are set to null and no exception is propagated.
+	 *
+	 * @param n the node holding the JSON constraints, may be null
+	 * @param resolver the resolver used to bind the verbs of the constraints
+	 */
+	public void setSelCriteria(Node n, VerbResolver resolver) {
+		if (n == null) {
+			// a missing node is an expected case, not an error
+			log.info("not set selection criteria... ");
+			selectionConstraints = null;
+			return;
+		}
+		try {
+			setSelCriteria(n.getText(), resolver);
+		} catch (RuntimeException e) {
+			// keep the cause in the log so that a malformed configuration can be diagnosed
+			log.warn("not set selection criteria... ", e);
+			selectionConstraints = null;
+		}
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java
new file mode 100644
index 000000000..7ec2f916f
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.util.List;
+
+import org.dom4j.DocumentException;
+
+import com.google.common.base.Joiner;
+
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+/**
+ * Reads the community configuration from the information system reachable through the lookup service.
+ */
+public class QueryInformationSystem {
+ // XQuery run against the context profiles whose type is 'community' or 'ri'. For each of them it returns the
+ // context id, the comma separated subjects split into single values, the content providers flagged as
+ // enabled (openaireId and selcriteria) and the zenodo communities (zenodoid and selcriteria).
+ // $organizations is bound but not used in the return clause.
+ // NOTE(review): the literals of the return clause show only blanks around the enclosed expressions, so the
+ // element markup the parser relies on is not visible here - confirm against the original file.
+ private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+ + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
+ + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
+ + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
+ + " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
+ + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] "
+ + " return "
+ + " "
+ + " { $x//CONFIGURATION/context/@id} "
+ + " "
+ + " {for $y in tokenize($subj,',') "
+ + " return "
+ + " {$y}} "
+ + " "
+ + " "
+ + " {for $d in $datasources "
+ + " where $d/param[./@name='enabled']/text()='true' "
+ + " return "
+ + " "
+ + " "
+ + " {$d//param[./@name='openaireId']/text()} "
+ + " "
+ + " "
+ + " {$d/param[./@name='selcriteria']/text()} "
+ + " "
+ + " } "
+ + " "
+ + " "
+ + " {for $zc in $communities "
+ + " return "
+ + " "
+ + " "
+ + " {$zc/param[./@name='zenodoid']/text()} "
+ + " "
+ + " "
+ + " {$zc/param[./@name='selcriteria']/text()} "
+ + " "
+ + " } "
+ + " "
+ + " ";
+
+ /**
+ * Runs {@link #XQUERY} on the lookup service and builds the community configuration from the joined result.
+ *
+ * @param isLookupUrl the address of the lookup service
+ * @return the configuration built by {@code CommunityConfigurationFactory.newInstance}
+ * @throws ISLookUpException declared for failures of the lookup call
+ * @throws DocumentException declared for failures while building the configuration from the result
+ */
+ public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
+ throws ISLookUpException, DocumentException {
+ ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+ final List res = isLookUp.quickSearchProfile(XQUERY);
+
+ // the single results are joined with a blank into one document
+ // NOTE(review): the prefix and suffix literals are empty as shown - confirm whether a wrapping root element
+ // is expected here
+ final String xmlConf = "" + Joiner.on(" ").join(res) + "";
+
+ return CommunityConfigurationFactory.newInstance(xmlConf);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
new file mode 100644
index 000000000..f5a985d15
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
@@ -0,0 +1,247 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.gson.Gson;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+
+/**
+ * Adds to a result the contexts of the communities it belongs to, according to a {@link CommunityConfiguration}.
+ *
+ * A community is associated with a result through one of its subjects, through the datasources of its instances,
+ * or through a zenodo community already present among its contexts. Each association is recorded as a
+ * {@link DataInfo} on the corresponding context.
+ *
+ * Created by miriam on 02/08/2018.
+ */
+public class ResultTagger implements Serializable {
+
+ // not read anywhere in this class
+ private String trust = "0.8";
+
+ /**
+ * Removes from the result every context whose id contains ZENODO_COMMUNITY_INDICATOR.
+ *
+ * NOTE(review): the context list is dereferenced without a null check, unlike in enrichContextCriteria where
+ * it is wrapped in an Optional - confirm that results always carry a context list.
+ *
+ * @return true when at least one context was removed
+ */
+ private boolean clearContext(Result result) {
+ int tmp = result.getContext().size();
+ List clist = result
+ .getContext()
+ .stream()
+ .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
+ .collect(Collectors.toList());
+ result.setContext(clist);
+ return (tmp != clist.size());
+ }
+
+ /**
+ * Extracts from the result the values to be checked against the selection constraints.
+ *
+ * The result is serialized to JSON and every entry of {@code params} (name mapped to a JsonPath expression) is
+ * evaluated on it. A path that is not found yields an empty list for that name.
+ *
+ * @param result the result to read the values from
+ * @param params the paths to evaluate keyed by name; null is treated as empty
+ * @return the values found, keyed by the names in {@code params}
+ */
+ private Map> getParamMap(final Result result, Map params) {
+ Map> param = new HashMap<>();
+ String json = new Gson().toJson(result, Result.class);
+ DocumentContext jsonContext = JsonPath.parse(json);
+ if (params == null) {
+ params = new HashMap<>();
+ }
+ for (String key : params.keySet()) {
+ try {
+ param.put(key, jsonContext.read(params.get(key)));
+ } catch (com.jayway.jsonpath.PathNotFoundException e) {
+ // a missing path is not an error: the field simply has no values
+ param.put(key, new ArrayList<>());
+ // throw e;
+ }
+ }
+ return param;
+ }
+
+ /**
+ * Tags the result with the contexts of the communities selected by subject, datasource and zenodo community.
+ *
+ * Results flagged as deleted by inference only get their zenodo community contexts removed. For every other
+ * result the zenodo community contexts are removed as well, the contexts already present that correspond to
+ * a selected community get the matching provenance added, and one new context is appended for each selected
+ * community not yet present.
+ *
+ * @param result the result to tag; it is modified in place
+ * @param conf the community configuration
+ * @param criteria the paths of the fields used by the datasource selection constraints, keyed by field name
+ * @return the same result instance
+ */
+ public R enrichContextCriteria(
+ final R result, final CommunityConfiguration conf, final Map criteria) {
+
+ // }
+ // public Result enrichContextCriteria(final Result result, final CommunityConfiguration
+ // conf, final Map criteria) {
+ final Map> param = getParamMap(result, criteria);
+
+ // Verify if the entity is deletedbyinference. In case verify if to clean the context list
+ // from all the zenodo communities
+ // NOTE(review): getDataInfo() is dereferenced without a null check - confirm it is always set
+ if (result.getDataInfo().getDeletedbyinference()) {
+ clearContext(result);
+ return result;
+ }
+
+ // communities contains all the communities to be added as context for the result
+ final Set communities = new HashSet<>();
+
+ // tagging for Subject
+ final Set subjects = new HashSet<>();
+ Optional> oresultsubj = Optional.ofNullable(result.getSubject());
+ if (oresultsubj.isPresent()) {
+ // subjects are compared lower-cased and trimmed, each distinct value only once
+ oresultsubj
+ .get()
+ .stream()
+ .map(subject -> subject.getValue())
+ .filter(StringUtils::isNotBlank)
+ .map(String::toLowerCase)
+ .map(String::trim)
+ .collect(Collectors.toCollection(HashSet::new))
+ .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
+ }
+
+ communities.addAll(subjects);
+
+ // Tagging for datasource
+ final Set datasources = new HashSet<>();
+ // NOTE(review): tmp is filled below but never read
+ final Set tmp = new HashSet<>();
+
+ Optional> oresultinstance = Optional.ofNullable(result.getInstance());
+ if (oresultinstance.isPresent()) {
+ for (Instance i : oresultinstance.get()) {
+ tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
+ tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
+ }
+
+ // both the collectedfrom and the hostedby datasource of every instance are considered; the part of
+ // the key following the first '|' is the identifier looked up in the configuration
+ oresultinstance
+ .get()
+ .stream()
+ .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
+ .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
+ .map(s -> StringUtils.substringAfter(s, "|"))
+ .collect(Collectors.toCollection(HashSet::new))
+ .forEach(
+ dsId -> datasources
+ .addAll(
+ conf.getCommunityForDatasource(dsId, param)));
+ }
+
+ communities.addAll(datasources);
+
+ /* Tagging for Zenodo Communities */
+ final Set czenodo = new HashSet<>();
+
+ Optional> oresultcontext = Optional.ofNullable(result.getContext());
+ if (oresultcontext.isPresent()) {
+ // the zenodo community identifier is the trimmed part of the context id after the last '/'
+ oresultcontext
+ .get()
+ .stream()
+ .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
+ .collect(Collectors.toList())
+ .forEach(
+ c -> czenodo
+ .addAll(
+ conf
+ .getCommunityForZenodoCommunityValue(
+ c
+ .getId()
+ .substring(
+ c.getId().lastIndexOf("/") + 1)
+ .trim())));
+ }
+
+ communities.addAll(czenodo);
+
+ // the zenodo community contexts have been translated into communities above and are dropped
+ // NOTE(review): clearContext throws when the context list is null, a case tolerated just above
+ clearContext(result);
+
+ /* Verify if there is something to bulktag */
+ if (communities.isEmpty()) {
+ return result;
+ }
+
+ // add the provenance to the contexts already present that correspond to a selected community
+ // NOTE(review): the collected list is discarded; the pipeline is run only for the in-place changes made
+ // by the mapping function
+ result
+ .getContext()
+ .stream()
+ .map(
+ c -> {
+ if (communities.contains(c.getId())) {
+ Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo());
+ List dataInfoList;
+ if (opt_dataInfoList.isPresent())
+ dataInfoList = opt_dataInfoList.get();
+ else {
+ dataInfoList = new ArrayList<>();
+ c.setDataInfo(dataInfoList);
+ }
+ if (subjects.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_SUBJECT,
+ CLASS_NAME_BULKTAG_SUBJECT));
+ if (datasources.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_DATASOURCE,
+ CLASS_NAME_BULKTAG_DATASOURCE));
+ if (czenodo.contains(c.getId()))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_CZENODO,
+ CLASS_NAME_BULKTAG_ZENODO));
+ }
+ return c;
+ })
+ .collect(Collectors.toList());
+
+ // what remains are the communities that have no context in the result yet
+ communities
+ .removeAll(
+ result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));
+
+ if (communities.isEmpty())
+ return result;
+
+ // create one new context, with its provenance, for each remaining community
+ List toaddcontext = communities
+ .stream()
+ .map(
+ c -> {
+ Context context = new Context();
+ context.setId(c);
+ List dataInfoList = new ArrayList<>();
+ if (subjects.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_SUBJECT,
+ CLASS_NAME_BULKTAG_SUBJECT));
+ if (datasources.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_DATASOURCE,
+ CLASS_NAME_BULKTAG_DATASOURCE));
+ if (czenodo.contains(c))
+ dataInfoList
+ .add(
+ getDataInfo(
+ BULKTAG_DATA_INFO_TYPE,
+ CLASS_ID_CZENODO,
+ CLASS_NAME_BULKTAG_ZENODO));
+ context.setDataInfo(dataInfoList);
+ return context;
+ })
+ .collect(Collectors.toList());
+
+ result.getContext().addAll(toaddcontext);
+ return result;
+ }
+
+ /**
+ * Creates the provenance information of an inferred association.
+ *
+ * @param inference_provenance the inference provenance
+ * @param inference_class_id the class id of the provenance action
+ * @param inference_class_name the class name of the provenance action
+ * @return a DataInfo flagged as inferred, carrying the given provenance and provenance action
+ */
+ public static DataInfo getDataInfo(
+ String inference_provenance, String inference_class_id, String inference_class_name) {
+ DataInfo di = new DataInfo();
+ di.setInferred(true);
+ di.setInferenceprovenance(inference_provenance);
+ di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
+ return di;
+ }
+
+ /**
+ * Creates a qualifier in the DNET_PROVENANCE_ACTIONS scheme.
+ *
+ * @param inference_class_id the class id
+ * @param inference_class_name the class name
+ * @return the qualifier, with DNET_PROVENANCE_ACTIONS as both scheme id and scheme name
+ */
+ public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
+ Qualifier pa = new Qualifier();
+ pa.setClassid(inference_class_id);
+ pa.setClassname(inference_class_name);
+ pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
+ pa.setSchemename(DNET_PROVENANCE_ACTIONS);
+ return pa;
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java
new file mode 100644
index 000000000..71ff61d1b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java
@@ -0,0 +1,51 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+import java.lang.reflect.Type;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+
+/**
+ * A list of {@link Constraints} groups evaluated in OR: the selection is satisfied as soon as one group is.
+ */
+public class SelectionConstraints implements Serializable {
+ // the alternative groups of constraints; null until set or deserialized
+ private List criteria;
+
+ public SelectionConstraints() {
+ }
+
+ public List getCriteria() {
+ return criteria;
+ }
+
+ public void setCriteria(List criteria) {
+ this.criteria = criteria;
+ }
+
+ /**
+ * Replaces the criteria with the ones deserialized from the given JSON.
+ *
+ * @param json the JSON representation of the collection of constraint groups
+ */
+ public void setSc(String json) {
+ Type collectionType = new TypeToken>() {
+ }.getType();
+ criteria = new Gson().fromJson(json, collectionType);
+ }
+
+ /**
+ * Checks the groups in OR.
+ *
+ * @param param the values of the result, keyed by field name
+ * @return true when at least one group is verified, false otherwise (also when there is no group)
+ */
+ public boolean verifyCriteria(final Map> param) {
+ for (Constraints selc : criteria) {
+ if (selc.verifyCriteria(param)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Propagates the resolver to every group so that each constraint can build its selection.
+ *
+ * NOTE(review): criteria is dereferenced without a null check, so this throws when no criteria were set.
+ */
+ public void setSelection(VerbResolver resolver) {
+
+ for (Constraints cs : criteria) {
+ cs.setSelection(resolver);
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java
new file mode 100644
index 000000000..3cdc7c941
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java
@@ -0,0 +1,17 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+/**
+ * Constants used to record the provenance of the contexts added by the bulk tagging.
+ *
+ * This class only holds constants and cannot be instantiated or extended.
+ */
+public final class TaggingConstants {
+
+	/** Value set as inference provenance of the bulk tagging. */
+	public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";
+
+	/** Provenance action class ids, one for each way a community can be associated with a result. */
+	public static final String CLASS_ID_SUBJECT = "community:subject";
+	public static final String CLASS_ID_DATASOURCE = "community:datasource";
+	public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
+
+	/** Substring identifying the contexts that refer to a zenodo community. */
+	public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
+
+	/** Provenance action class names matching the class ids above. */
+	public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
+	public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
+	public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
+
+	private TaggingConstants() {
+		// constants holder, not meant to be instantiated
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java
new file mode 100644
index 000000000..bc6b75fba
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java
@@ -0,0 +1,45 @@
+
+package eu.dnetlib.dhp.bulktag.community;
+
+import java.io.Serializable;
+
+import org.dom4j.Node;
+
+import com.google.gson.Gson;
+
+/**
+ * A zenodo community associated with a community: its identifier plus the optional selection constraints.
+ *
+ * Created by miriam on 01/08/2018.
+ */
+public class ZenodoCommunity implements Serializable {
+
+	private String zenodoCommunityId;
+
+	private SelectionConstraints selCriteria;
+
+	/** @return the identifier of the zenodo community */
+	public String getZenodoCommunityId() {
+		return this.zenodoCommunityId;
+	}
+
+	public void setZenodoCommunityId(String zenodoCommunityId) {
+		this.zenodoCommunityId = zenodoCommunityId;
+	}
+
+	/** @return the selection constraints, or null when none are set */
+	public SelectionConstraints getSelCriteria() {
+		return this.selCriteria;
+	}
+
+	public void setSelCriteria(SelectionConstraints selCriteria) {
+		this.selCriteria = selCriteria;
+	}
+
+	/** Deserializes the selection constraints from their JSON representation. */
+	private void setSelCriteria(String json) {
+		final Gson gson = new Gson();
+		this.selCriteria = gson.fromJson(json, SelectionConstraints.class);
+	}
+
+	/**
+	 * Reads the selection constraints from the text of the given node.
+	 *
+	 * @param n the node holding the JSON constraints; when null the constraints are cleared
+	 */
+	public void setSelCriteria(Node n) {
+		if (n != null) {
+			setSelCriteria(n.getText());
+			return;
+		}
+		this.selCriteria = null;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java
new file mode 100644
index 000000000..496630fa3
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("contains")
+public class ContainsVerb implements Selection, Serializable {
+
+	// the substring looked for
+	private String param;
+
+	public ContainsVerb() {
+	}
+
+	/**
+	 * @param param the substring looked for
+	 */
+	public ContainsVerb(final String param) {
+		this.param = param;
+	}
+
+	public String getParam() {
+		return this.param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case sensitive containment check.
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} contains the configured parameter
+	 */
+	@Override
+	public boolean apply(String value) {
+		final boolean found = value.contains(this.param);
+		return found;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java
new file mode 100644
index 000000000..a4a6f5663
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("contains_ignorecase")
+public class ContainsVerbIgnoreCase implements Selection, Serializable {
+
+	// the substring looked for, compared ignoring the case
+	private String param;
+
+	public ContainsVerbIgnoreCase() {
+	}
+
+	/**
+	 * @param param the substring looked for
+	 */
+	public ContainsVerbIgnoreCase(final String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case insensitive containment check.
+	 *
+	 * Both strings are lower-cased with {@code Locale.ROOT}, so that the outcome does not depend on the default
+	 * locale of the JVM (with a Turkish locale, for instance, "I" is not lower-cased to "i").
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} contains the configured parameter, ignoring the case
+	 */
+	@Override
+	public boolean apply(String value) {
+		final String haystack = value.toLowerCase(java.util.Locale.ROOT);
+		final String needle = param.toLowerCase(java.util.Locale.ROOT);
+		return haystack.contains(needle);
+	}
+
+	public String getParam() {
+		return param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java
new file mode 100644
index 000000000..b9088d012
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("equals")
+public class EqualVerb implements Selection, Serializable {
+
+	// the value to be matched exactly
+	private String param;
+
+	public EqualVerb() {
+	}
+
+	/**
+	 * @param param the value to be matched
+	 */
+	public EqualVerb(final String param) {
+		this.param = param;
+	}
+
+	public String getParam() {
+		return this.param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case sensitive equality check.
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} equals the configured parameter
+	 */
+	@Override
+	public boolean apply(String value) {
+		final boolean same = value.equals(this.param);
+		return same;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java
new file mode 100644
index 000000000..c5f0ce070
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("equals_ignorecase")
+public class EqualVerbIgnoreCase implements Selection, Serializable {
+
+	// the value to be matched, compared ignoring the case
+	private String param;
+
+	public EqualVerbIgnoreCase() {
+	}
+
+	/**
+	 * @param param the value to be matched
+	 */
+	public EqualVerbIgnoreCase(final String param) {
+		this.param = param;
+	}
+
+	public String getParam() {
+		return this.param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case insensitive equality check.
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} equals the configured parameter, ignoring the case
+	 */
+	@Override
+	public boolean apply(String value) {
+		final boolean same = value.equalsIgnoreCase(this.param);
+		return same;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java
new file mode 100644
index 000000000..e9b948b2b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.lang.reflect.Type;
+
+import com.google.gson.*;
+
+/**
+ * Gson adapter for values declared through an interface: the concrete class name is written next to the data
+ * and used to pick the class to instantiate when reading.
+ *
+ * The serialized form is an object with two properties: {@code CLASSNAME}, the fully qualified name of the
+ * concrete class, and {@code DATA}, the serialization of the value itself.
+ *
+ * NOTE(review): the class named in the JSON is loaded with Class.forName without any restriction, so this
+ * adapter should only be used on trusted input.
+ */
+public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {
+
+	private static final String CLASSNAME = "CLASSNAME";
+	private static final String DATA = "DATA";
+
+	/**
+	 * Rebuilds the value as an instance of the class named in the {@code CLASSNAME} property.
+	 *
+	 * @throws JsonParseException when the {@code CLASSNAME} property is missing, is not a primitive, or names
+	 *         a class that cannot be found
+	 */
+	@Override
+	public Object deserialize(
+		JsonElement jsonElement,
+		Type type,
+		JsonDeserializationContext jsonDeserializationContext)
+		throws JsonParseException {
+
+		JsonObject jsonObject = jsonElement.getAsJsonObject();
+		JsonElement classNameElement = jsonObject.get(CLASSNAME);
+		// report malformed input as a parse error rather than as a NullPointerException or ClassCastException
+		if (classNameElement == null || !classNameElement.isJsonPrimitive()) {
+			throw new JsonParseException("missing or invalid " + CLASSNAME + " property");
+		}
+		Class klass = getObjectClass(classNameElement.getAsString());
+		return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass);
+	}
+
+	/** Writes the value together with the name of its runtime class. */
+	@Override
+	public JsonElement serialize(
+		Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) {
+		JsonObject jsonObject = new JsonObject();
+		jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName());
+		jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement));
+		return jsonObject;
+	}
+
+	/**
+	 * Loads the class of the object to be deserialized.
+	 *
+	 * @param className the fully qualified class name
+	 * @return the class with the given name
+	 * @throws JsonParseException when the class cannot be found; the original exception is kept as cause
+	 */
+	public Class getObjectClass(String className) {
+		try {
+			return Class.forName(className);
+		} catch (ClassNotFoundException e) {
+			throw new JsonParseException(e.getMessage(), e);
+		}
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java
new file mode 100644
index 000000000..03ec9804b
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_contains")
+public class NotContainsVerb implements Selection, Serializable {
+
+	// the substring that must be absent
+	private String param;
+
+	public NotContainsVerb() {
+	}
+
+	/**
+	 * @param param the substring that must be absent
+	 */
+	public NotContainsVerb(final String param) {
+		this.param = param;
+	}
+
+	public String getParam() {
+		return this.param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case sensitive check for the absence of a substring.
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} does not contain the configured parameter
+	 */
+	@Override
+	public boolean apply(String value) {
+		final boolean found = value.contains(this.param);
+		return !found;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java
new file mode 100644
index 000000000..b21be83f0
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_contains_ignorecase")
+public class NotContainsVerbIgnoreCase implements Selection, Serializable {
+
+	// the substring that must be absent, compared ignoring the case
+	private String param;
+
+	public NotContainsVerbIgnoreCase() {
+	}
+
+	/**
+	 * @param param the substring that must be absent
+	 */
+	public NotContainsVerbIgnoreCase(final String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case insensitive check for the absence of a substring.
+	 *
+	 * Both strings are lower-cased with {@code Locale.ROOT}, so that the outcome does not depend on the default
+	 * locale of the JVM (with a Turkish locale, for instance, "I" is not lower-cased to "i").
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} does not contain the configured parameter, ignoring the case
+	 */
+	@Override
+	public boolean apply(String value) {
+		final String haystack = value.toLowerCase(java.util.Locale.ROOT);
+		final String needle = param.toLowerCase(java.util.Locale.ROOT);
+		return !haystack.contains(needle);
+	}
+
+	public String getParam() {
+		return param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java
new file mode 100644
index 000000000..86bf00012
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_equals")
+public class NotEqualVerb implements Selection, Serializable {
+
+	// the value that must not be matched
+	private String param;
+
+	public NotEqualVerb() {
+	}
+
+	/**
+	 * @param param the value that must not be matched
+	 */
+	public NotEqualVerb(final String param) {
+		this.param = param;
+	}
+
+	public String getParam() {
+		return this.param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case sensitive inequality check.
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} differs from the configured parameter
+	 */
+	@Override
+	public boolean apply(String value) {
+		final boolean same = value.equals(this.param);
+		return !same;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java
new file mode 100644
index 000000000..c6958a641
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java
@@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+@VerbClass("not_equals_ignorecase")
+public class NotEqualVerbIgnoreCase implements Selection, Serializable {
+
+	// the value that must not be matched, compared ignoring the case
+	private String param;
+
+	public NotEqualVerbIgnoreCase() {
+	}
+
+	/**
+	 * @param param the value that must not be matched
+	 */
+	public NotEqualVerbIgnoreCase(final String param) {
+		this.param = param;
+	}
+
+	public String getParam() {
+		return this.param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Case insensitive inequality check.
+	 *
+	 * @param value the value to check
+	 * @return true when {@code value} differs from the configured parameter, ignoring the case
+	 */
+	@Override
+	public boolean apply(String value) {
+		final boolean same = value.equalsIgnoreCase(this.param);
+		return !same;
+	}
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java
new file mode 100644
index 000000000..ec9fb716d
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java
@@ -0,0 +1,7 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+/**
+ * A predicate over a single string value, used to evaluate one selection constraint.
+ */
+@FunctionalInterface
+public interface Selection {
+
+	/**
+	 * Evaluates the criterion on the given value.
+	 *
+	 * @param value the value to check
+	 * @return true when the value satisfies the criterion
+	 */
+	boolean apply(String value);
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java
new file mode 100644
index 000000000..5b35919bd
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java
@@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Marks a class as the implementation of a verb; the value is the verb name.
+ *
+ * The annotation is retained at runtime and can only be placed on types.
+ */
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+@interface VerbClass {
+
+ /** @return the verb name */
+ String value();
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java
new file mode 100644
index 000000000..3d0db2063
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java
@@ -0,0 +1,56 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import io.github.classgraph.ClassGraph;
+import io.github.classgraph.ClassInfo;
+import io.github.classgraph.ClassInfoList;
+import io.github.classgraph.ScanResult;
+
+/** Resolves verb names (the {@code VerbClass} annotation value) to the annotated {@link Selection} classes. */
+public class VerbResolver implements Serializable {
+    private static final String CRITERIA_PACKAGE = "eu.dnetlib.dhp.bulktag.criteria";
+    private Map<String, Class<Selection>> map = null;
+
+    /**
+     * Scans the criteria package and registers every class annotated with {@code VerbClass}.
+     *
+     * @throws IllegalStateException if the scan or the loading of a verb class fails
+     */
+    @SuppressWarnings("unchecked")
+    public VerbResolver() {
+        // The scanner is needed only here, so it is a local and not a field of this Serializable class.
+        try (ScanResult scanResult = new ClassGraph()
+            .verbose() // log the scan to stderr
+            .enableAllInfo() // scan classes, methods, fields, annotations
+            .whitelistPackages(CRITERIA_PACKAGE) // this package and its subpackages
+            .scan()) {
+            ClassInfoList verbClasses = scanResult.getClassesWithAnnotation(CRITERIA_PACKAGE + ".VerbClass");
+            // Key: first parameter of the first annotation found on the class (the verb name); value: the class.
+            this.map = verbClasses.stream().collect(Collectors.toMap(
+                info -> (String) ((ClassInfo) info).getAnnotationInfo().get(0).getParameterValues().get(0).getValue(),
+                info -> (Class<Selection>) ((ClassInfo) info).loadClass()));
+        } catch (Exception e) {
+            // Fail fast: without the map every later lookup would end in a NullPointerException.
+            throw new IllegalStateException("Unable to scan selection verbs in " + CRITERIA_PACKAGE, e);
+        }
+    }
+
+    /**
+     * Instantiates the verb registered as {@code name} through its (String) constructor, passing {@code param}.
+     *
+     * @throws IllegalArgumentException if no verb is registered as {@code name}
+     */
+    public Selection getSelectionCriteria(String name, String param)
+        throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, InstantiationException {
+        final Class<Selection> verb = map.get(name);
+        if (verb == null) {
+            throw new IllegalArgumentException("Unknown selection verb: " + name);
+        }
+        return verb.getDeclaredConstructor(String.class).newInstance(param);
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java
new file mode 100644
index 000000000..0bb801999
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java
@@ -0,0 +1,10 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+public class VerbResolverFactory {
+    /** Creates a {@link VerbResolver}; every call builds a new instance. */
+    public static VerbResolver newInstance() {
+        final VerbResolver resolver = new VerbResolver();
+        return resolver;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java
new file mode 100644
index 000000000..271cc6bb3
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+
+public class CountrySbs implements Serializable { // serializable bean: the (classid, classname) pair of a country
+    private String classid;
+    private String classname;
+    /** Returns the class id (null until set). */
+    public String getClassid() {
+        return this.classid;
+    }
+    /** Returns the class name (null until set). */
+    public String getClassname() {
+        return this.classname;
+    }
+    /** Stores the class id. */
+    public void setClassid(final String classid) {
+        this.classid = classid;
+    }
+    /** Stores the class name. */
+    public void setClassname(final String classname) {
+        this.classname = classname;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java
new file mode 100644
index 000000000..642192f73
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+
+public class DatasourceCountry implements Serializable { // serializable bean pairing a datasource id with a country
+    private String dataSourceId;
+    private CountrySbs country;
+    /** Returns the datasource id (null until set). */
+    public String getDataSourceId() {
+        return this.dataSourceId;
+    }
+    /** Returns the country (null until set). */
+    public CountrySbs getCountry() {
+        return this.country;
+    }
+    /** Stores the datasource id. */
+    public void setDataSourceId(final String dataSourceId) {
+        this.dataSourceId = dataSourceId;
+    }
+    /** Stores the country. */
+    public void setCountry(final CountrySbs country) {
+        this.country = country;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java
new file mode 100644
index 000000000..98b573102
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java
@@ -0,0 +1,122 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+/**
+ * For the association of the country to the datasource The association is computed only for datasource of specific type
+ * or having whitelisted ids The country is registered in the Organization associated to the Datasource, so the relation
+ * provides between Datasource and Organization is exploited to get the country for the datasource
+ */
+public class PrepareDatasourceCountryAssociation {
+
+ private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareDatasourceCountryAssociation.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath {}: ", outputPath);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ removeOutputDir(spark, outputPath);
+ prepareDatasourceCountryAssociation(
+ spark,
+ Arrays.asList(parser.get("whitelist").split(";")),
+ Arrays.asList(parser.get("allowedtypes").split(";")),
+ inputPath,
+ outputPath);
+ });
+ }
+
+ private static void prepareDatasourceCountryAssociation(
+ SparkSession spark,
+ List whitelist,
+ List allowedtypes,
+ String inputPath,
+ String outputPath) {
+ String whitelisted = "";
+ for (String i : whitelist) {
+ whitelisted += " OR id = '" + i + "'";
+ }
+
+ Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
+ Dataset relation = readPath(spark, inputPath + "/relation", Relation.class);
+ Dataset organization = readPath(spark, inputPath + "/organization", Organization.class);
+
+ datasource.createOrReplaceTempView("datasource");
+ relation.createOrReplaceTempView("relation");
+ organization.createOrReplaceTempView("organization");
+
+ String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
+ + "FROM ( SELECT id "
+ + " FROM datasource "
+ + " WHERE (datainfo.deletedbyinference = false "
+ + whitelisted
+ + ") "
+ + getConstraintList("datasourcetype.classid = '", allowedtypes)
+ + ") d "
+ + "JOIN ( SELECT source, target "
+ + " FROM relation "
+ + " WHERE relclass = '"
+ + ModelConstants.IS_PROVIDED_BY
+ + "' "
+ + " AND datainfo.deletedbyinference = false ) rel "
+ + "ON d.id = rel.source "
+ + "JOIN (SELECT id, country "
+ + " FROM organization "
+ + " WHERE datainfo.deletedbyinference = false "
+ + " AND length(country.classid) > 0) o "
+ + "ON o.id = rel.target";
+
+ spark
+ .sql(query)
+ .as(Encoders.bean(DatasourceCountry.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputPath);
+ }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
new file mode 100644
index 000000000..34b376413
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
@@ -0,0 +1,98 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+
+public class PrepareResultCountrySet {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
+
+ private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
+ + "FROM ( SELECT id, country "
+ + "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
+ + "UNION ALL "
+ + "SELECT id, country FROM datasource_country "
+ + "JOIN cfhb ON hb = dataSourceId ) tmp "
+ + "GROUP BY id";
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ PrepareResultCountrySet.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String datasourcecountrypath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", datasourcecountrypath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ getPotentialResultToUpdate(
+ spark,
+ inputPath,
+ outputPath,
+ datasourcecountrypath,
+ resultClazz);
+ });
+ }
+
+ private static void getPotentialResultToUpdate(
+ SparkSession spark,
+ String inputPath,
+ String outputPath,
+ String datasourcecountrypath,
+ Class resultClazz) {
+
+ Dataset result = readPath(spark, inputPath, resultClazz);
+ result.createOrReplaceTempView("result");
+ // log.info("number of results: {}", result.count());
+ createCfHbforResult(spark);
+
+ Dataset datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
+
+ datasource_country.createOrReplaceTempView("datasource_country");
+ // log.info("datasource_country number : {}", datasource_country.count());
+
+ spark
+ .sql(RESULT_COUNTRYSET_QUERY)
+ .as(Encoders.bean(ResultCountrySet.class))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Append)
+ .json(outputPath);
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java
new file mode 100644
index 000000000..8c29424f2
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java
@@ -0,0 +1,26 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultCountrySet implements Serializable { // serializable bean: a result id with its countries
+    private String resultId;
+    private ArrayList<CountrySbs> countrySet; // element type declared so bean encoders and callers see CountrySbs
+    /** Returns the result id (null until set). */
+    public String getResultId() {
+        return resultId;
+    }
+    /** Stores the result id. */
+    public void setResultId(String resultId) {
+        this.resultId = resultId;
+    }
+    /** Returns the countries associated with the result (null until set). */
+    public ArrayList<CountrySbs> getCountrySet() {
+        return countrySet;
+    }
+    /** Stores the countries associated with the result. */
+    public void setCountrySet(ArrayList<CountrySbs> countrySet) {
+        this.countrySet = countrySet;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
new file mode 100644
index 000000000..974b3a3b1
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
@@ -0,0 +1,135 @@
+
+package eu.dnetlib.dhp.countrypropagation;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Country;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import scala.Tuple2;
+
+public class SparkCountryPropagationJob {
+
+ private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class);
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ public static void main(String[] args) throws Exception {
+
+ String jsonConfiguration = IOUtils
+ .toString(
+ SparkCountryPropagationJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String sourcePath = parser.get("sourcePath");
+ log.info("sourcePath: {}", sourcePath);
+
+ String preparedInfoPath = parser.get("preparedInfoPath");
+ log.info("preparedInfoPath: {}", preparedInfoPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final Boolean saveGraph = Optional
+ .ofNullable(parser.get("saveGraph"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ log.info("saveGraph: {}", saveGraph);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ removeOutputDir(spark, outputPath);
+ execPropagation(
+ spark,
+ sourcePath,
+ preparedInfoPath,
+ outputPath,
+ resultClazz,
+ saveGraph);
+ });
+ }
+
+ private static void execPropagation(
+ SparkSession spark,
+ String sourcePath,
+ String preparedInfoPath,
+ String outputPath,
+ Class resultClazz,
+ boolean saveGraph) {
+
+ if (saveGraph) {
+ // updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
+ log.info("Reading Graph table from: {}", sourcePath);
+ Dataset res = readPath(spark, sourcePath, resultClazz);
+
+ log.info("Reading prepared info: {}", preparedInfoPath);
+ Dataset prepared = spark
+ .read()
+ .json(preparedInfoPath)
+ .as(Encoders.bean(ResultCountrySet.class));
+
+ res
+ .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
+ .map(getCountryMergeFn(), Encoders.bean(resultClazz))
+ .write()
+ .option("compression", "gzip")
+ .mode(SaveMode.Overwrite)
+ .json(outputPath);
+ }
+ }
+
+ private static MapFunction, R> getCountryMergeFn() {
+ return (MapFunction, R>) t -> {
+ Optional.ofNullable(t._2()).ifPresent(r -> {
+ t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
+ });
+ return t._1();
+ };
+ }
+
+ private static List merge(List c1, List c2) {
+ HashSet countries = c1
+ .stream()
+ .map(c -> c.getClassid())
+ .collect(Collectors.toCollection(HashSet::new));
+
+ return c2
+ .stream()
+ .filter(c -> !countries.contains(c.getClassid()))
+ .map(c -> getCountry(c.getClassid(), c.getClassname()))
+ .collect(Collectors.toList());
+ }
+
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java
new file mode 100644
index 000000000..a5fcab360
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+public class AutoritativeAuthor {
+    // Plain bean holding the name, surname, full name and orcid of an author.
+    private String name;
+    private String surname;
+    private String fullname;
+    private String orcid;
+
+    /** Returns the name (null until set). */
+    public String getName() {
+        return this.name;
+    }
+    /** Stores the name. */
+    public void setName(final String name) {
+        this.name = name;
+    }
+    /** Returns the surname (null until set). */
+    public String getSurname() {
+        return this.surname;
+    }
+    /** Stores the surname. */
+    public void setSurname(final String surname) {
+        this.surname = surname;
+    }
+    /** Returns the full name (null until set). */
+    public String getFullname() {
+        return this.fullname;
+    }
+    /** Stores the full name. */
+    public void setFullname(final String fullname) {
+        this.fullname = fullname;
+    }
+    /** Returns the orcid (null until set). */
+    public String getOrcid() {
+        return this.orcid;
+    }
+    /** Stores the orcid. */
+    public void setOrcid(final String orcid) {
+        this.orcid = orcid;
+    }
+}
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
new file mode 100644
index 000000000..b15f813ac
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java
@@ -0,0 +1,123 @@
+
+package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+
+import static eu.dnetlib.dhp.PropagationConstant.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class PrepareResultOrcidAssociationStep1 {
+ private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);
+
+ public static void main(String[] args) throws Exception {
+ String jsonConf = IOUtils
+ .toString(
+ PrepareResultOrcidAssociationStep1.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
+ parser.parseArgument(args);
+
+ Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+ String inputPath = parser.get("sourcePath");
+ log.info("inputPath: {}", inputPath);
+
+ final String outputPath = parser.get("outputPath");
+ log.info("outputPath: {}", outputPath);
+
+ final String resultClassName = parser.get("resultTableName");
+ log.info("resultTableName: {}", resultClassName);
+
+ final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+ log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+ final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
+ log.info("resultType: {}", resultType);
+
+ Class extends Result> resultClazz = (Class extends Result>) Class.forName(resultClassName);
+
+ SparkConf conf = new SparkConf();
+ conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
+
+ String inputRelationPath = inputPath + "/relation";
+ log.info("inputRelationPath: {}", inputRelationPath);
+
+ String inputResultPath = inputPath + "/" + resultType;
+ log.info("inputResultPath: {}", inputResultPath);
+
+ String outputResultPath = outputPath + "/" + resultType;
+ log.info("outputResultPath: {}", outputResultPath);
+
+ runWithSparkHiveSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ removeOutputDir(spark, outputPath);
+ prepareInfo(
+ spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
+ });
+ }
+
+ private static void prepareInfo(
+ SparkSession spark,
+ String inputRelationPath,
+ String inputResultPath,
+ String outputResultPath,
+ Class resultClazz,
+ List