diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index c1d6e1b5bb..c7cb11b085 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -83,6 +83,10 @@ com.jayway.jsonpath json-path + + org.postgresql + postgresql + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java similarity index 95% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java index 94f17aad52..cedc9bd4d1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common; import java.io.Closeable; import java.io.IOException; @@ -14,7 +14,7 @@ public class DbClient implements Closeable { private static final Log log = LogFactory.getLog(DbClient.class); - private final Connection connection; + private Connection connection; public DbClient(final String address, final String login, final String password) { diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index cdde37fd4a..fc85b1ac14 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.schema.oaf.*; public class ModelSupport { /** Defines the mapping between the actual entity type and the main entity type */ - private static final Map entityMapping = Maps.newHashMap(); + private static Map entityMapping = Maps.newHashMap(); static { entityMapping.put(EntityType.publication, MainEntityType.result); @@ -53,6 +53,232 @@ public class ModelSupport { oafTypes.put("relation", Relation.class); } + public static final Map entityIdPrefix = Maps.newHashMap(); + + static { + entityIdPrefix.put("datasource", "10"); + entityIdPrefix.put("organization", "20"); + entityIdPrefix.put("project", "40"); + entityIdPrefix.put("result", "50"); + } + + public static final Map relationInverseMap = Maps.newHashMap(); + + static { + relationInverseMap + .put( + "personResult_authorship_isAuthorOf", new RelationInverse() + .setRelation("isAuthorOf") + .setInverse("hasAuthor") + .setRelType("personResult") + .setSubReltype("authorship")); + relationInverseMap + .put( + "personResult_authorship_hasAuthor", new RelationInverse() + .setInverse("isAuthorOf") + .setRelation("hasAuthor") + .setRelType("personResult") + .setSubReltype("authorship")); + relationInverseMap + .put( + "projectOrganization_participation_isParticipant", new RelationInverse() + .setRelation("isParticipant") + .setInverse("hasParticipant") + .setRelType("projectOrganization") + .setSubReltype("participation")); + relationInverseMap + .put( + "projectOrganization_participation_hasParticipant", new RelationInverse() + .setInverse("isParticipant") + .setRelation("hasParticipant") + .setRelType("projectOrganization") + .setSubReltype("participation")); + relationInverseMap + .put( + "resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse() + .setRelation("hasAuthorInstitution") + .setInverse("isAuthorInstitutionOf") + .setRelType("resultOrganization") + .setSubReltype("affiliation")); + relationInverseMap + .put( + 
"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse() + .setInverse("hasAuthorInstitution") + .setRelation("isAuthorInstitutionOf") + .setRelType("resultOrganization") + .setSubReltype("affiliation")); + relationInverseMap + .put( + "organizationOrganization_dedup_merges", new RelationInverse() + .setRelation("merges") + .setInverse("isMergedIn") + .setRelType("organizationOrganization") + .setSubReltype("dedup")); + relationInverseMap + .put( + "organizationOrganization_dedup_isMergedIn", new RelationInverse() + .setInverse("merges") + .setRelation("isMergedIn") + .setRelType("organizationOrganization") + .setSubReltype("dedup")); + relationInverseMap + .put( + "organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse() + .setInverse("isSimilarTo") + .setRelation("isSimilarTo") + .setRelType("organizationOrganization") + .setSubReltype("dedupSimilarity")); + + relationInverseMap + .put( + "resultProject_outcome_isProducedBy", new RelationInverse() + .setRelation("isProducedBy") + .setInverse("produces") + .setRelType("resultProject") + .setSubReltype("outcome")); + relationInverseMap + .put( + "resultProject_outcome_produces", new RelationInverse() + .setInverse("isProducedBy") + .setRelation("produces") + .setRelType("resultProject") + .setSubReltype("outcome")); + relationInverseMap + .put( + "projectPerson_contactPerson_isContact", new RelationInverse() + .setRelation("isContact") + .setInverse("hasContact") + .setRelType("projectPerson") + .setSubReltype("contactPerson")); + relationInverseMap + .put( + "projectPerson_contactPerson_hasContact", new RelationInverse() + .setInverse("isContact") + .setRelation("hasContact") + .setRelType("personPerson") + .setSubReltype("coAuthorship")); + relationInverseMap + .put( + "personPerson_coAuthorship_isCoauthorOf", new RelationInverse() + .setInverse("isCoAuthorOf") + .setRelation("isCoAuthorOf") + .setRelType("personPerson") + .setSubReltype("coAuthorship")); + relationInverseMap + .put( + "personPerson_dedup_merges", new RelationInverse() + .setInverse("isMergedIn") + .setRelation("merges") + .setRelType("personPerson") + .setSubReltype("dedup")); + relationInverseMap + .put( + "personPerson_dedup_isMergedIn", new RelationInverse() + .setInverse("merges") + .setRelation("isMergedIn") + .setRelType("personPerson") + .setSubReltype("dedup")); + relationInverseMap + .put( + "personPerson_dedupSimilarity_isSimilarTo", new RelationInverse() + .setInverse("isSimilarTo") + .setRelation("isSimilarTo") + .setRelType("personPerson") + .setSubReltype("dedupSimilarity")); + relationInverseMap + .put( + "datasourceOrganization_provision_isProvidedBy", new RelationInverse() + .setInverse("provides") + .setRelation("isProvidedBy") + .setRelType("datasourceOrganization") + .setSubReltype("provision")); + relationInverseMap + .put( + "datasourceOrganization_provision_provides", new RelationInverse() + .setInverse("isProvidedBy") + .setRelation("provides") + .setRelType("datasourceOrganization") + .setSubReltype("provision")); + relationInverseMap + .put( + "resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse() + .setInverse("isAmongTopNSimilarDocuments") + .setRelation("hasAmongTopNSimilarDocuments") + .setRelType("resultResult") + .setSubReltype("similarity")); + relationInverseMap + .put( + "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() + .setInverse("hasAmongTopNSimilarDocuments") + .setRelation("isAmongTopNSimilarDocuments") + .setRelType("resultResult") + 
.setSubReltype("similarity")); + relationInverseMap + .put( + "resultResult_relationship_isRelatedTo", new RelationInverse() + .setInverse("isRelatedTo") + .setRelation("isRelatedTo") + .setRelType("resultResult") + .setSubReltype("relationship")); + relationInverseMap + .put( + "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() + .setInverse("hasAmongTopNSimilarDocuments") + .setRelation("isAmongTopNSimilarDocuments") + .setRelType("resultResult") + .setSubReltype("similarity")); + relationInverseMap + .put( + "resultResult_supplement_isSupplementTo", new RelationInverse() + .setInverse("isSupplementedBy") + .setRelation("isSupplementTo") + .setRelType("resultResult") + .setSubReltype("supplement")); + relationInverseMap + .put( + "resultResult_supplement_isSupplementedBy", new RelationInverse() + .setInverse("isSupplementTo") + .setRelation("isSupplementedBy") + .setRelType("resultResult") + .setSubReltype("supplement")); + relationInverseMap + .put( + "resultResult_part_isPartOf", new RelationInverse() + .setInverse("hasPart") + .setRelation("isPartOf") + .setRelType("resultResult") + .setSubReltype("part")); + relationInverseMap + .put( + "resultResult_part_hasPart", new RelationInverse() + .setInverse("isPartOf") + .setRelation("hasPart") + .setRelType("resultResult") + .setSubReltype("part")); + relationInverseMap + .put( + "resultResult_dedup_merges", new RelationInverse() + .setInverse("isMergedIn") + .setRelation("merges") + .setRelType("resultResult") + .setSubReltype("dedup")); + relationInverseMap + .put( + "resultResult_dedup_isMergedIn", new RelationInverse() + .setInverse("merges") + .setRelation("isMergedIn") + .setRelType("resultResult") + .setSubReltype("dedup")); + relationInverseMap + .put( + "resultResult_dedupSimilarity_isSimilarTo", new RelationInverse() + .setInverse("isSimilarTo") + .setRelation("isSimilarTo") + .setRelType("resultResult") + .setSubReltype("dedupSimilarity")); + + } + private static final String schemeTemplate = "dnet:%s_%s_relations"; private ModelSupport() { diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java new file mode 100644 index 0000000000..4757c637ee --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.schema.common; + +public class RelationInverse { + private String relation; + private String inverse; + private String relType; + private String subReltype; + + public String getRelType() { + return relType; + } + + public RelationInverse setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubReltype() { + return subReltype; + } + + public RelationInverse setSubReltype(String subReltype) { + this.subReltype = subReltype; + return this; + } + + public String getRelation() { + return relation; + } + + public RelationInverse setRelation(String relation) { + this.relation = relation; + return this; + } + + public String getInverse() { + return inverse; + } + + public RelationInverse setInverse(String inverse) { + this.inverse = inverse; + return this; + } + +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java index b9bd4c5f07..231fb1e606 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java @@ 
-2,8 +2,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; -import java.util.List; -import java.util.Objects; +import java.util.*; public class Author implements Serializable { @@ -86,4 +85,5 @@ public class Author implements Serializable { public int hashCode() { return Objects.hash(fullname, name, surname, rank, pid, affiliation); } + } diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml new file mode 100644 index 0000000000..37abc22f61 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -0,0 +1,36 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.1-SNAPSHOT + + 4.0.0 + + dhp-blacklist + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java new file mode 100644 index 0000000000..0ef59e8c2b --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java @@ -0,0 +1,87 @@ + +package eu.dnetlib.dhp.blacklist; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareMergedRelationJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareMergedRelationJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareMergedRelationJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {} ", outputPath); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + selectMergesRelations( + spark, + inputPath, + outputPath); + }); + } + + private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) { + + Dataset relation = readRelations(spark, inputPath); + + relation + .filter("relclass = 'merges' and datainfo.deletedbyinference=false") + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + public static 
org.apache.spark.sql.Dataset readRelations( + SparkSession spark, String inputPath) { + return spark + .read() + .textFile(inputPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class), + Encoders.bean(Relation.class)); + } +} diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java new file mode 100644 index 0000000000..2caa66db42 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java @@ -0,0 +1,141 @@ + +package eu.dnetlib.dhp.blacklist; + +import java.io.BufferedWriter; +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.sql.ResultSet; +import java.util.Arrays; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.DbClient; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.common.RelationInverse; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class ReadBlacklistFromDB implements Closeable { + + private final DbClient dbClient; + private static final Log log = LogFactory.getLog(ReadBlacklistFromDB.class); + private final Configuration conf; + private final BufferedWriter writer; + private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private final static String query = "SELECT source_type, unnest(original_source_objects) as source, " + + "target_type, unnest(original_target_objects) as target, " + + "relationship FROM blacklist WHERE status = 'ACCEPTED'"; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ReadBlacklistFromDB.class + .getResourceAsStream( + "/eu/dnetlib/dhp/blacklist/blacklist_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + final String hdfsPath = parser.get("hdfsPath") + "/blacklist"; + final String hdfsNameNode = parser.get("hdfsNameNode"); + + try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser, + dbPassword)) { + + log.info("Processing blacklist..."); + rbl.execute(query, rbl::processBlacklistEntry); + + } + } + + public void execute(final String sql, final Function> producer) throws Exception { + + final Consumer consumer = rs -> producer.apply(rs).forEach(r -> writeRelation(r)); + + dbClient.processResults(sql, consumer); + } + + public List processBlacklistEntry(ResultSet rs) { + try { + Relation direct = new Relation(); + Relation inverse = new Relation(); + + String source_prefix = ModelSupport.entityIdPrefix.get(rs.getString("source_type")); + String target_prefix = ModelSupport.entityIdPrefix.get(rs.getString("target_type")); + + String source_direct = source_prefix + "|" + 
rs.getString("source"); + direct.setSource(source_direct); + inverse.setTarget(source_direct); + + String target_direct = target_prefix + "|" + rs.getString("target"); + direct.setTarget(target_direct); + inverse.setSource(target_direct); + + String encoding = rs.getString("relationship"); + RelationInverse ri = ModelSupport.relationInverseMap.get(encoding); + direct.setRelClass(ri.getRelation()); + inverse.setRelClass(ri.getInverse()); + direct.setRelType(ri.getRelType()); + inverse.setRelType(ri.getRelType()); + direct.setSubRelType(ri.getSubReltype()); + inverse.setSubRelType(ri.getSubReltype()); + + return Arrays.asList(direct, inverse); + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() throws IOException { + dbClient.close(); + writer.close(); + } + + public ReadBlacklistFromDB( + final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword) + throws Exception { + + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + this.conf = new Configuration(); + this.conf.set("fs.defaultFS", hdfsNameNode); + FileSystem fileSystem = FileSystem.get(this.conf); + Path hdfsWritePath = new Path(hdfsPath); + FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { + fsDataOutputStream = fileSystem.append(hdfsWritePath); + } else { + fsDataOutputStream = fileSystem.create(hdfsWritePath); + } + + this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + } + + protected void writeRelation(final Relation r) { + try { + writer.write(OBJECT_MAPPER.writeValueAsString(r)); + writer.newLine(); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java new file mode 100644 index 0000000000..86587bfc90 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -0,0 +1,147 @@ + +package eu.dnetlib.dhp.blacklist; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Objects; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; + +public class SparkRemoveBlacklistedRelationJob { + private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkRemoveBlacklistedRelationJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + 
.map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + final String blacklistPath = parser.get("hdfsPath"); + log.info("blacklistPath {}: ", blacklistPath); + + final String mergesPath = parser.get("mergesPath"); + log.info("mergesPath {}: ", mergesPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeBlacklistedRelations( + spark, + blacklistPath, + inputPath, + outputPath, + mergesPath); + }); + + } + + private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath, + String outputPath, String mergesPath) { + Dataset blackListed = readRelations(spark, blacklistPath + "/blacklist"); + Dataset inputRelation = readRelations(spark, inputPath); + Dataset mergesRelation = readRelations(spark, mergesPath); + + log.info("InputRelationCount: {}", inputRelation.count()); + + Dataset dedupSource = blackListed + .joinWith( + mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), + "left_outer") + .map((MapFunction, Relation>) c -> { + Optional + .ofNullable(c._2()) + .ifPresent(mr -> c._1().setSource(mr.getSource())); + return c._1(); + }, Encoders.bean(Relation.class)); + + Dataset dedupBL = dedupSource + .joinWith( + mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), + "left_outer") + .map((MapFunction, Relation>) c -> { + Optional + .ofNullable(c._2()) + .ifPresent(mr -> c._1().setTarget(mr.getSource())); + return c._1(); + }, Encoders.bean(Relation.class)); + + dedupBL + .write() + .mode(SaveMode.Overwrite) + .json(blacklistPath + "/deduped"); + + inputRelation + .joinWith( + dedupBL, (inputRelation + .col("source") + .equalTo(dedupBL.col("source")) + .and( + inputRelation + .col("target") + .equalTo(dedupBL.col("target")))), + "left_outer") + .map((MapFunction, Relation>) c -> { + Relation ir = c._1(); + Optional obl = Optional.ofNullable(c._2()); + if (obl.isPresent()) { + if (ir.equals(obl.get())) { + return null; + } + } + return ir; + }, Encoders.bean(Relation.class)) + .filter(Objects::nonNull) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + public static org.apache.spark.sql.Dataset readRelations( + SparkSession spark, String inputPath) { + return spark + .read() + .textFile(inputPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class), + Encoders.bean(Relation.class)); + } + +} diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json new file mode 100644 index 0000000000..9a2eadaa7d --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/blacklist_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where storing the sequential file", + "paramRequired": true + }, + { + "paramName": "nn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the name node on hdfs", + "paramRequired": true + }, + { + "paramName": "pgurl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: 
jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "pguser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": false + }, + { + "paramName": "pgpasswd", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json new file mode 100644 index 0000000000..4a3d21f4df --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the path to the graph used to remove the relations ", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path where to store the temporary result ", + "paramRequired": true + }, + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed", + "paramRequired": false + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml new file mode 100644 index 0000000000..fe82ae1940 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/config-default.xml @@ -0,0 +1,54 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml new file mode 100644 index 0000000000..1538318c14 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml @@ -0,0 +1,195 @@ + + + + postgresURL + the url of the postgress server to query + + + postgresUser + the username to access the postgres db + + + postgresPassword + the postgres password + + + sourcePath + the source path + + + outputPath + the graph output path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/publication + ${nameNode}/${outputPath}/publication + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/dataset + 
${nameNode}/${outputPath}/dataset + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/otherresearchproduct + ${nameNode}/${outputPath}/otherresearchproduct + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/software + ${nameNode}/${outputPath}/software + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB + --hdfsPath${workingDir}/blacklist + --hdfsNameNode${nameNode} + --postgresUrl${postgresURL} + --postgresUser${postgresUser} + --postgresPassword${postgresPassword} + + + + + + + + yarn + cluster + PrepareMergedRelation + eu.dnetlib.dhp.blacklist.PrepareMergedRelationJob + dhp-blacklist-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/relation + --outputPath${workingDir}/mergesRelation + --hive_metastore_uris${hive_metastore_uris} + + + + + + + + yarn + cluster + ApplyBlacklist + eu.dnetlib.dhp.blacklist.SparkRemoveBlacklistedRelationJob + dhp-blacklist-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/relation + --outputPath${outputPath}/relation + --hdfsPath${workingDir}/blacklist + --mergesPath${workingDir}/mergesRelation + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json new file mode 100644 index 0000000000..91a87b8b5b --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json @@ -0,0 +1,33 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where storing the sequential file", + "paramRequired": true + }, + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the path to the graph used to remove the relations ", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path where to store the temporary result ", + "paramRequired": true + }, + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed", + "paramRequired": false + }, + { + "paramName": "m", + "paramLongName": "mergesPath", + "paramDescription": "true if the spark session is managed", + "paramRequired": true + + } +] \ No newline at end of 
file diff --git a/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java new file mode 100644 index 0000000000..bbfd15674e --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlackListTest.java @@ -0,0 +1,167 @@ + +package eu.dnetlib.dhp.blacklist; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class BlackListTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final ClassLoader cl = eu.dnetlib.dhp.blacklist.BlackListTest.class.getClassLoader(); + + private static SparkSession spark; + + private static Path workingDir; + private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.blacklist.BlackListTest.class); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(BlackListTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + /* + * String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String outputPath = + * parser.get("outputPath"); log.info("outputPath {}: ", outputPath); final String blacklistPath = + * parser.get("hdfsPath"); log.info("blacklistPath {}: ", blacklistPath); final String mergesPath = + * parser.get("mergesPath"); log.info("mergesPath {}: ", mergesPath); + */ + @Test + public void noRemoveTest() throws Exception { + SparkRemoveBlacklistedRelationJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsNoRemoval").getPath(), + "-outputPath", + workingDir.toString() + "/relation", + "-hdfsPath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(), + "-mergesPath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(), + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(13, tmp.count()); + + } + + 
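	/*
	 * Illustrative aside, not part of the patch above: each test in this class re-reads the job output
	 * with the same boilerplate (JavaSparkContext + textFile + Jackson). A helper along these lines could
	 * factor that out; the method name is an assumption, everything it uses is already imported in this
	 * test class.
	 */
	private static JavaRDD<Relation> readOutput(SparkSession session, String path) {
		// parse the json-lines relation dump written by SparkRemoveBlacklistedRelationJob
		final JavaSparkContext jsc = new JavaSparkContext(session.sparkContext());
		return jsc
			.textFile(path)
			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
	}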
@Test + public void removeNoMergeMatchTest() throws Exception { + SparkRemoveBlacklistedRelationJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsOneRemoval").getPath(), + "-outputPath", + workingDir.toString() + "/relation", + "-hdfsPath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(), + "-mergesPath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(), + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(12, tmp.count()); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class)); + + Assertions + .assertEquals( + 0, verificationDataset + .filter( + "source = '40|corda__h2020::5161f53ab205d803c36b4c888fe7deef' and " + + "target = '20|dedup_wf_001::157af406bc653aa4d9749318b644de43'") + .count()); + + Assertions.assertEquals(0, verificationDataset.filter("relClass = 'hasParticipant'").count()); + } + + @Test + public void removeMergeMatchTest() throws Exception { + SparkRemoveBlacklistedRelationJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch").getPath(), + "-outputPath", + workingDir.toString() + "/relation", + "-hdfsPath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(), + "-mergesPath", + getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRelOneMerge").getPath(), + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(12, tmp.count()); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class)); + + Assertions.assertEquals(12, verificationDataset.filter("relClass = 'isProvidedBy'").count()); + + } +} diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist new file mode 100644 index 0000000000..ea95130af8 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/blacklist/blacklist @@ -0,0 +1,20 @@ +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"hasParticipant","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"isParticipant","source":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43","target":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"} 
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::a727cc288016db7132ef9a799aa83350","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::a727cc288016db7132ef9a799aa83350"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::062cf091d5c7a7d730001c34177042e3","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::062cf091d5c7a7d730001c34177042e3"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::1b172ab34639e7935e2357119cf20830","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|od_______908::1b172ab34639e7935e2357119cf20830"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e"} 
+{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021"} \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json new file mode 100644 index 0000000000..8f0d296d6e --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRel/mergesRel.json @@ -0,0 +1,14 @@ +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______177::67c1385662f2fa0bde310bec15427646"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"} \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json new file mode 100644 index 0000000000..3d74ffa6ed --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/mergesRelOneMerge/mergesRel.json @@ -0,0 +1,14 @@ +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"} \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json new file mode 100644 index 0000000000..761cba4786 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch/relations.json @@ -0,0 +1,13 @@ +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProducedBy","relType":"resultProject","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"outcome","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"} \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json new file mode 100644 index 0000000000..a79d1d8ebf --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsNoRemoval/relations.json @@ -0,0 +1,13 @@ 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::018cb61ed43c01704decc66183ce5d60","subRelType":"provision","target":"20|dedup_wf_001::b9fff055ce5efacecbe4ef918c127f86"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"} \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json new file mode 100644 index 0000000000..f809acfeb5 --- /dev/null +++ b/dhp-workflows/dhp-blacklist/src/test/resources/eu/dnetlib/dhp/blacklist/relationsOneRemoval/relationsOneRemove.json @@ -0,0 +1,13 @@ +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","subRelType":"participation","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml new file mode 100644 index 0000000000..fe9833e3ea --- /dev/null +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -0,0 +1,64 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.1-SNAPSHOT + + 4.0.0 + + dhp-enrichment + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + org.apache.spark + spark-hive_2.11 + test + + + + dom4j + dom4j + + + jaxen + jaxen + + + com.jayway.jsonpath + json-path + + + + io.github.classgraph + classgraph + 4.8.71 + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java new file mode 100644 index 0000000000..8d2fede82e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -0,0 +1,169 @@ + +package eu.dnetlib.dhp; + +import java.util.List; +import java.util.Optional; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.oaf.*; + +public class PropagationConstant { + public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional"; + + public static final String PROPAGATION_DATA_INFO_TYPE = "propagation"; + + public static final String TRUE = "true"; + + public static final String DNET_COUNTRY_SCHEMA = "dnet:countries"; + public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; + public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; + + public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos"; + public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = 
"Propagation of country to result collected from datasources of type institutional repositories"; + + public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo"; + public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository"; + + public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel"; + public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation"; + + public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel"; + public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation"; + + public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization"; + public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization"; + + public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result"; + public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations"; + + public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy"; + + public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization"; + public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation"; + public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf"; + public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution"; + + public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult"; + + public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject"; + public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome"; + public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy"; + public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces"; + + public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges"; + + public static final String PROPAGATION_AUTHOR_PID = "ORCID"; + + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb " + + + "from result r " + + "lateral view explode(instance) i as inst " + + "where r.datainfo.deletedbyinference=false"; + + public static Country getCountry(String classid, String classname) { + Country nc = new Country(); + nc.setClassid(classid); + nc.setClassname(classname); + nc.setSchemename(DNET_COUNTRY_SCHEMA); + nc.setSchemeid(DNET_COUNTRY_SCHEMA); + nc + .setDataInfo( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_COUNTRY_INSTREPO_CLASS_ID, + PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME)); + return nc; + } + + public static DataInfo getDataInfo( + String inference_provenance, String inference_class_id, String inference_class_name) { + DataInfo di = new DataInfo(); + di.setInferred(true); + di.setDeletedbyinference(false); + di.setTrust("0.85"); + di.setInferenceprovenance(inference_provenance); + di.setProvenanceaction(getQualifier(inference_class_id, 
inference_class_name)); + return di; + } + + public static Qualifier getQualifier(String inference_class_id, String inference_class_name) { + Qualifier pa = new Qualifier(); + pa.setClassid(inference_class_id); + pa.setClassname(inference_class_name); + pa.setSchemeid(DNET_SCHEMA_ID); + pa.setSchemename(DNET_SCHEMA_NAME); + return pa; + } + + public static Relation getRelation( + String source, + String target, + String rel_class, + String rel_type, + String subrel_type, + String inference_provenance, + String inference_class_id, + String inference_class_name) { + Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setRelClass(rel_class); + r.setRelType(rel_type); + r.setSubRelType(subrel_type); + r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name)); + return r; + } + + public static String getConstraintList(String text, List constraints) { + String ret = " and (" + text + constraints.get(0) + "'"; + for (int i = 1; i < constraints.size(); i++) { + ret += " OR " + text + constraints.get(i) + "'"; + } + ret += ")"; + return ret; + } + + public static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) { + return Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + } + + public static Boolean isTest(ArgumentApplicationParser parser) { + return Optional + .ofNullable(parser.get("isTest")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + } + + public static void createCfHbforResult(SparkSession spark) { + org.apache.spark.sql.Dataset cfhb = spark.sql(cfHbforResultQuery); + cfhb.createOrReplaceTempView("cfhb"); + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java new file mode 100644 index 0000000000..75d85e2bad --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java @@ -0,0 +1,120 @@ + +package eu.dnetlib.dhp.bulktag; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.bulktag.community.*; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class SparkBulkTagJob { + + private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob.class); + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkBulkTagJob.class + .getResourceAsStream( + 
"/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + Boolean isTest = Optional + .ofNullable(parser.get("isTest")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + log.info("isTest: {} ", isTest); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class); + log.info("pathMap: {}", new Gson().toJson(protoMappingParams)); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + CommunityConfiguration cc; + + String taggingConf = parser.get("taggingConf"); + + if (isTest) { + cc = CommunityConfigurationFactory.newInstance(taggingConf); + } else { + cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookUpUrl")); + } + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); + }); + } + + private static void execBulkTag( + SparkSession spark, + String inputPath, + String outputPath, + ProtoMap protoMappingParams, + Class resultClazz, + CommunityConfiguration communityConfiguration) { + + ResultTagger resultTagger = new ResultTagger(); + readPath(spark, inputPath, resultClazz) + .map( + (MapFunction) value -> resultTagger + .enrichContextCriteria( + value, communityConfiguration, protoMappingParams), + Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java new file mode 100644 index 0000000000..0f45d3beb7 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java @@ -0,0 +1,65 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.gson.Gson; + +/** Created by miriam on 01/08/2018. 
*/ +public class Community implements Serializable { + + private static final Log log = LogFactory.getLog(Community.class); + + private String id; + private List subjects = new ArrayList<>(); + private List providers = new ArrayList<>(); + private List zenodoCommunities = new ArrayList<>(); + + public String toJson() { + final Gson g = new Gson(); + return g.toJson(this); + } + + public boolean isValid() { + return !getSubjects().isEmpty() + || !getProviders().isEmpty() + || !getZenodoCommunities().isEmpty(); + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getSubjects() { + return subjects; + } + + public void setSubjects(List subjects) { + this.subjects = subjects; + } + + public List getProviders() { + return providers; + } + + public void setProviders(List providers) { + this.providers = providers; + } + + public List getZenodoCommunities() { + return zenodoCommunities; + } + + public void setZenodoCommunities(List zenodoCommunities) { + this.zenodoCommunities = zenodoCommunities; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java new file mode 100644 index 0000000000..29ddde15f8 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java @@ -0,0 +1,196 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; + +import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter; +import eu.dnetlib.dhp.bulktag.criteria.Selection; + +/** Created by miriam on 02/08/2018. 
*/ +public class CommunityConfiguration implements Serializable { + + private static final Log log = LogFactory.getLog(CommunityConfiguration.class); + + private Map communities; + + // map subject -> communityid + private Map>> subjectMap = new HashMap<>(); + // map datasourceid -> communityid + private Map>> datasourceMap = new HashMap<>(); + // map zenodocommunityid -> communityid + private Map>> zenodocommunityMap = new HashMap<>(); + + public Map>> getSubjectMap() { + return subjectMap; + } + + public void setSubjectMap(Map>> subjectMap) { + this.subjectMap = subjectMap; + } + + public Map>> getDatasourceMap() { + return datasourceMap; + } + + public void setDatasourceMap( + Map>> datasourceMap) { + this.datasourceMap = datasourceMap; + } + + public Map>> getZenodocommunityMap() { + return zenodocommunityMap; + } + + public void setZenodocommunityMap( + Map>> zenodocommunityMap) { + this.zenodocommunityMap = zenodocommunityMap; + } + + CommunityConfiguration(final Map communities) { + this.communities = communities; + init(); + } + + void init() { + + if (subjectMap == null) { + subjectMap = Maps.newHashMap(); + } + if (datasourceMap == null) { + datasourceMap = Maps.newHashMap(); + } + if (zenodocommunityMap == null) { + zenodocommunityMap = Maps.newHashMap(); + } + + for (Community c : getCommunities().values()) { + // get subjects + final String id = c.getId(); + for (String sbj : c.getSubjects()) { + Pair p = new Pair<>(id, new SelectionConstraints()); + add(sbj.toLowerCase().trim(), p, subjectMap); + } + // get datasources + for (Provider d : c.getProviders()) { + + add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap); + } + // get zenodo communities + for (ZenodoCommunity zc : c.getZenodoCommunities()) { + add( + zc.getZenodoCommunityId(), + new Pair<>(id, zc.getSelCriteria()), + zenodocommunityMap); + } + } + } + + private void add( + String key, + Pair value, + Map>> map) { + List> values = map.get(key); + + if (values == null) { + values = new ArrayList<>(); + map.put(key, values); + } + values.add(value); + } + + public List> getCommunityForSubject(String sbj) { + return subjectMap.get(sbj); + } + + public List> getCommunityForDatasource(String dts) { + return datasourceMap.get(dts); + } + + public List getCommunityForDatasource( + final String dts, final Map> param) { + List> lp = datasourceMap.get(dts); + if (lp == null) + return Lists.newArrayList(); + + return lp + .stream() + .map( + p -> { + if (p.getSnd() == null) + return p.getFst(); + if (((SelectionConstraints) p.getSnd()).verifyCriteria(param)) + return p.getFst(); + else + return null; + }) + .filter(st -> (st != null)) + .collect(Collectors.toList()); + } + + public List> getCommunityForZenodoCommunity(String zc) { + return zenodocommunityMap.get(zc); + } + + public List getCommunityForSubjectValue(String value) { + + return getContextIds(subjectMap.get(value)); + } + + public List getCommunityForDatasourceValue(String value) { + + return getContextIds(datasourceMap.get(value.toLowerCase())); + } + + public List getCommunityForZenodoCommunityValue(String value) { + + return getContextIds(zenodocommunityMap.get(value.toLowerCase())); + } + + private List getContextIds(List> list) { + if (list != null) { + return list.stream().map(p -> p.getFst()).collect(Collectors.toList()); + } + return Lists.newArrayList(); + } + + public Map getCommunities() { + return communities; + } + + public void setCommunities(Map communities) { + this.communities = communities; + } + + public String 
toJson() { + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(Selection.class, new InterfaceAdapter()); + Gson gson = builder.create(); + + return gson.toJson(this); + } + + public int size() { + return communities.keySet().size(); + } + + public Community getCommunityById(String id) { + return communities.get(id); + } + + public List getCommunityList() { + return Lists.newLinkedList(communities.values()); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java new file mode 100644 index 0000000000..607315f3f8 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java @@ -0,0 +1,138 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; + +import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter; +import eu.dnetlib.dhp.bulktag.criteria.Selection; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory; + +/** Created by miriam on 03/08/2018. */ +public class CommunityConfigurationFactory { + + private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class); + + private static VerbResolver resolver = VerbResolverFactory.newInstance(); + + public static CommunityConfiguration newInstance(final String xml) throws DocumentException { + + log.debug(String.format("parsing community configuration from:\n%s", xml)); + + final Document doc = new SAXReader().read(new StringReader(xml)); + + final Map communities = Maps.newHashMap(); + + for (final Object o : doc.selectNodes("//community")) { + + final Node node = (Node) o; + + final Community community = parseCommunity(node); + + if (community.isValid()) { + communities.put(community.getId(), community); + } + } + + log.info(String.format("loaded %s community configuration profiles", communities.size())); + log.debug(String.format("loaded community configuration:\n%s", communities.toString())); + + return new CommunityConfiguration(communities); + } + + public static CommunityConfiguration fromJson(final String json) { + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(Selection.class, new InterfaceAdapter()); + Gson gson = builder.create(); + final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class); + log.info(String.format("loaded %s community configuration profiles", conf.size())); + conf.init(); + log.info("created inverse maps"); + + return conf; + } + + private static Community parseCommunity(final Node node) { + + final Community c = new Community(); + + c.setId(node.valueOf("./@id")); + + log.info(String.format("community id: %s", c.getId())); + + c.setSubjects(parseSubjects(node)); + c.setProviders(parseDatasources(node)); + c.setZenodoCommunities(parseZenodoCommunities(node)); + return c; + } + + private static List parseSubjects(final 
Node node) { + + final List subjects = Lists.newArrayList(); + + final List list = node.selectNodes("./subjects/subject"); + + for (Node n : list) { + log.debug("text of the node " + n.getText()); + subjects.add(StringUtils.trim(n.getText())); + } + log.info("size of the subject list " + subjects.size()); + return subjects; + } + + private static List parseDatasources(final Node node) { + final List list = node.selectNodes("./datasources/datasource"); + final List providerList = new ArrayList<>(); + for (Node n : list) { + Provider d = new Provider(); + d.setOpenaireId(n.selectSingleNode("./openaireId").getText()); + d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver); + providerList.add(d); + } + log.info("size of the datasource list " + providerList.size()); + return providerList; + } + + private static List parseZenodoCommunities(final Node node) { + final Node oacommunitynode = node.selectSingleNode("./oacommunity"); + String oacommunity = null; + if (oacommunitynode != null) { + String tmp = oacommunitynode.getText(); + if (StringUtils.isNotBlank(tmp)) + oacommunity = tmp; + } + + final List list = node.selectNodes("./zenodocommunities/zenodocommunity"); + final List zenodoCommunityList = new ArrayList<>(); + for (Node n : list) { + ZenodoCommunity zc = new ZenodoCommunity(); + zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText()); + zc.setSelCriteria(n.selectSingleNode("./selcriteria")); + + zenodoCommunityList.add(zc); + } + if (oacommunity != null) { + ZenodoCommunity zc = new ZenodoCommunity(); + zc.setZenodoCommunityId(oacommunity); + zenodoCommunityList.add(zc); + } + log.info("size of the zenodo community list " + zenodoCommunityList.size()); + return zenodoCommunityList; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java new file mode 100644 index 0000000000..e0856ae8f6 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; + +import eu.dnetlib.dhp.bulktag.criteria.Selection; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; + +public class Constraint implements Serializable { + private String verb; + private String field; + private String value; + private Selection selection; + + public Constraint() { + } + + public String getVerb() { + return verb; + } + + public void setVerb(String verb) { + this.verb = verb; + } + + public String getField() { + return field; + } + + public void setField(String field) { + this.field = field; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + public void setSelection(Selection sel) { + selection = sel; + } + + public void setSelection(VerbResolver resolver) + throws InvocationTargetException, NoSuchMethodException, InstantiationException, + IllegalAccessException { + selection = resolver.getSelectionCriteria(verb, value); + } + + public boolean verifyCriteria(String metadata) { + return selection.apply(metadata); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java new file mode 100644 index 0000000000..b56dfaaa31 --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java @@ -0,0 +1,74 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Type; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; + +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; + +/** Created by miriam on 02/08/2018. */ +public class Constraints implements Serializable { + private static final Log log = LogFactory.getLog(Constraints.class); + // private ConstraintEncapsulator ce; + private List constraint; + + public Constraints() { + } + + public List getConstraint() { + return constraint; + } + + public void setConstraint(List constraint) { + this.constraint = constraint; + } + + public void setSc(String json) { + Type collectionType = new TypeToken>() { + }.getType(); + constraint = new Gson().fromJson(json, collectionType); + } + + void setSelection(VerbResolver resolver) { + for (Constraint st : constraint) { + + try { + st.setSelection(resolver); + } catch (NoSuchMethodException e) { + log.error(e.getMessage()); + } catch (IllegalAccessException e) { + log.error(e.getMessage()); + } catch (InvocationTargetException e) { + log.error(e.getMessage()); + } catch (InstantiationException e) { + log.error(e.getMessage()); + } + } + } + + // Constraint in and + public boolean verifyCriteria(final Map> param) { + + for (Constraint sc : constraint) { + boolean verified = false; + for (String value : param.get(sc.getField())) { + if (sc.verifyCriteria(value.trim())) { + verified = true; + } + } + if (!verified) + return verified; + } + return true; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java new file mode 100644 index 0000000000..50e1836fa0 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java @@ -0,0 +1,39 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; + +import com.google.gson.Gson; + +/** Created by miriam on 03/08/2018. 
*/ +public class Pair implements Serializable { + private A fst; + private B snd; + + public A getFst() { + return fst; + } + + public Pair setFst(A fst) { + this.fst = fst; + return this; + } + + public B getSnd() { + return snd; + } + + public Pair setSnd(B snd) { + this.snd = snd; + return this; + } + + public Pair(A a, B b) { + fst = a; + snd = b; + } + + public String toJson() { + return new Gson().toJson(this); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java new file mode 100644 index 0000000000..fd74817199 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java @@ -0,0 +1,12 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; +import java.util.HashMap; + +public class ProtoMap extends HashMap implements Serializable { + + public ProtoMap() { + super(); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java new file mode 100644 index 0000000000..b9c37f4dcc --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java @@ -0,0 +1,61 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Node; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; + +/** Created by miriam on 01/08/2018. */ +public class Provider implements Serializable { + private static final Log log = LogFactory.getLog(Provider.class); + + private String openaireId; + + private SelectionConstraints selectionConstraints; + + public SelectionConstraints getSelCriteria() { + return selectionConstraints; + } + + public SelectionConstraints getSelectionConstraints() { + return selectionConstraints; + } + + public void setSelectionConstraints(SelectionConstraints selectionConstraints) { + this.selectionConstraints = selectionConstraints; + } + + public void setSelCriteria(SelectionConstraints selCriteria) { + this.selectionConstraints = selCriteria; + } + + public String getOpenaireId() { + return openaireId; + } + + public void setOpenaireId(String openaireId) { + this.openaireId = openaireId; + } + + private void setSelCriteria(String json, VerbResolver resolver) { + log.info("Selection constraints for datasource = " + json); + selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class); + + selectionConstraints.setSelection(resolver); + } + + public void setSelCriteria(Node n, VerbResolver resolver) { + try { + setSelCriteria(n.getText(), resolver); + } catch (Exception e) { + log.info("not set selection criteria... 
"); + selectionConstraints = null; + } + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java new file mode 100644 index 0000000000..7ec2f916fb --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java @@ -0,0 +1,65 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.util.List; + +import org.dom4j.DocumentException; + +import com.google.common.base.Joiner; + +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +public class QueryInformationSystem { + private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " + + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() " + + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept " + + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept " + + " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept " + + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " + + " return " + + " " + + " { $x//CONFIGURATION/context/@id} " + + " " + + " {for $y in tokenize($subj,',') " + + " return " + + " {$y}} " + + " " + + " " + + " {for $d in $datasources " + + " where $d/param[./@name='enabled']/text()='true' " + + " return " + + " " + + " " + + " {$d//param[./@name='openaireId']/text()} " + + " " + + " " + + " {$d/param[./@name='selcriteria']/text()} " + + " " + + " } " + + " " + + " " + + " {for $zc in $communities " + + " return " + + " " + + " " + + " {$zc/param[./@name='zenodoid']/text()} " + + " " + + " " + + " {$zc/param[./@name='selcriteria']/text()} " + + " " + + " } " + + " " + + " "; + + public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl) + throws ISLookUpException, DocumentException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + final List res = isLookUp.quickSearchProfile(XQUERY); + + final String xmlConf = "" + Joiner.on(" ").join(res) + ""; + + return CommunityConfigurationFactory.newInstance(xmlConf); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java new file mode 100644 index 0000000000..f5a985d15f --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -0,0 +1,247 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.lang3.StringUtils; + +import com.google.gson.Gson; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.schema.oaf.*; + +/** Created by miriam on 02/08/2018. 
*/ +public class ResultTagger implements Serializable { + + private String trust = "0.8"; + + private boolean clearContext(Result result) { + int tmp = result.getContext().size(); + List clist = result + .getContext() + .stream() + .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR))) + .collect(Collectors.toList()); + result.setContext(clist); + return (tmp != clist.size()); + } + + private Map> getParamMap(final Result result, Map params) { + Map> param = new HashMap<>(); + String json = new Gson().toJson(result, Result.class); + DocumentContext jsonContext = JsonPath.parse(json); + if (params == null) { + params = new HashMap<>(); + } + for (String key : params.keySet()) { + try { + param.put(key, jsonContext.read(params.get(key))); + } catch (com.jayway.jsonpath.PathNotFoundException e) { + param.put(key, new ArrayList<>()); + // throw e; + } + } + return param; + } + + public R enrichContextCriteria( + final R result, final CommunityConfiguration conf, final Map criteria) { + + // } + // public Result enrichContextCriteria(final Result result, final CommunityConfiguration + // conf, final Map criteria) { + final Map> param = getParamMap(result, criteria); + + // Verify if the entity is deletedbyinference. In case verify if to clean the context list + // from all the zenodo communities + if (result.getDataInfo().getDeletedbyinference()) { + clearContext(result); + return result; + } + + // communities contains all the communities to be added as context for the result + final Set communities = new HashSet<>(); + + // tagging for Subject + final Set subjects = new HashSet<>(); + Optional> oresultsubj = Optional.ofNullable(result.getSubject()); + if (oresultsubj.isPresent()) { + oresultsubj + .get() + .stream() + .map(subject -> subject.getValue()) + .filter(StringUtils::isNotBlank) + .map(String::toLowerCase) + .map(String::trim) + .collect(Collectors.toCollection(HashSet::new)) + .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s))); + } + + communities.addAll(subjects); + + // Tagging for datasource + final Set datasources = new HashSet<>(); + final Set tmp = new HashSet<>(); + + Optional> oresultinstance = Optional.ofNullable(result.getInstance()); + if (oresultinstance.isPresent()) { + for (Instance i : oresultinstance.get()) { + tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); + tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); + } + + oresultinstance + .get() + .stream() + .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey())) + .flatMap(p -> Stream.of(p.getFst(), p.getSnd())) + .map(s -> StringUtils.substringAfter(s, "|")) + .collect(Collectors.toCollection(HashSet::new)) + .forEach( + dsId -> datasources + .addAll( + conf.getCommunityForDatasource(dsId, param))); + } + + communities.addAll(datasources); + + /* Tagging for Zenodo Communities */ + final Set czenodo = new HashSet<>(); + + Optional> oresultcontext = Optional.ofNullable(result.getContext()); + if (oresultcontext.isPresent()) { + oresultcontext + .get() + .stream() + .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR)) + .collect(Collectors.toList()) + .forEach( + c -> czenodo + .addAll( + conf + .getCommunityForZenodoCommunityValue( + c + .getId() + .substring( + c.getId().lastIndexOf("/") + 1) + .trim()))); + } + + communities.addAll(czenodo); + + clearContext(result); + + /* Verify if there is something to bulktag */ + if (communities.isEmpty()) { + return result; + } + + result + .getContext() + .stream() + .map( + c 
-> { + if (communities.contains(c.getId())) { + Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo()); + List dataInfoList; + if (opt_dataInfoList.isPresent()) + dataInfoList = opt_dataInfoList.get(); + else { + dataInfoList = new ArrayList<>(); + c.setDataInfo(dataInfoList); + } + if (subjects.contains(c.getId())) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_SUBJECT, + CLASS_NAME_BULKTAG_SUBJECT)); + if (datasources.contains(c.getId())) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_DATASOURCE, + CLASS_NAME_BULKTAG_DATASOURCE)); + if (czenodo.contains(c.getId())) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_CZENODO, + CLASS_NAME_BULKTAG_ZENODO)); + } + return c; + }) + .collect(Collectors.toList()); + + communities + .removeAll( + result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet())); + + if (communities.isEmpty()) + return result; + + List toaddcontext = communities + .stream() + .map( + c -> { + Context context = new Context(); + context.setId(c); + List dataInfoList = new ArrayList<>(); + if (subjects.contains(c)) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_SUBJECT, + CLASS_NAME_BULKTAG_SUBJECT)); + if (datasources.contains(c)) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_DATASOURCE, + CLASS_NAME_BULKTAG_DATASOURCE)); + if (czenodo.contains(c)) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_CZENODO, + CLASS_NAME_BULKTAG_ZENODO)); + context.setDataInfo(dataInfoList); + return context; + }) + .collect(Collectors.toList()); + + result.getContext().addAll(toaddcontext); + return result; + } + + public static DataInfo getDataInfo( + String inference_provenance, String inference_class_id, String inference_class_name) { + DataInfo di = new DataInfo(); + di.setInferred(true); + di.setInferenceprovenance(inference_provenance); + di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name)); + return di; + } + + public static Qualifier getQualifier(String inference_class_id, String inference_class_name) { + Qualifier pa = new Qualifier(); + pa.setClassid(inference_class_id); + pa.setClassname(inference_class_name); + pa.setSchemeid(DNET_PROVENANCE_ACTIONS); + pa.setSchemename(DNET_PROVENANCE_ACTIONS); + return pa; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java new file mode 100644 index 0000000000..71ff61d1b3 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java @@ -0,0 +1,51 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; +import java.lang.reflect.Type; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; + +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; + +public class SelectionConstraints implements Serializable { + private List criteria; + + public SelectionConstraints() { + } + + public List getCriteria() { + return criteria; + } + + public void setCriteria(List criteria) { + this.criteria = criteria; + } + + public void setSc(String json) { + Type collectionType = new TypeToken>() { + }.getType(); + criteria = new Gson().fromJson(json, collectionType); + } + + // Constraints in or + 
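+ // i.e. the Constraints entries of the list are combined in OR: verifyCriteria returns true as soon as
+ // one entry is satisfied by the given parameter map, and false when none is (or when the list is empty).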
public boolean verifyCriteria(final Map> param) { + for (Constraints selc : criteria) { + if (selc.verifyCriteria(param)) { + return true; + } + } + return false; + } + + public void setSelection(VerbResolver resolver) { + + for (Constraints cs : criteria) { + cs.setSelection(resolver); + } + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java new file mode 100644 index 0000000000..3cdc7c9411 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java @@ -0,0 +1,17 @@ + +package eu.dnetlib.dhp.bulktag.community; + +public class TaggingConstants { + + public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging"; + + public static final String CLASS_ID_SUBJECT = "community:subject"; + public static final String CLASS_ID_DATASOURCE = "community:datasource"; + public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; + + public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/"; + + public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject"; + public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource"; + public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo"; +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java new file mode 100644 index 0000000000..bc6b75fba2 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.bulktag.community; + +import java.io.Serializable; + +import org.dom4j.Node; + +import com.google.gson.Gson; + +/** Created by miriam on 01/08/2018. 
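+ * Holds the identifier of a Zenodo community together with the optional selection constraints parsed
+ * (via Gson) from the corresponding node of the community profile; when the node is absent the
+ * constraints are left null.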
*/ +public class ZenodoCommunity implements Serializable { + + private String zenodoCommunityId; + + private SelectionConstraints selCriteria; + + public String getZenodoCommunityId() { + return zenodoCommunityId; + } + + public void setZenodoCommunityId(String zenodoCommunityId) { + this.zenodoCommunityId = zenodoCommunityId; + } + + public SelectionConstraints getSelCriteria() { + return selCriteria; + } + + public void setSelCriteria(SelectionConstraints selCriteria) { + this.selCriteria = selCriteria; + } + + private void setSelCriteria(String json) { + // Type collectionType = new TypeToken>(){}.getType(); + selCriteria = new Gson().fromJson(json, SelectionConstraints.class); + } + + public void setSelCriteria(Node n) { + if (n == null) { + selCriteria = null; + } else { + setSelCriteria(n.getText()); + } + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java new file mode 100644 index 0000000000..496630fa31 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("contains") +public class ContainsVerb implements Selection, Serializable { + + private String param; + + public ContainsVerb() { + } + + public ContainsVerb(final String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return value.contains(param); + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = param; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java new file mode 100644 index 0000000000..a4a6f5663d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("contains_ignorecase") +public class ContainsVerbIgnoreCase implements Selection, Serializable { + + private String param; + + public ContainsVerbIgnoreCase() { + } + + public ContainsVerbIgnoreCase(final String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return value.toLowerCase().contains(param.toLowerCase()); + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = param; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java new file mode 100644 index 0000000000..b9088d0128 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("equals") +public class EqualVerb implements Selection, Serializable { + + private String param; + + public EqualVerb() { + } + + public EqualVerb(final String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return value.equals(param); + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = 
param; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java new file mode 100644 index 0000000000..c5f0ce0703 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("equals_ignorecase") +public class EqualVerbIgnoreCase implements Selection, Serializable { + + private String param; + + public EqualVerbIgnoreCase() { + } + + public EqualVerbIgnoreCase(final String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return value.equalsIgnoreCase(param); + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = param; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java new file mode 100644 index 0000000000..e9b948b2b8 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java @@ -0,0 +1,43 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.lang.reflect.Type; + +import com.google.gson.*; + +public class InterfaceAdapter implements JsonSerializer, JsonDeserializer { + + private static final String CLASSNAME = "CLASSNAME"; + private static final String DATA = "DATA"; + + public Object deserialize( + JsonElement jsonElement, + Type type, + JsonDeserializationContext jsonDeserializationContext) + throws JsonParseException { + + JsonObject jsonObject = jsonElement.getAsJsonObject(); + JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME); + String className = prim.getAsString(); + Class klass = getObjectClass(className); + return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass); + } + + public JsonElement serialize( + Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) { + JsonObject jsonObject = new JsonObject(); + jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName()); + jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement)); + return jsonObject; + } + + /** **** Helper method to get the className of the object to be deserialized **** */ + public Class getObjectClass(String className) { + try { + return Class.forName(className); + } catch (ClassNotFoundException e) { + // e.printStackTrace(); + throw new JsonParseException(e.getMessage()); + } + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java new file mode 100644 index 0000000000..03ec9804b9 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("not_contains") +public class NotContainsVerb implements Selection, Serializable { + + private String param; + + public NotContainsVerb() { + } + + public NotContainsVerb(final String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return !value.contains(param); + } + + public String getParam() { + 
return param; + } + + public void setParam(String param) { + this.param = param; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java new file mode 100644 index 0000000000..b21be83f0e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("not_contains_ignorecase") +public class NotContainsVerbIgnoreCase implements Selection, Serializable { + + private String param; + + public NotContainsVerbIgnoreCase() { + } + + public NotContainsVerbIgnoreCase(final String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return !(value.toLowerCase().contains(param.toLowerCase())); + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = param; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java new file mode 100644 index 0000000000..86bf00012a --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("not_equals") +public class NotEqualVerb implements Selection, Serializable { + + private String param; + + public NotEqualVerb(final String param) { + this.param = param; + } + + public NotEqualVerb() { + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return !value.equals(param); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java new file mode 100644 index 0000000000..c6958a6414 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; + +@VerbClass("not_equals_ignorecase") +public class NotEqualVerbIgnoreCase implements Selection, Serializable { + + private String param; + + public NotEqualVerbIgnoreCase(final String param) { + this.param = param; + } + + public NotEqualVerbIgnoreCase() { + } + + public String getParam() { + return param; + } + + public void setParam(String param) { + this.param = param; + } + + @Override + public boolean apply(String value) { + return !value.equalsIgnoreCase(param); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java new file mode 100644 index 0000000000..ec9fb716d4 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java @@ -0,0 +1,7 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +public interface Selection { + + boolean apply(String value); +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java new file mode 100644 index 0000000000..5b35919bd5 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java @@ -0,0 +1,14 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +@interface VerbClass { + + String value(); +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java new file mode 100644 index 0000000000..3d0db20632 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.util.Map; +import java.util.stream.Collectors; + +import io.github.classgraph.ClassGraph; +import io.github.classgraph.ClassInfo; +import io.github.classgraph.ClassInfoList; +import io.github.classgraph.ScanResult; + +public class VerbResolver implements Serializable { + private Map> map = null; // = new HashMap<>(); + private final ClassGraph classgraph = new ClassGraph(); + + public VerbResolver() { + + try (ScanResult scanResult = // Assign scanResult in try-with-resources + classgraph // Create a new ClassGraph instance + .verbose() // If you want to enable logging to stderr + .enableAllInfo() // Scan classes, methods, fields, annotations + .whitelistPackages( + "eu.dnetlib.dhp.bulktag.criteria") // Scan com.xyz and subpackages + .scan()) { // Perform the scan and return a ScanResult + + ClassInfoList routeClassInfoList = scanResult + .getClassesWithAnnotation( + "eu.dnetlib.dhp.bulktag.criteria.VerbClass"); + + this.map = routeClassInfoList + .stream() + .collect( + Collectors + .toMap( + value -> (String) ((ClassInfo) value) + .getAnnotationInfo() + .get(0) + .getParameterValues() + .get(0) + .getValue(), + value -> (Class) ((ClassInfo) value).loadClass())); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public Selection getSelectionCriteria(String name, String param) + throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, + InstantiationException { + + // return Class.forName(tmp_map.get(name)). 
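+ // map associates the value of each @VerbClass annotation with the annotated Selection implementation
+ // discovered by the ClassGraph scan above; the verb is instantiated reflectively through its single
+ // String-argument constructor, e.g. getSelectionCriteria("not_contains", "foo") -> new NotContainsVerb("foo").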
+ return map.get(name).getDeclaredConstructor((String.class)).newInstance(param); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java new file mode 100644 index 0000000000..0bb801999a --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java @@ -0,0 +1,10 @@ + +package eu.dnetlib.dhp.bulktag.criteria; + +public class VerbResolverFactory { + + public static VerbResolver newInstance() { + + return new VerbResolver(); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java new file mode 100644 index 0000000000..271cc6bb36 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java @@ -0,0 +1,25 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import java.io.Serializable; + +public class CountrySbs implements Serializable { + private String classid; + private String classname; + + public String getClassid() { + return classid; + } + + public void setClassid(String classid) { + this.classid = classid; + } + + public String getClassname() { + return classname; + } + + public void setClassname(String classname) { + this.classname = classname; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java new file mode 100644 index 0000000000..642192f734 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java @@ -0,0 +1,25 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import java.io.Serializable; + +public class DatasourceCountry implements Serializable { + private String dataSourceId; + private CountrySbs country; + + public String getDataSourceId() { + return dataSourceId; + } + + public void setDataSourceId(String dataSourceId) { + this.dataSourceId = dataSourceId; + } + + public CountrySbs getCountry() { + return country; + } + + public void setCountry(CountrySbs country) { + this.country = country; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java new file mode 100644 index 0000000000..e91a1e48a7 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -0,0 +1,121 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; + +/** + * For the association of the country to the datasource The association is computed only for datasource of specific type + * or having whitelisted ids The country is registered in the Organization associated to the Datasource, so the relation + * provides between Datasource and Organization is exploited to get the country for the datasource + */ +public class PrepareDatasourceCountryAssociation { + + private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareDatasourceCountryAssociation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareDatasourceCountryAssociation( + spark, + Arrays.asList(parser.get("whitelist").split(";")), + Arrays.asList(parser.get("allowedtypes").split(";")), + inputPath, + outputPath); + }); + } + + private static void prepareDatasourceCountryAssociation( + SparkSession spark, + List whitelist, + List allowedtypes, + String inputPath, + String outputPath) { + String whitelisted = ""; + for (String i : whitelist) { + whitelisted += " OR id = '" + i + "'"; + } + + Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class); + Dataset relation = readPath(spark, inputPath + "/relation", Relation.class); + Dataset organization = readPath(spark, inputPath + "/organization", Organization.class); + + datasource.createOrReplaceTempView("datasource"); + relation.createOrReplaceTempView("relation"); + organization.createOrReplaceTempView("organization"); + + String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country " + + "FROM ( SELECT id " + + " FROM datasource " + + " WHERE (datainfo.deletedbyinference = false " + + whitelisted + + ") " + + getConstraintList("datasourcetype.classid = '", allowedtypes) + + ") d " + + "JOIN ( SELECT source, target " + + " FROM relation " + + " WHERE relclass = '" + + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + "' " + + " AND datainfo.deletedbyinference = false ) rel " + + "ON d.id = rel.source " + + "JOIN (SELECT id, country " + + " FROM organization " + + " WHERE datainfo.deletedbyinference = false " + + " AND length(country.classid) > 0) o " + + "ON o.id = rel.target"; + + spark + .sql(query) + .as(Encoders.bean(DatasourceCountry.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(outputPath); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java new file mode 
100644 index 0000000000..34b3764131 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java @@ -0,0 +1,98 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; + +public class PrepareResultCountrySet { + private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class); + + private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet " + + "FROM ( SELECT id, country " + + "FROM datasource_country JOIN cfhb ON cf = dataSourceId " + + "UNION ALL " + + "SELECT id, country FROM datasource_country " + + "JOIN cfhb ON hb = dataSourceId ) tmp " + + "GROUP BY id"; + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultCountrySet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String datasourcecountrypath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", datasourcecountrypath); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + getPotentialResultToUpdate( + spark, + inputPath, + outputPath, + datasourcecountrypath, + resultClazz); + }); + } + + private static void getPotentialResultToUpdate( + SparkSession spark, + String inputPath, + String outputPath, + String datasourcecountrypath, + Class resultClazz) { + + Dataset result = readPath(spark, inputPath, resultClazz); + result.createOrReplaceTempView("result"); + // log.info("number of results: {}", result.count()); + createCfHbforResult(spark); + + Dataset datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class); + + datasource_country.createOrReplaceTempView("datasource_country"); + // log.info("datasource_country number : {}", datasource_country.count()); + + spark + .sql(RESULT_COUNTRYSET_QUERY) + .as(Encoders.bean(ResultCountrySet.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Append) + .json(outputPath); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java new file mode 100644 index 0000000000..8c29424f29 --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import java.io.Serializable; +import java.util.ArrayList; + +public class ResultCountrySet implements Serializable { + private String resultId; + private ArrayList countrySet; + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public ArrayList getCountrySet() { + return countrySet; + } + + public void setCountrySet(ArrayList countrySet) { + this.countrySet = countrySet; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java new file mode 100644 index 0000000000..9dc17701bc --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -0,0 +1,132 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Country; +import eu.dnetlib.dhp.schema.oaf.Result; +import scala.Tuple2; + +public class SparkCountryPropagationJob { + + private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkCountryPropagationJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + String preparedInfoPath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", preparedInfoPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> execPropagation( + spark, + sourcePath, + preparedInfoPath, + outputPath, + resultClazz, + saveGraph)); + } + + private static void execPropagation( + SparkSession spark, + String 
sourcePath, + String preparedInfoPath, + String outputPath, + Class resultClazz, + boolean saveGraph) { + + if (saveGraph) { + // updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath); + log.info("Reading Graph table from: {}", sourcePath); + Dataset res = readPath(spark, sourcePath, resultClazz); + + log.info("Reading prepared info: {}", preparedInfoPath); + Dataset prepared = spark + .read() + .json(preparedInfoPath) + .as(Encoders.bean(ResultCountrySet.class)); + + res + .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer") + .map(getCountryMergeFn(), Encoders.bean(resultClazz)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(outputPath); + } + } + + private static MapFunction, R> getCountryMergeFn() { + return (MapFunction, R>) t -> { + Optional.ofNullable(t._2()).ifPresent(r -> { + t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); + }); + return t._1(); + }; + } + + private static List merge(List c1, List c2) { + HashSet countries = c1 + .stream() + .map(c -> c.getClassid()) + .collect(Collectors.toCollection(HashSet::new)); + + return c2 + .stream() + .filter(c -> !countries.contains(c.getClassid())) + .map(c -> getCountry(c.getClassid(), c.getClassname())) + .collect(Collectors.toList()); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java new file mode 100644 index 0000000000..a5fcab360d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java @@ -0,0 +1,43 @@ + +package eu.dnetlib.dhp.orcidtoresultfromsemrel; + +public class AutoritativeAuthor { + + private String name; + private String surname; + private String fullname; + private String orcid; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getSurname() { + return surname; + } + + public void setSurname(String surname) { + this.surname = surname; + } + + public String getFullname() { + return fullname; + } + + public void setFullname(String fullname) { + this.fullname = fullname; + } + + public String getOrcid() { + return orcid; + } + + public void setOrcid(String orcid) { + this.orcid = orcid; + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java new file mode 100644 index 0000000000..3e16b4b4b3 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -0,0 +1,125 @@ + +package eu.dnetlib.dhp.orcidtoresultfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + 
+import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class PrepareResultOrcidAssociationStep1 { + private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class); + + public static void main(String[] args) throws Exception { + String jsonConf = IOUtils + .toString( + PrepareResultOrcidAssociationStep1.class + .getResourceAsStream( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); + log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); + + final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); + log.info("resultType: {}", resultType); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + String inputRelationPath = inputPath + "/relation"; + log.info("inputRelationPath: {}", inputRelationPath); + + String inputResultPath = inputPath + "/" + resultType; + log.info("inputResultPath: {}", inputResultPath); + + String outputResultPath = outputPath + "/" + resultType; + log.info("outputResultPath: {}", outputResultPath); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + prepareInfo( + spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel); + }); + } + + private static void prepareInfo( + SparkSession spark, + String inputRelationPath, + String inputResultPath, + String outputResultPath, + Class resultClazz, + List allowedsemrel) { + + Dataset relation = readPath(spark, inputRelationPath, Relation.class); + relation.createOrReplaceTempView("relation"); + + log.info("Reading Graph table from: {}", inputResultPath); + Dataset result = readPath(spark, inputResultPath, resultClazz); + result.createOrReplaceTempView("result"); + + String query = " select target resultId, author authorList" + + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + + " from ( " + + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " + + " from result " + + " lateral view explode (author) a as MyT " + + " lateral view explode (MyT.pid) p as MyP " + + " where MyP.qualifier.classid = 'ORCID') tmp " + + " group by id) r_t " + + " join (" + + " select source, target " + + " from relation " + + " where datainfo.deletedbyinference = false " + + getConstraintList(" relclass = '", allowedsemrel) + + ") rel_rel " + + " on source = id"; + spark + .sql(query) + .as(Encoders.bean(ResultOrcidList.class)) + .write() + .option("compression", "gzip") + 
.mode(SaveMode.Overwrite) + .json(outputResultPath); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java new file mode 100644 index 0000000000..65d8811bc7 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java @@ -0,0 +1,97 @@ + +package eu.dnetlib.dhp.orcidtoresultfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import scala.Tuple2; + +public class PrepareResultOrcidAssociationStep2 { + private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultOrcidAssociationStep2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + mergeInfo(spark, inputPath, outputPath); + }); + } + + private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) { + + Dataset resultOrcidAssoc = readPath(spark, inputPath + "/publication", ResultOrcidList.class) + .union(readPath(spark, inputPath + "/dataset", ResultOrcidList.class)) + .union(readPath(spark, inputPath + "/otherresearchproduct", ResultOrcidList.class)) + .union(readPath(spark, inputPath + "/software", ResultOrcidList.class)); + + resultOrcidAssoc + .toJavaRDD() + .mapToPair(r -> new Tuple2<>(r.getResultId(), r)) + .reduceByKey( + (a, b) -> { + if (a == null) { + return b; + } + if (b == null) { + return a; + } + Set orcid_set = new HashSet<>(); + a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid())); + b + .getAuthorList() + .stream() + .forEach( + aa -> { + if (!orcid_set.contains(aa.getOrcid())) { + a.getAuthorList().add(aa); + orcid_set.add(aa.getOrcid()); + } + }); + return a; + }) + .map(c -> c._2()) + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath, GzipCodec.class); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java 
new file mode 100644 index 0000000000..54b415d1c6 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java @@ -0,0 +1,27 @@ + +package eu.dnetlib.dhp.orcidtoresultfromsemrel; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +public class ResultOrcidList implements Serializable { + String resultId; + List authorList = new ArrayList<>(); + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public List getAuthorList() { + return authorList; + } + + public void setAuthorList(List authorList) { + this.authorList = authorList; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java new file mode 100644 index 0000000000..ebb75a5a67 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -0,0 +1,199 @@ + +package eu.dnetlib.dhp.orcidtoresultfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.List; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import scala.Tuple2; + +public class SparkOrcidToResultFromSemRelJob { + private static final Logger log = LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkOrcidToResultFromSemRelJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String possibleUpdates = parser.get("possibleUpdatesPath"); + log.info("possibleUpdatesPath: {}", possibleUpdates); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", 
parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + if (saveGraph) + execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); + }); + } + + private static void execPropagation( + SparkSession spark, + String possibleUpdatesPath, + String inputPath, + String outputPath, + Class resultClazz) { + + // read possible updates (resultId and list of possible orcid to add + Dataset possible_updates = readPath(spark, possibleUpdatesPath, ResultOrcidList.class); + // read the result we have been considering + Dataset result = readPath(spark, inputPath, resultClazz); + // make join result left_outer with possible updates + + result + .joinWith( + possible_updates, + result.col("id").equalTo(possible_updates.col("resultId")), + "left_outer") + .map(authorEnrichFn(), Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static MapFunction, R> authorEnrichFn() { + return (MapFunction, R>) value -> { + R ret = value._1(); + Optional rol = Optional.ofNullable(value._2()); + if (rol.isPresent()) { + List toenrich_author = ret.getAuthor(); + List autoritativeAuthors = rol.get().getAuthorList(); + for (Author author : toenrich_author) { + if (!containsAllowedPid(author)) { + enrichAuthor(author, autoritativeAuthors); + } + } + } + + return ret; + }; + } + + private static void enrichAuthor(Author a, List au) { + for (AutoritativeAuthor aa : au) { + if (enrichAuthor(aa, a)) { + return; + } + } + } + + private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) { + boolean toaddpid = false; + + if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) { + if (StringUtils.isNoneEmpty(author.getSurname())) { + if (autoritative_author + .getSurname() + .trim() + .equalsIgnoreCase(author.getSurname().trim())) { + + // have the same surname. Check the name + if (StringUtils.isNoneEmpty(autoritative_author.getName())) { + if (StringUtils.isNoneEmpty(author.getName())) { + if (autoritative_author + .getName() + .trim() + .equalsIgnoreCase(author.getName().trim())) { + toaddpid = true; + } + // they could be differently written (i.e. 
only the initials of the name + // in one of the two + if (autoritative_author + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + toaddpid = true; + } + } + } + } + } + } + if (toaddpid) { + StructuredProperty p = new StructuredProperty(); + p.setValue(autoritative_author.getOrcid()); + p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID)); + p + .setDataInfo( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID, + PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME)); + + Optional> authorPid = Optional.ofNullable(author.getPid()); + if (authorPid.isPresent()) { + authorPid.get().add(p); + } else { + author.setPid(Lists.newArrayList(p)); + } + + } + return toaddpid; + } + + private static boolean containsAllowedPid(Author a) { + Optional> pids = Optional.ofNullable(a.getPid()); + if (!pids.isPresent()) { + return false; + } + for (StructuredProperty pid : pids.get()) { + if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) { + return true; + } + } + return false; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java new file mode 100644 index 0000000000..05dcdc6928 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -0,0 +1,126 @@ + +package eu.dnetlib.dhp.projecttoresult; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.PropagationConstant.getConstraintList; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareProjectResultsAssociation { + private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareProjectResultsAssociation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String potentialUpdatePath = parser.get("potentialUpdatePath"); + log.info("potentialUpdatePath {}: ", potentialUpdatePath); + + String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath: {} ", alreadyLinkedPath); + + final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); + log.info("allowedSemRel: {}", new 
Gson().toJson(allowedsemrel)); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + prepareResultProjProjectResults( + spark, + inputPath, + potentialUpdatePath, + alreadyLinkedPath, + allowedsemrel); + }); + } + + private static void prepareResultProjProjectResults( + SparkSession spark, + String inputPath, + String potentialUpdatePath, + String alreadyLinkedPath, + List allowedsemrel) { + + Dataset relation = readPath(spark, inputPath, Relation.class); + relation.createOrReplaceTempView("relation"); + + String resproj_relation_query = "SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND relClass = '" + + RELATION_RESULT_PROJECT_REL_CLASS + + "'"; + + Dataset resproj_relation = spark.sql(resproj_relation_query); + resproj_relation.createOrReplaceTempView("resproj_relation"); + + String potential_update_query = "SELECT resultId, collect_set(projectId) projectSet " + + "FROM ( " + + "SELECT r1.target resultId, r2.target projectId " + + " FROM (SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + getConstraintList(" relClass = '", allowedsemrel) + + " ) r1" + + " JOIN resproj_relation r2 " + + " ON r1.source = r2.source " + + " ) tmp " + + "GROUP BY resultId "; + + spark + .sql(potential_update_query) + .as(Encoders.bean(ResultProjectSet.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(potentialUpdatePath); + + String result_projectset_query = "SELECT source resultId, collect_set(target) projectSet " + + "FROM resproj_relation " + + "GROUP BY source"; + + spark + .sql(result_projectset_query) + .as(Encoders.bean(ResultProjectSet.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(alreadyLinkedPath); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java new file mode 100644 index 0000000000..1d5280874b --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.projecttoresult; + +import java.io.Serializable; +import java.util.ArrayList; + +public class ResultProjectSet implements Serializable { + private String resultId; + private ArrayList projectSet; + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public ArrayList getProjectSet() { + return projectSet; + } + + public void setProjectSet(ArrayList project) { + this.projectSet = project; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java new file mode 100644 index 0000000000..36694b3dd5 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -0,0 +1,147 @@ + +package eu.dnetlib.dhp.projecttoresult; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; 
+import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; + +public class SparkResultToProjectThroughSemRelJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkResultToProjectThroughSemRelJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + final String potentialUpdatePath = parser.get("potentialUpdatePath"); + log.info("potentialUpdatePath {}: ", potentialUpdatePath); + + final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); + + final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph")); + log.info("saveGraph: {}", saveGraph); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + execPropagation( + spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph); + }); + } + + private static void execPropagation( + SparkSession spark, + String outputPath, + String alreadyLinkedPath, + String potentialUpdatePath, + Boolean saveGraph) { + + Dataset toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class); + Dataset alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class); + + if (saveGraph) { + toaddrelations + .joinWith( + alreadyLinked, + toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")), + "left_outer") + .flatMap(mapRelationRn(), Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(outputPath); + } + } + + private static FlatMapFunction, Relation> mapRelationRn() { + return (FlatMapFunction, Relation>) value -> { + List new_relations = new ArrayList<>(); + ResultProjectSet potential_update = value._1(); + Optional already_linked = Optional.ofNullable(value._2()); + if (already_linked.isPresent()) { + already_linked + .get() + .getProjectSet() + .stream() + .forEach( + (p -> { + if (potential_update + .getProjectSet() + .contains(p)) { + potential_update.getProjectSet().remove(p); + } + })); + } + String resId = potential_update.getResultId(); + potential_update + .getProjectSet() + .stream() + .forEach( + projectId -> { + new_relations + .add( + getRelation( + resId, + projectId, + RELATION_RESULT_PROJECT_REL_CLASS, + RELATION_RESULTPROJECT_REL_TYPE, + RELATION_RESULTPROJECT_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); + 
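+ // also emit the inverse relation (project -> result), so the propagated link is materialized in both directions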
new_relations + .add( + getRelation( + projectId, + resId, + RELATION_PROJECT_RESULT_REL_CLASS, + RELATION_RESULTPROJECT_REL_TYPE, + RELATION_RESULTPROJECT_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); + }); + return new_relations.iterator(); + }; + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java new file mode 100644 index 0000000000..7d786058a1 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.dhp.resulttocommunityfromorganization; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +public class OrganizationMap extends HashMap> { + + public OrganizationMap() { + super(); + } + + public List get(String key) { + + if (super.get(key) == null) { + return new ArrayList<>(); + } + return super.get(key); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java new file mode 100644 index 0000000000..e2d4d56878 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -0,0 +1,130 @@ + +package eu.dnetlib.dhp.resulttocommunityfromorganization; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.*; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareResultCommunitySet { + + private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultCommunitySet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final OrganizationMap organizationMap = new Gson() + .fromJson( + parser.get("organizationtoresultcommunitymap"), + OrganizationMap.class); + log.info("organizationMap: {}", new Gson().toJson(organizationMap)); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if 
(isTest(parser)) { + removeOutputDir(spark, outputPath); + } + prepareInfo(spark, inputPath, outputPath, organizationMap); + }); + } + + private static void prepareInfo( + SparkSession spark, + String inputPath, + String outputPath, + OrganizationMap organizationMap) { + + Dataset relation = readPath(spark, inputPath, Relation.class); + relation.createOrReplaceTempView("relation"); + + String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges " + + "FROM (SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND relClass = '" + + RELATION_RESULT_ORGANIZATION_REL_CLASS + + "') result_organization " + + "LEFT JOIN (SELECT source, collect_set(target) org_set " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND relClass = '" + + RELATION_REPRESENTATIVERESULT_RESULT_CLASS + + "' " + + " GROUP BY source) organization_organization " + + "ON result_organization.target = organization_organization.source "; + + Dataset result_organizationset = spark + .sql(query) + .as(Encoders.bean(ResultOrganizations.class)); + + result_organizationset + .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class)) + .filter(Objects::nonNull) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static MapFunction mapResultCommunityFn( + OrganizationMap organizationMap) { + return (MapFunction) value -> { + String rId = value.getResultId(); + Optional> orgs = Optional.ofNullable(value.getMerges()); + String oTarget = value.getOrgId(); + Set communitySet = new HashSet<>(); + if (organizationMap.containsKey(oTarget)) { + communitySet.addAll(organizationMap.get(oTarget)); + } + if (orgs.isPresent()) + for (String oId : orgs.get()) { + if (organizationMap.containsKey(oId)) { + communitySet.addAll(organizationMap.get(oId)); + } + } + if (communitySet.size() > 0) { + ResultCommunityList rcl = new ResultCommunityList(); + rcl.setResultId(rId); + ArrayList communityList = new ArrayList<>(); + communityList.addAll(communitySet); + rcl.setCommunityList(communityList); + return rcl; + } + return null; + }; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java new file mode 100644 index 0000000000..e3275745d8 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.resulttocommunityfromorganization; + +import java.io.Serializable; +import java.util.ArrayList; + +public class ResultCommunityList implements Serializable { + private String resultId; + private ArrayList communityList; + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public ArrayList getCommunityList() { + return communityList; + } + + public void setCommunityList(ArrayList communityList) { + this.communityList = communityList; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java new file mode 100644 index 0000000000..3ea9d41d66 --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java @@ -0,0 +1,35 @@ + +package eu.dnetlib.dhp.resulttocommunityfromorganization; + +import java.io.Serializable; +import java.util.ArrayList; + +public class ResultOrganizations implements Serializable { + private String resultId; + private String orgId; + private ArrayList merges; + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public String getOrgId() { + return orgId; + } + + public void setOrgId(String orgId) { + this.orgId = orgId; + } + + public ArrayList getMerges() { + return merges; + } + + public void setMerges(ArrayList merges) { + this.merges = merges; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java new file mode 100644 index 0000000000..71275cc7f0 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -0,0 +1,137 @@ + +package eu.dnetlib.dhp.resulttocommunityfromorganization; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; +import scala.Tuple2; + +public class SparkResultToCommunityFromOrganizationJob { + + private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkResultToCommunityFromOrganizationJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String possibleupdatespath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", possibleupdatespath); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", 
parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + if (saveGraph) + execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); + }); + } + + private static void execPropagation( + SparkSession spark, + String inputPath, + String outputPath, + Class resultClazz, + String possibleUpdatesPath) { + + Dataset possibleUpdates = readPath(spark, possibleUpdatesPath, ResultCommunityList.class); + Dataset result = readPath(spark, inputPath, resultClazz); + + result + .joinWith( + possibleUpdates, + result.col("id").equalTo(possibleUpdates.col("resultId")), + "left_outer") + .map(resultCommunityFn(), Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static MapFunction, R> resultCommunityFn() { + return (MapFunction, R>) value -> { + R ret = value._1(); + Optional rcl = Optional.ofNullable(value._2()); + if (rcl.isPresent()) { + ArrayList communitySet = rcl.get().getCommunityList(); + List contextList = ret + .getContext() + .stream() + .map(con -> con.getId()) + .collect(Collectors.toList()); + Result res = new Result(); + res.setId(ret.getId()); + List propagatedContexts = new ArrayList<>(); + for (String cId : communitySet) { + if (!contextList.contains(cId)) { + Context newContext = new Context(); + newContext.setId(cId); + newContext + .setDataInfo( + Arrays + .asList( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID, + PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME))); + propagatedContexts.add(newContext); + } + } + res.setContext(propagatedContexts); + ret.mergeFrom(res); + } + return ret; + }; + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java new file mode 100644 index 0000000000..4f5ac25521 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java @@ -0,0 +1,167 @@ + +package eu.dnetlib.dhp.resulttocommunityfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +public class PrepareResultCommunitySetStep1 { + private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class); + + private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')" + + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']" + + " and $x//CONFIGURATION/context/param[./@name='status']/text() 
!= 'hidden'" + + " return $x//CONFIGURATION/context/@id/string()"; + + /** + * associates to each result the set of community contexts they are associated to; associates to each target of a + * relation with allowed semantics the set of community context it could possibly inherit from the source of the + * relation + */ + // TODO + private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context " + + "from (select id, collect_set(co.id) community_context " + + " from result " + + " lateral view explode (context) c as co " + + " where datainfo.deletedbyinference = false %s group by id) p " + + " JOIN " + + " (select source, target from relation " + + " where datainfo.deletedbyinference = false %s ) r ON p.id = r.source"; + + /** + * a dataset for example could be linked to more than one publication. For each publication linked to that dataset + * the previous query will produce a row: targetId set of community context the target could possibly inherit with + * the following query there will be a single row for each result linked to more than one result of the result type + * currently being used + */ + // TODO + private static final String RESULT_COMMUNITY_LIST_QUERY = "select resultId , collect_set(co) communityList " + + "from result_context " + + "lateral view explode (community_context) c as co " + + "where length(co) > 0 " + + "group by resultId"; + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultCommunitySetStep1.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); + log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); + + final String isLookupUrl = parser.get("isLookUpUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final List communityIdList = getCommunityList(isLookupUrl); + log.info("communityIdList: {}", new Gson().toJson(communityIdList)); + + final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); + log.info("resultType: {}", resultType); + + Class resultClazz = (Class) Class.forName(resultClassName); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + prepareInfo( + spark, + inputPath, + outputPath, + allowedsemrel, + resultClazz, + resultType, + communityIdList); + }); + } + + private static void prepareInfo( + SparkSession spark, + String inputPath, + String outputPath, + List allowedsemrel, + Class resultClazz, + String resultType, + List communityIdList) { + + final String inputResultPath = inputPath + "/" + resultType; + log.info("Reading Graph table from: {}", inputResultPath); + + 
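To make the template above more concrete, the sketch below prints an instantiated version of RESULT_CONTEXT_QUERY_TEMPLATE. The community ids and relation classes are sample values, and the two constraint fragments assume the OR-ed expansion attributed to getConstraintList earlier (an assumption, since that helper is not part of this diff); the template string itself is copied from the constant above, with the original indentation condensed.

// Illustration only: prints an instantiated RESULT_CONTEXT_QUERY_TEMPLATE
// with sample community ids and semantic relations.
public class ResultContextQuerySketch {

	// template copied from PrepareResultCommunitySetStep1 above (whitespace condensed)
	static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
		+ "from (select id, collect_set(co.id) community_context "
		+ " from result "
		+ " lateral view explode (context) c as co "
		+ " where datainfo.deletedbyinference = false %s group by id) p "
		+ " JOIN "
		+ " (select source, target from relation "
		+ " where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";

	public static void main(String[] args) {
		// assumed shape of the getConstraintList fragments; ids and classes are samples
		String communityConstraint = " and ( co.id = 'covid-19' OR  co.id = 'dh-ch')";
		String semrelConstraint = " and ( relClass = 'isSupplementedBy' OR  relClass = 'isSupplementTo')";
		System.out.println(String.format(RESULT_CONTEXT_QUERY_TEMPLATE, communityConstraint, semrelConstraint));
	}
}

RESULT_COMMUNITY_LIST_QUERY then groups the exploded community_context values back into a single communityList per resultId, which is what gets written out as ResultCommunityList.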
final String inputRelationPath = inputPath + "/relation"; + log.info("Reading relation table from: {}", inputResultPath); + + Dataset relation = readPath(spark, inputRelationPath, Relation.class); + relation.createOrReplaceTempView("relation"); + + Dataset result = readPath(spark, inputResultPath, resultClazz); + result.createOrReplaceTempView("result"); + + final String outputResultPath = outputPath + "/" + resultType; + log.info("writing output results to: {}", outputResultPath); + + String resultContextQuery = String + .format( + RESULT_CONTEXT_QUERY_TEMPLATE, + getConstraintList(" co.id = '", communityIdList), + getConstraintList(" relClass = '", allowedsemrel)); + + Dataset result_context = spark.sql(resultContextQuery); + result_context.createOrReplaceTempView("result_context"); + + spark + .sql(RESULT_COMMUNITY_LIST_QUERY) + .as(Encoders.bean(ResultCommunityList.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(outputResultPath); + } + + public static List getCommunityList(final String isLookupUrl) throws ISLookUpException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + return isLookUp.quickSearchProfile(COMMUNITY_LIST_XQUERY); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java new file mode 100644 index 0000000000..723aa89606 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java @@ -0,0 +1,101 @@ + +package eu.dnetlib.dhp.resulttocommunityfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import scala.Tuple2; + +public class PrepareResultCommunitySetStep2 { + private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep2.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareResultCommunitySetStep2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + 
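+ // test runs only: wipe any previous output before mergeInfo writes the merged community lists to outputPath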
removeOutputDir(spark, outputPath); + } + mergeInfo(spark, inputPath, outputPath); + }); + } + + private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) { + + Dataset resultOrcidAssocCommunityList = readPath( + spark, inputPath + "/publication", ResultCommunityList.class) + .union(readPath(spark, inputPath + "/dataset", ResultCommunityList.class)) + .union(readPath(spark, inputPath + "/otherresearchproduct", ResultCommunityList.class)) + .union(readPath(spark, inputPath + "/software", ResultCommunityList.class)); + + resultOrcidAssocCommunityList + .toJavaRDD() + .mapToPair(r -> new Tuple2<>(r.getResultId(), r)) + .reduceByKey( + (a, b) -> { + if (a == null) { + return b; + } + if (b == null) { + return a; + } + Set community_set = new HashSet<>(); + a.getCommunityList().stream().forEach(aa -> community_set.add(aa)); + b + .getCommunityList() + .stream() + .forEach( + aa -> { + if (!community_set.contains(aa)) { + a.getCommunityList().add(aa); + community_set.add(aa); + } + }); + return a; + }) + .map(c -> c._2()) + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath, GzipCodec.class); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java new file mode 100644 index 0000000000..0c613d1b4a --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java @@ -0,0 +1,143 @@ + +package eu.dnetlib.dhp.resulttocommunityfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.oaf.*; +import scala.Tuple2; + +public class SparkResultToCommunityThroughSemRelJob { + + private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityThroughSemRelJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkResultToCommunityThroughSemRelJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String preparedInfoPath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", preparedInfoPath); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", 
parser.get("hive_metastore_uris")); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); + + Class resultClazz = (Class) Class.forName(resultClassName); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + if (saveGraph) { + execPropagation( + spark, inputPath, outputPath, preparedInfoPath, resultClazz); + } + }); + } + + private static void execPropagation( + SparkSession spark, + String inputPath, + String outputPath, + String preparedInfoPath, + Class resultClazz) { + + Dataset possibleUpdates = readPath(spark, preparedInfoPath, ResultCommunityList.class); + Dataset result = readPath(spark, inputPath, resultClazz); + + result + .joinWith( + possibleUpdates, + result.col("id").equalTo(possibleUpdates.col("resultId")), + "left_outer") + .map(contextUpdaterFn(), Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static MapFunction, R> contextUpdaterFn() { + return (MapFunction, R>) value -> { + R ret = value._1(); + Optional rcl = Optional.ofNullable(value._2()); + if (rcl.isPresent()) { + Set context_set = new HashSet<>(); + ret.getContext().stream().forEach(c -> context_set.add(c.getId())); + List contextList = rcl + .get() + .getCommunityList() + .stream() + .map( + c -> { + if (!context_set.contains(c)) { + Context newContext = new Context(); + newContext.setId(c); + newContext + .setDataInfo( + Arrays + .asList( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, + PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME))); + return newContext; + } + return null; + }) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + Result r = new Result(); + r.setId(ret.getId()); + r.setContext(contextList); + ret.mergeFrom(r); + } + + return ret; + }; + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java new file mode 100644 index 0000000000..e6b13dfa42 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; + +import java.io.Serializable; + +public class DatasourceOrganization implements Serializable { + + private String datasourceId; + private String organizationId; + + public String getDatasourceId() { + return datasourceId; + } + + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + } + + public String getOrganizationId() { + return organizationId; + } + + public void setOrganizationId(String organizationId) { + this.organizationId = organizationId; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java new file mode 100644 index 0000000000..f8fe1668fd --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -0,0 +1,122 @@ + +package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareResultInstRepoAssociation { + + private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareResultInstRepoAssociation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath"); + log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath); + + final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + readNeededResources(spark, inputPath); + prepareDatasourceOrganization(spark, datasourceOrganizationPath); + prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); + }); + } + + private static void prepareAlreadyLinkedAssociation( + SparkSession spark, String alreadyLinkedPath) { + String query = "Select source resultId, collect_set(target) organizationSet " + + "from relation " + + "where datainfo.deletedbyinference = false " + + "and relClass = '" + + RELATION_RESULT_ORGANIZATION_REL_CLASS + + "' " + + "group by source"; + + spark + .sql(query) + .as(Encoders.bean(ResultOrganizationSet.class)) + // TODO retry to stick with datasets + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); + } + + private static void readNeededResources(SparkSession spark, String inputPath) { + Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class); + datasource.createOrReplaceTempView("datasource"); + + Dataset relation = readPath(spark, inputPath + "/relation", Relation.class); + relation.createOrReplaceTempView("relation"); + + Dataset organization = readPath(spark, inputPath + "/organization", Organization.class); + organization.createOrReplaceTempView("organization"); + } + + private 
static void prepareDatasourceOrganization( + SparkSession spark, String datasourceOrganizationPath) { + + String query = "SELECT source datasourceId, target organizationId " + + "FROM ( SELECT id " + + "FROM datasource " + + "WHERE datasourcetype.classid = '" + + INSTITUTIONAL_REPO_TYPE + + "' " + + "AND datainfo.deletedbyinference = false ) d " + + "JOIN ( SELECT source, target " + + "FROM relation " + + "WHERE relclass = '" + + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + "' " + + "AND datainfo.deletedbyinference = false ) rel " + + "ON d.id = rel.source "; + + spark + .sql(query) + .as(Encoders.bean(DatasourceOrganization.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(datasourceOrganizationPath); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java new file mode 100644 index 0000000000..3bce14cdbf --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; + +import java.io.Serializable; +import java.util.ArrayList; + +public class ResultOrganizationSet implements Serializable { + private String resultId; + private ArrayList organizationSet; + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public ArrayList getOrganizationSet() { + return organizationSet; + } + + public void setOrganizationSet(ArrayList organizationSet) { + this.organizationSet = organizationSet; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java new file mode 100644 index 0000000000..86634d43fc --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -0,0 +1,193 @@ + +package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + +import java.util.*; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.broadcast.Broadcast; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; +import scala.Tuple2; + +public class SparkResultToOrganizationFromIstRepoJob { + + private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromIstRepoJob.class); + + private static final String RESULT_ORGANIZATIONSET_QUERY = "SELECT id resultId, collect_set(organizationId) organizationSet " + + "FROM ( SELECT id, organizationId " + + "FROM rels " + + "JOIN cfhb " + + " ON cf = datasourceId " + + "UNION ALL " + + "SELECT id , organizationId " + + "FROM rels " + + "JOIN cfhb " + + " ON 
hb = datasourceId ) tmp " + + "GROUP BY id"; + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkResultToOrganizationFromIstRepoJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String datasourceorganization = parser.get("datasourceOrganizationPath"); + log.info("datasourceOrganizationPath: {}", datasourceorganization); + + final String alreadylinked = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath: {}", alreadylinked); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + if (saveGraph) + execPropagation( + spark, + datasourceorganization, + alreadylinked, + inputPath, + outputPath, + resultClazz); + }); + } + + private static void execPropagation( + SparkSession spark, + String datasourceorganization, + String alreadyLinkedPath, + String inputPath, + String outputPath, + Class clazz) { + + Dataset ds_org = readPath(spark, datasourceorganization, DatasourceOrganization.class); + + Dataset potentialUpdates = getPotentialRelations(spark, inputPath, clazz, ds_org); + + Dataset alreadyLinked = readPath(spark, alreadyLinkedPath, ResultOrganizationSet.class); + + potentialUpdates + .joinWith( + alreadyLinked, + potentialUpdates.col("resultId").equalTo(alreadyLinked.col("resultId")), + "left_outer") + .flatMap(createRelationFn(), Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(outputPath); + } + + private static FlatMapFunction, Relation> createRelationFn() { + return (FlatMapFunction, Relation>) value -> { + List new_relations = new ArrayList<>(); + ResultOrganizationSet potential_update = value._1(); + Optional already_linked = Optional.ofNullable(value._2()); + List organization_list = potential_update.getOrganizationSet(); + if (already_linked.isPresent()) { + already_linked + .get() + .getOrganizationSet() + .stream() + .forEach( + rId -> { + if (organization_list.contains(rId)) { + organization_list.remove(rId); + } + }); + } + String resultId = potential_update.getResultId(); + organization_list + .stream() + .forEach( + orgId -> { + new_relations + .add( + getRelation( + orgId, + resultId, + RELATION_ORGANIZATION_RESULT_REL_CLASS, + RELATION_RESULTORGANIZATION_REL_TYPE, + RELATION_RESULTORGANIZATION_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, + 
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); + new_relations + .add( + getRelation( + resultId, + orgId, + RELATION_RESULT_ORGANIZATION_REL_CLASS, + RELATION_RESULTORGANIZATION_REL_TYPE, + RELATION_RESULTORGANIZATION_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); + }); + return new_relations.iterator(); + }; + } + + private static Dataset getPotentialRelations( + SparkSession spark, + String inputPath, + Class resultClazz, + Dataset ds_org) { + + Dataset result = readPath(spark, inputPath, resultClazz); + result.createOrReplaceTempView("result"); + createCfHbforResult(spark); + + ds_org.createOrReplaceTempView("rels"); + + return spark + .sql(RESULT_ORGANIZATIONSET_QUERY) + .as(Encoders.bean(ResultOrganizationSet.class)); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json new file mode 100644 index 0000000000..a37d7d168b --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json @@ -0,0 +1,51 @@ +[ + { + "paramName":"is", + "paramLongName":"isLookUpUrl", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "pm", + "paramLongName":"pathMap", + "paramDescription": "the json path associated to each selection field", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "test", + "paramLongName": "isTest", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "tg", + "paramLongName": "taggingConf", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml new file mode 100644 index 0000000000..fe82ae1940 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml @@ -0,0 +1,54 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + 
/user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml new file mode 100644 index 0000000000..754aba4f29 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -0,0 +1,216 @@ + + + + sourcePath + the source path + + + isLookUpUrl + the isLookup service endpoint + + + pathMap + the json path associated to each selection field + + + outputPath + the output path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + bulkTagging-publication + eu.dnetlib.dhp.bulktag.SparkBulkTagJob + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/publication + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/publication + --pathMap${pathMap} + --isLookUpUrl${isLookUpUrl} + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + bulkTagging-dataset + eu.dnetlib.dhp.bulktag.SparkBulkTagJob + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/dataset + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/dataset + --pathMap${pathMap} + --isLookUpUrl${isLookUpUrl} + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + bulkTagging-orp + eu.dnetlib.dhp.bulktag.SparkBulkTagJob + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + 
--sourcePath${sourcePath}/otherresearchproduct + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/otherresearchproduct + --pathMap${pathMap} + --isLookUpUrl${isLookUpUrl} + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + bulkTagging-software + eu.dnetlib.dhp.bulktag.SparkBulkTagJob + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/software + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/software + --pathMap${pathMap} + --isLookUpUrl${isLookUpUrl} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json new file mode 100644 index 0000000000..984b40774e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json @@ -0,0 +1,44 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": false + }, + { + "paramName":"sg", + "paramLongName":"saveGraph", + "paramDescription": "true if the new version of the graph must be saved", + "paramRequired": false + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json new file mode 100644 index 0000000000..95d4c1c607 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output 
files", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "whitelist", + "paramDescription": "the datasource having a type different from the allowed ones but that we want to add anyway", + "paramRequired": true + }, + { + "paramName": "at", + "paramLongName": "allowedtypes", + "paramDescription": "the allowed datasource types for country propagation", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json new file mode 100644 index 0000000000..5efa3dbd63 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"out", + "paramLongName":"outputPath", + "paramDescription": "the output path", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml new file mode 100644 index 0000000000..2744ea92ba --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml new file mode 100644 index 0000000000..fc877071df --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml @@ -0,0 +1,376 @@ + + + + sourcePath + the source path + + + whitelist + the white list + + + allowedtypes + the allowed types + + + outputPath + the output path + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + yarn + cluster + PrepareDatasourceCountryAssociation + eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath} + --whitelist${whitelist} + --allowedtypes${allowedtypes} + --hive_metastore_uris${hive_metastore_uris} + --outputPath${workingDir}/preparedInfo + + + + + + + + + + + + + + + yarn + cluster + prepareResultCountry-Publication + eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/publication + --outputPath${workingDir}/publication + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + prepareResultCountry-Dataset + eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/dataset + --outputPath${workingDir}/dataset + --hive_metastore_uris${hive_metastore_uris} + 
--resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + prepareResultCountry-ORP + eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/otherresearchproduct + --outputPath${workingDir}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + prepareResultCountry-Software + eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/software + --outputPath${workingDir}/software + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + + + + + + + + + + yarn + cluster + countryPropagationForPublications + eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/publication + --preparedInfoPath${workingDir}/publication + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/publication + + + + + + + + yarn + cluster + countryPropagationForDataset + eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf 
spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/dataset + --preparedInfoPath${workingDir}/dataset + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/dataset + + + + + + + + yarn + cluster + countryPropagationForORP + eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/otherresearchproduct + --preparedInfoPath${workingDir}/otherresearchproduct + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/otherresearchproduct + + + + + + + + yarn + cluster + countryPropagationForSoftware + eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath}/software + --preparedInfoPath${workingDir}/software + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/software + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json new file mode 100644 index 0000000000..d8aa7eb9a9 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json @@ -0,0 +1,50 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"sg", + "paramLongName":"saveGraph", + "paramDescription": "true if the new version of the graph must be saved", + "paramRequired": false + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": 
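The four countryPropagation* actions all run eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob and differ only in the result-type specific values they receive; from the flattened arguments:

    result type               path suffix               --resultTableName
    publication               publication               eu.dnetlib.dhp.schema.oaf.Publication
    dataset                   dataset                   eu.dnetlib.dhp.schema.oaf.Dataset
    other research product    otherresearchproduct      eu.dnetlib.dhp.schema.oaf.OtherResearchProduct
    software                  software                  eu.dnetlib.dhp.schema.oaf.Software

The same per-type pattern repeats in the other enrichment workflows added by this patch.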
"isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName":"pu", + "paramLongName":"possibleUpdatesPath", + "paramDescription": "the path the the association resultId orcid author list can be found", + "paramRequired": true + }, + { + "paramName":"test", + "paramLongName":"isTest", + "paramDescription": "true if it is executing a test", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json new file mode 100644 index 0000000000..08648d61a1 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"as", + "paramLongName":"allowedsemrels", + "paramDescription": "the allowed sematinc relations for propagation", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json new file mode 100644 index 0000000000..1a67134a6b --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json @@ -0,0 +1,20 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml new file mode 100644 index 0000000000..8d2c341057 --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml new file mode 100644 index 0000000000..e4429b710c --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml @@ -0,0 +1,372 @@ + + + + sourcePath + the source path + + + allowedsemrels + the semantic relationships allowed for propagation + + + outputPath + the output path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + + + + + + + + + yarn + cluster + ORCIDPropagation-PreparePhase1-Publications + eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} + + + + + + + + yarn + cluster + ORCIDPropagation-PreparePhase1-Dataset + eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} + + + + + + + + yarn + cluster + ORCIDPropagation-PreparePhase1-ORP + eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} + + + + + + + + yarn + cluster + ORCIDPropagation-PreparePhase1-Software + eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} + + + + + + + + + + yarn + cluster + ORCIDPropagation-PreparePhase2 + eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${workingDir}/preparedInfo/targetOrcidAssoc + --outputPath${workingDir}/preparedInfo/mergedOrcidAssoc + + + + + + + + + + + + + + + yarn + cluster + ORCIDPropagation-Publication + eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf 
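The fork above runs PrepareResultOrcidAssociationStep1 once per result type, with all four instances writing under ${workingDir}/preparedInfo/targetOrcidAssoc; PrepareResultOrcidAssociationStep2 then folds those partial outputs into the single mergedOrcidAssoc dataset consumed by the propagation actions. A rough Spark sketch of that merge idea (the resultId and authorList field names are assumptions, and this is not the project's actual Step2 implementation):

    import static org.apache.spark.sql.functions.*;

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class MergeOrcidAssocSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().appName("MergeOrcidAssocSketch").getOrCreate();
            String workingDir = args[0];

            // All per-type outputs live under the same directory, so a single read unions them
            Dataset<Row> assoc = spark.read().json(workingDir + "/preparedInfo/targetOrcidAssoc");

            // One row per result, with every candidate ORCID author collected together
            Dataset<Row> merged = assoc
                .groupBy(col("resultId"))
                .agg(flatten(collect_list(col("authorList"))).as("authorList"));

            merged.write().mode(SaveMode.Overwrite).json(workingDir + "/preparedInfo/mergedOrcidAssoc");
        }
    }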
spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=3840 + + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/publication + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/publication + --saveGraph${saveGraph} + + + + + + + yarn + cluster + ORCIDPropagation-Dataset + eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/dataset + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/dataset + --saveGraph${saveGraph} + + + + + + + yarn + cluster + ORCIDPropagation-ORP + eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/otherresearchproduct + --saveGraph${saveGraph} + + + + + + + yarn + cluster + ORCIDPropagation-Software + eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + 
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/software + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/software + --saveGraph${saveGraph} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json new file mode 100644 index 0000000000..a70dbd6a08 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json @@ -0,0 +1,33 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + + { + "paramName":"asr", + "paramLongName":"allowedsemrels", + "paramDescription": "the types of the allowed datasources. Split by ;", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName":"pu", + "paramLongName":"potentialUpdatePath", + "paramDescription": "the path of the potential updates ", + "paramRequired": true + }, + { + "paramName":"al", + "paramLongName":"alreadyLinkedPath", + "paramDescription": "the path of the already linked project result_set", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json new file mode 100644 index 0000000000..7f44ba03cd --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json @@ -0,0 +1,44 @@ +[ + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName":"sg", + "paramLongName":"saveGraph", + "paramDescription": "true if the new version of the graph must be saved", + "paramRequired": false + }, + { + "paramName":"pu", + "paramLongName":"potentialUpdatePath", + "paramDescription": "the path of the potential updates ", + "paramRequired": true + }, + { + "paramName":"al", + "paramLongName":"alreadyLinkedPath", + "paramDescription": "the path of the already linked project result_set", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "test", + "paramLongName": "isTest", + "paramDescription": "true if it is a test running", + "paramRequired": false + } +] \ No newline at end of file diff --git 
a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml new file mode 100644 index 0000000000..caf3c60500 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml @@ -0,0 +1,63 @@ + + + jobTracker + yarnRM + + + + nameNode + + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml new file mode 100644 index 0000000000..24e1d3b7fa --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml @@ -0,0 +1,187 @@ + + + + sourcePath + the source path + + + allowedsemrels + the allowed semantics + + + outputPath + the output path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/publication + ${nameNode}/${outputPath}/publication + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/dataset + ${nameNode}/${outputPath}/dataset + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/otherresearchproduct + ${nameNode}/${outputPath}/otherresearchproduct + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/software + ${nameNode}/${outputPath}/software + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/relation + --allowedsemrels${allowedsemrels} + --hive_metastore_uris${hive_metastore_uris} + --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates + 
--alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --saveGraph${saveGraph} + --hive_metastore_uris${hive_metastore_uris} + --outputPath${outputPath}/relation + --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json new file mode 100644 index 0000000000..eebc1a0cac --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json @@ -0,0 +1,51 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName":"sg", + "paramLongName":"saveGraph", + "paramDescription": "true if the new version of the graph must be saved", + "paramRequired": false + }, + { + "paramName":"test", + "paramLongName":"isTest", + "paramDescription": "true if it is executing a test", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": true + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json new file mode 100644 index 0000000000..8df509abff --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json @@ -0,0 +1,33 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"ocm", + 
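In the project-to-result workflow, PrepareProjectResultsAssociation materialises two intermediate datasets, potentialUpdates and alreadyLinked, and SparkResultToProjectThroughSemRelJob should then add only the project relations that are not in the graph yet. The core of that step is an anti-join; a hedged sketch (the resultId and projectId column names are assumptions, not the actual schema):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class ProjectResultAntiJoinSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().appName("ProjectResultAntiJoinSketch").getOrCreate();

            Dataset<Row> potentialUpdates = spark.read().json(args[0]); // .../preparedInfo/potentialUpdates
            Dataset<Row> alreadyLinked = spark.read().json(args[1]);    // .../preparedInfo/alreadyLinked

            // Keep only candidate links that the graph does not contain already
            Dataset<Row> newLinks = potentialUpdates.join(
                alreadyLinked,
                potentialUpdates.col("resultId").equalTo(alreadyLinked.col("resultId"))
                    .and(potentialUpdates.col("projectId").equalTo(alreadyLinked.col("projectId"))),
                "left_anti");

            newLinks.write().json(args[2]); // e.g. ${outputPath}/relation
        }
    }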
"paramLongName":"organizationtoresultcommunitymap", + "paramDescription": "the map for the association organization communities", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml new file mode 100644 index 0000000000..2744ea92ba --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml new file mode 100644 index 0000000000..d481cad052 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml @@ -0,0 +1,239 @@ + + + + sourcePath + the source path + + + organizationtoresultcommunitymap + organization community map + + + outputPath + the output path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + yarn + cluster + Prepare-Community-Result-Organization + eu.dnetlib.dhp.resulttocommunityfromorganization.PrepareResultCommunitySet + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath}/relation + --hive_metastore_uris${hive_metastore_uris} + --outputPath${workingDir}/preparedInfo/resultCommunityList + --organizationtoresultcommunitymap${organizationtoresultcommunitymap} + + + + + + + + + + + + + + + yarn + cluster + community2resultfromorganization-Publication + eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList + --sourcePath${sourcePath}/publication + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/publication + --saveGraph${saveGraph} + + + + + + + + yarn + cluster + community2resultfromorganization-Dataset + eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList + --sourcePath${sourcePath}/dataset + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/dataset + --saveGraph${saveGraph} + + + + + + + + yarn + cluster + community2resultfromorganization-ORP + eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList + --sourcePath${sourcePath}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/otherresearchproduct + --saveGraph${saveGraph} + + + + + + + + yarn + cluster + community2resultfromorganization-Software + 
eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList + --sourcePath${sourcePath}/software + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/software + --saveGraph${saveGraph} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json new file mode 100644 index 0000000000..a40ce375ee --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json @@ -0,0 +1,52 @@ +[ + + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"sg", + "paramLongName":"saveGraph", + "paramDescription": "true if the new version of the graph must be saved", + "paramRequired": false + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": true + }, + { + "paramName":"test", + "paramLongName":"isTest", + "paramDescription": "true if it is executing a test", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json new file mode 100644 index 0000000000..3ba3c8e9c7 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, 
false otherwise", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json new file mode 100644 index 0000000000..8c99da673c --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json @@ -0,0 +1,44 @@ +[ + { + "paramName":"is", + "paramLongName":"isLookUpUrl", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"as", + "paramLongName":"allowedsemrels", + "paramDescription": "the allowed semantic relations for propagation", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml new file mode 100644 index 0000000000..2744ea92ba --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml new file mode 100644 index 0000000000..81b51443c6 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml @@ -0,0 +1,366 @@ + + + + sourcePath + the source path 
+ + + allowedsemrels + the semantic relationships allowed for propagation + + + isLookUpUrl + the isLookup service endpoint + + + outputPath + the output path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-Publications + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookUpUrl${isLookUpUrl} + + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-Dataset + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookUpUrl${isLookUpUrl} + + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-ORP + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + 
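Each PreparePhase1 action of this workflow receives --allowedsemrels (besides --isLookUpUrl) and should only follow relations whose semantics appear in that list when collecting the communities to propagate. A sketch of that filtering step; the relClass column, the JSON relation layout and the ';' separator are assumptions:

    import static org.apache.spark.sql.functions.*;

    import java.util.Arrays;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class AllowedSemRelFilterSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().appName("AllowedSemRelFilterSketch").getOrCreate();

            // e.g. args[1] = "isSupplementedBy;isSupplementTo"
            Object[] allowed = Arrays.stream(args[1].split(";"))
                .map(String::toLowerCase)
                .toArray();

            Dataset<Row> relations = spark.read().json(args[0] + "/relation");
            Dataset<Row> kept = relations.filter(lower(col("relClass")).isin(allowed));

            System.out.println("relations kept for propagation: " + kept.count());
        }
    }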
--outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookUpUrl${isLookUpUrl} + + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-Software + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookUpUrl${isLookUpUrl} + + + + + + + + + + yarn + cluster + ResultToCommunityEmRelPropagation-PreparePhase2 + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${workingDir}/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/preparedInfo/mergedCommunityAssoc + + + + + + + + + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-Publication + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/publication + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/publication + --saveGraph${saveGraph} + + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-Dataset + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf 
spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/dataset + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/dataset + --saveGraph${saveGraph} + + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-ORP + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/otherresearchproduct + --saveGraph${saveGraph} + + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-Software + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/software + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/software + --saveGraph${saveGraph} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json new file mode 100644 index 0000000000..c744963503 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName":"dop", + "paramLongName":"datasourceOrganizationPath", + "paramDescription": "path where to store/find association from datasource and organization", + "paramRequired": true + }, + { + "paramName":"alp", + "paramLongName":"alreadyLinkedPath", + "paramDescription": "path where to store/find already linked results and 
organizations", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json new file mode 100644 index 0000000000..d2b076c827 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json @@ -0,0 +1,56 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hive_metastore_uris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName":"sg", + "paramLongName":"saveGraph", + "paramDescription": "true if the new version of the graph must be saved", + "paramRequired": false + }, + { + "paramName":"dop", + "paramLongName":"datasourceOrganizationPath", + "paramDescription": "path where to store/find association from datasource and organization", + "paramRequired": true + }, + { + "paramName":"alp", + "paramLongName":"alreadyLinkedPath", + "paramDescription": "path where to store/find already linked results and organizations", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + }, + { + "paramName": "test", + "paramLongName": "isTest", + "paramDescription": "true if it is a test running", + "paramRequired": false + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml new file mode 100644 index 0000000000..2744ea92ba --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + \ No newline at end of file diff --git 
a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml new file mode 100644 index 0000000000..a1b7f4ad79 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -0,0 +1,284 @@ + + + + sourcePath + the source path + + + outputPath + sets the outputPath + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/publication + ${nameNode}/${outputPath}/publication + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/dataset + ${nameNode}/${outputPath}/dataset + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/otherresearchproduct + ${nameNode}/${outputPath}/otherresearchproduct + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/software + ${nameNode}/${outputPath}/software + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + yarn + cluster + PrepareResultOrganizationAssociation + eu.dnetlib.dhp.resulttoorganizationfrominstrepo.PrepareResultInstRepoAssociation + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + + + + + + + + yarn + cluster + resultToOrganizationFromInstRepoPropagationForPublications + eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath}/publication + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/relation + --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + yarn + cluster 
+ resultToOrganizationFromInstRepoPropagationForDataset + eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath}/dataset + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/relation + --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + yarn + cluster + resultToOrganizationFromInstRepoPropagationForORP + eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/relation + --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + yarn + cluster + resultToOrganizationFromInstRepoPropagationForSoftware + eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath}/software + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/relation + --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java new file mode 100644 index 0000000000..72e0a63fa2 --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java @@ -0,0 +1,772 @@ + +package eu.dnetlib.dhp.bulktag; + +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class BulkTagJobTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static final String MOCK_IS_LOOK_UP_URL = "BASEURL:8280/is/services/isLookUp"; + + public static final String pathMap = "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}"; + + private static SparkSession spark; + + private static Path workingDir; + + private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class); + + private static String taggingConf = ""; + + static { + try { + taggingConf = IOUtils + .toString( + BulkTagJobTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml")); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(BulkTagJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(BulkTagJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(BulkTagJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void noUpdatesTest() throws Exception { + final String pathMap = BulkTagJobTest.pathMap; + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", + getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates").getPath(), + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = 
JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + Assertions.assertEquals(0, spark.sql(query).count()); + } + + @Test + public void bulktagBySubjectNoPreviousContextTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext") + .getPath(); + final String pathMap = BulkTagJobTest.pathMap; + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + Assertions.assertEquals(5, spark.sql(query).count()); + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions + .assertEquals( + 5, idExplodeCommunity.filter("provenance = 'community:subject'").count()); + Assertions + .assertEquals( + 5, + idExplodeCommunity.filter("name = 'Bulktagging for Community - Subject'").count()); + + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count()); + + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") + .count()); + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter( + "community = 'covid-19' and id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") + .count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter("id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'") + .count()); + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "(community = 'covid-19' or community = 'aginfra') and id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'") + .count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter("id = 
'50|od______3989::0f89464c4ac4c398fe0c71433b175a62'") + .count()); + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "(community = 'mes' or community = 'fam') and id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'") + .count()); + } + + @Test + public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception { + final String sourcePath = getClass() + .getResource( + "/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance") + .getPath(); + final String pathMap = BulkTagJobTest.pathMap; + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyT.id = 'covid-19' "; + + Assertions.assertEquals(3, spark.sql(query).count()); + + org.apache.spark.sql.Dataset communityContext = spark.sql(query); + + Assertions + .assertEquals( + 2, + communityContext + .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") + .count()); + Assertions + .assertEquals( + 1, + communityContext + .filter( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'community:subject'") + .count()); + Assertions + .assertEquals( + 1, + communityContext + .filter( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'propagation:community:productsthroughsemrel'") + .count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from dataset " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + Assertions + .assertEquals( + 2, + spark + .sql(query) + .select("datainfosize") + .where( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' a" + + "nd community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + } + + @Test + public void bulktagByDatasourceTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource") + .getPath(); + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-outputPath", workingDir.toString() + "/publication", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/publication") + .map(item -> OBJECT_MAPPER.readValue(item, Publication.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset 
verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Publication.class)); + + verificationDataset.createOrReplaceTempView("publication"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from publication " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + + Assertions.assertEquals(5, idExplodeCommunity.count()); + Assertions + .assertEquals( + 5, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + Assertions + .assertEquals( + 5, + idExplodeCommunity + .filter("name = 'Bulktagging for Community - Datasource'") + .count()); + + Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count()); + + Assertions + .assertEquals( + 3, + idExplodeCommunity + .filter( + "community = 'fam' and (id = '50|ec_fp7health::000085c89f4b96dc2269bd37edb35306' " + + "or id = '50|ec_fp7health::000b9e61f83f5a4b0c35777b7bccdf38' " + + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')") + .count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "community = 'aginfra' and (id = '50|ec_fp7health::000c8195edd542e4e64ebb32172cbf89' " + + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')") + .count()); + } + + @Test + public void bulktagByZenodoCommunityTest() throws Exception { + final String sourcePath = getClass() + .getResource( + "/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity") + .getPath(); + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct", + "-outputPath", workingDir.toString() + "/orp", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/orp") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(OtherResearchProduct.class)); + + verificationDataset.createOrReplaceTempView("orp"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from orp " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions.assertEquals(8, idExplodeCommunity.count()); + + Assertions + .assertEquals( + 8, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count()); + Assertions + .assertEquals( + 8, + idExplodeCommunity.filter("name = 'Bulktagging for Community - Zenodo'").count()); + + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'beopen'").count()); 
+ Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'mes'").count()); + + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter( + "id = '50|od______2017::0750a4d0782265873d669520f5e33c07' " + + "and community = 'covid-19'") + .count()); + Assertions + .assertEquals( + 3, + idExplodeCommunity + .filter( + "id = '50|od______2017::1bd97baef19dbd2db3203b112bb83bc5' and " + + "(community = 'aginfra' or community = 'mes' or community = 'fam')") + .count()); + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter( + "id = '50|od______2017::1e400f1747487fd15998735c41a55c72' " + + "and community = 'beopen'") + .count()); + Assertions + .assertEquals( + 3, + idExplodeCommunity + .filter( + "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' and " + + "(community = 'beopen' or community = 'fam' or community = 'mes')") + .count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from orp " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + Assertions + .assertEquals( + 2, + spark + .sql(query) + .select("datainfosize") + .where( + "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' a" + + "nd community = 'beopen'") + .collectAsList() + .get(0) + .getInt(0)); + + // verify the zenodo community context is not present anymore in the records + query = "select id, MyT.id community " + + "from orp " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD "; + + org.apache.spark.sql.Dataset tmp2 = spark.sql(query); + + Assertions + .assertEquals( + 0, + tmp2 + .select("community") + .where(tmp2.col("community").contains(ZENODO_COMMUNITY_INDICATOR)) + .count()); + } + + @Test + public void bulktagBySubjectDatasourceTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource") + .getPath(); + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions.assertEquals(7, idExplodeCommunity.count()); + + Assertions + .assertEquals( + 5, idExplodeCommunity.filter("provenance = 'community:subject'").count()); + Assertions + .assertEquals( + 2, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + Assertions.assertEquals(2, 
idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from dataset " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + org.apache.spark.sql.Dataset tmp2 = spark.sql(query); + + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and " + + "community = 'aginfra'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 1, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and " + + "community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and " + + "community = 'fam'") + .collectAsList() + .get(0) + .getInt(0)); + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and " + + "community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 1, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and " + + "community = 'fam'") + .collectAsList() + .get(0) + .getInt(0)); + Assertions + .assertEquals( + 1, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and " + + "community = 'mes'") + .collectAsList() + .get(0) + .getInt(0)); + } + + @Test + public void bulktagBySubjectDatasourceZenodoCommunityTest() throws Exception { + + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/software/").getPath(), + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-outputPath", workingDir.toString() + "/software", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/software") + .map(item -> OBJECT_MAPPER.readValue(item, Software.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Software.class)); + + verificationDataset.createOrReplaceTempView("software"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from software " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions.assertEquals(10, idExplodeCommunity.count()); + + idExplodeCommunity.show(false); + Assertions + .assertEquals( + 3, idExplodeCommunity.filter("provenance = 'community:subject'").count()); + Assertions + .assertEquals( + 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + Assertions + 
.assertEquals( + 4, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count()); + + Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dh-ch'").count()); + Assertions.assertEquals(4, idExplodeCommunity.filter("community = 'aginfra'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dariah'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "provenance = 'community:zenodocommunity' and " + + "id = '50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4' and (" + + "community = 'dh-ch' or community = 'dariah')") + .count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from software " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + org.apache.spark.sql.Dataset tmp2 = spark.sql(query); + + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______1582::501b25d420f808c8eddcd9b16e917f11' and " + + "community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 3, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______1582::581621232a561b7e8b4952b18b8b0e56' and " + + "community = 'aginfra'") + .collectAsList() + .get(0) + .getInt(0)); + } + + @Test + public void bulktagDatasourcewithConstraintsTest() throws Exception { + + final String sourcePath = getClass() + .getResource( + "/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints") + .getPath(); + SparkBulkTagJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + + idExplodeCommunity.show(false); + Assertions.assertEquals(3, idExplodeCommunity.count()); + + Assertions + .assertEquals( + 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/CommunityConfigurationFactoryTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/CommunityConfigurationFactoryTest.java new file mode 100644 index 0000000000..ca737b79f7 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/CommunityConfigurationFactoryTest.java @@ -0,0 +1,85 @@ + +package 
eu.dnetlib.dhp.bulktag; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.util.*; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.dom4j.DocumentException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration; +import eu.dnetlib.dhp.bulktag.community.CommunityConfigurationFactory; +import eu.dnetlib.dhp.bulktag.community.Constraint; +import eu.dnetlib.dhp.bulktag.community.SelectionConstraints; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; + +/** Created by miriam on 03/08/2018. */ +public class CommunityConfigurationFactoryTest { + + private final VerbResolver resolver = new VerbResolver(); + + @Test + public void parseTest() throws DocumentException, IOException { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + Assertions.assertEquals(5, cc.size()); + cc + .getCommunityList() + .forEach(c -> Assertions.assertTrue(StringUtils.isNoneBlank(c.getId()))); + } + + @Test + public void applyVerb() + throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, + InstantiationException { + Constraint sc = new Constraint(); + sc.setVerb("not_contains"); + sc.setField("contributor"); + sc.setValue("DARIAH"); + sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue())); + String metadata = "This work has been partially supported by DARIAH-EU infrastructure"; + Assertions.assertFalse(sc.verifyCriteria(metadata)); + } + + @Test + public void loadSelCriteriaTest() throws DocumentException, IOException { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + Map> param = new HashMap<>(); + param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi"))); + param + .put( + "description", + new ArrayList<>( + Collections + .singletonList( + "This work has been partially supported by DARIAH-EU infrastructure"))); + param + .put( + "contributor", + new ArrayList<>( + Collections + .singletonList( + "Author X helped to write the paper. 
X works for DARIAH"))); + List comm = cc + .getCommunityForDatasource( + "openaire____::1cfdb2e14977f31a98e0118283401f32", param); + Assertions.assertEquals(1, comm.size()); + Assertions.assertEquals("dariah", comm.get(0)); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java new file mode 100644 index 0000000000..88ad43b6be --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java @@ -0,0 +1,265 @@ + +package eu.dnetlib.dhp.countrypropagation; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Country; +import eu.dnetlib.dhp.schema.oaf.Software; +import scala.Tuple2; + +public class CountryPropagationJobTest { + + private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(CountryPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(CountryPropagationJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(CountryPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void testCountryPropagationSoftware() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/countrypropagation/sample/software") + .getPath(); + final String preparedInfoPath = getClass() + .getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo") + .getPath(); + SparkCountryPropagationJob + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", sourcePath, + "-saveGraph", "true", + "-resultTableName", Software.class.getCanonicalName(), + "-outputPath", workingDir.toString() + "/software", + "-preparedInfoPath", preparedInfoPath + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() 
+ "/software") + .map(item -> OBJECT_MAPPER.readValue(item, Software.class)); + + // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); + + Assertions.assertEquals(10, tmp.count()); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Software.class)); + + Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count()); + Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count()); + Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count()); + Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count()); + + Dataset countryExploded = verificationDs + .flatMap( + (FlatMapFunction) row -> row.getCountry().iterator(), Encoders.bean(Country.class)) + .map((MapFunction) c -> c.getClassid(), Encoders.STRING()); + + Assertions.assertEquals(9, countryExploded.count()); + + Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count()); + Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count()); + Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count()); + + Dataset> countryExplodedWithCountryclassid = verificationDs + .flatMap((FlatMapFunction>) row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), c.getClassid()))); + return prova.iterator(); + }, Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + + Assertions.assertEquals(9, countryExplodedWithCountryclassid.count()); + + countryExplodedWithCountryclassid.show(false); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ") + .count()); + + Dataset> countryExplodedWithCountryclassname = verificationDs + .flatMap( + (FlatMapFunction>) row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), + c.getClassname()))); + return prova.iterator(); + }, + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + + countryExplodedWithCountryclassname.show(false); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassname + .filter( + "_1 = 
'50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'Italy' or _2 = 'Mexico') ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ") + .count()); + + Dataset> countryExplodedWithCountryProvenance = verificationDs + .flatMap( + (FlatMapFunction>) row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), + c + .getDataInfo() + .getInferenceprovenance()))); + return prova.iterator(); + }, + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + + Assertions + .assertEquals( + 7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count()); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java new file mode 100644 index 0000000000..edd2e7ba73 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java @@ -0,0 +1,252 @@ + +package eu.dnetlib.dhp.orcidtoresultfromsemrel; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Dataset; + +public class OrcidPropagationJobTest { + + private static final Logger log = LoggerFactory.getLogger(OrcidPropagationJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(OrcidPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(OrcidPropagationJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", 
workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(OrcidPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void noUpdateTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate") + .getPath(); + final String possibleUpdatesPath = getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") + .getPath(); + SparkOrcidToResultFromSemRelJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-resultTableName", Dataset.class.getCanonicalName(), + "-outputPath", workingDir.toString() + "/dataset", + "-possibleUpdatesPath", possibleUpdatesPath + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); + + Assertions.assertEquals(10, tmp.count()); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP " + + "where MyP.datainfo.inferenceprovenance = 'propagation'"; + + Assertions.assertEquals(0, spark.sql(query).count()); + } + + @Test + public void oneUpdateTest() throws Exception { + SparkOrcidToResultFromSemRelJob + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-possibleUpdatesPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") + .getPath() + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); + + Assertions.assertEquals(10, tmp.count()); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP " + + "where MyP.datainfo.inferenceprovenance = 'propagation'"; + + org.apache.spark.sql.Dataset propagatedAuthors = spark.sql(query); + + Assertions.assertEquals(1, propagatedAuthors.count()); + + Assertions + .assertEquals( + 1, + propagatedAuthors + 
.filter( + "id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' " + + "and name = 'Vajinder' and surname = 'Kumar' and pidType = 'ORCID'") + .count()); + + Assertions.assertEquals(1, propagatedAuthors.filter("pid = '0000-0002-8825-3517'").count()); + } + + @Test + public void twoUpdatesTest() throws Exception { + SparkOrcidToResultFromSemRelJob + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-possibleUpdatesPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") + .getPath() + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP " + + "where MyP.datainfo.inferenceprovenance = 'propagation'"; + + org.apache.spark.sql.Dataset propagatedAuthors = spark.sql(query); + + Assertions.assertEquals(2, propagatedAuthors.count()); + + Assertions + .assertEquals( + 1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count()); + Assertions + .assertEquals( + 1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count()); + + query = "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP "; + + org.apache.spark.sql.Dataset authorsExplodedPids = spark.sql(query); + + Assertions + .assertEquals( + 2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count()); + Assertions + .assertEquals( + 1, + authorsExplodedPids + .filter( + "name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'") + .count()); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java new file mode 100644 index 0000000000..abed028e10 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java @@ -0,0 +1,248 @@ + +package eu.dnetlib.dhp.projecttoresult; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import 
org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class ProjectPropagationJobTest { + + private static final Logger log = LoggerFactory.getLogger(ProjectPropagationJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ProjectPropagationJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(ProjectPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + /** + * There are no new relations to be added. All the possible relations have already been linked with the project in + * the graph + * + * @throws Exception + */ + @Test + public void NoUpdateTest() throws Exception { + + final String potentialUpdateDate = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(); + SparkResultToProjectThroughSemRelJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-potentialUpdatePath", potentialUpdateDate, + "-alreadyLinkedPath", alreadyLinkedPath, + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(0, tmp.count()); + } + + /** + * All the possible updates will produce a new relation. 
No relations are already linked in the graph + * + * @throws Exception + */ + @Test + public void UpdateTenTest() throws Exception { + final String potentialUpdatePath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(); + SparkResultToProjectThroughSemRelJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-potentialUpdatePath", potentialUpdatePath, + "-alreadyLinkedPath", alreadyLinkedPath, + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + // 10 new relations expected: each of the 5 new associations adds both a "produces" and an "isProducedBy" relation + Assertions.assertEquals(10, tmp.count()); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + + Assertions.assertEquals(5, verificationDs.filter("relClass = 'produces'").count()); + Assertions.assertEquals(5, verificationDs.filter("relClass = 'isProducedBy'").count()); + + Assertions + .assertEquals( + 5, + verificationDs + .filter( + (FilterFunction) r -> r.getSource().startsWith("50") + && r.getTarget().startsWith("40") + && r.getRelClass().equals("isProducedBy")) + .count()); + Assertions + .assertEquals( + 5, + verificationDs + .filter( + (FilterFunction) r -> r.getSource().startsWith("40") + && r.getTarget().startsWith("50") + && r.getRelClass().equals("produces")) + .count()); + + verificationDs.createOrReplaceTempView("temporary"); + + Assertions + .assertEquals( + 10, + spark + .sql( + "Select * from temporary where datainfo.inferenceprovenance = 'propagation'") + .count()); + } + + /** + * One of the relations in the possible updates is already linked to the project in the graph. All the others are + * not. 
There will be 4 new associations leading to 8 new relations + * + * @throws Exception + */ + @Test + public void UpdateMixTest() throws Exception { + final String potentialUpdatePath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(); + SparkResultToProjectThroughSemRelJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-potentialUpdatePath", potentialUpdatePath, + "-alreadyLinkedPath", alreadyLinkedPath, + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + // JavaRDD tmp = sc.textFile("/tmp/relation") + // .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + // expects 8 new relations: each of the 4 new associations yields both "produces" and "isProducedBy" + Assertions.assertEquals(8, tmp.count()); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + + Assertions.assertEquals(4, verificationDs.filter("relClass = 'produces'").count()); + Assertions.assertEquals(4, verificationDs.filter("relClass = 'isProducedBy'").count()); + + Assertions + .assertEquals( + 4, + verificationDs + .filter( + (FilterFunction) r -> r.getSource().startsWith("50") + && r.getTarget().startsWith("40") + && r.getRelClass().equals("isProducedBy")) + .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + (FilterFunction) r -> r.getSource().startsWith("40") + && r.getTarget().startsWith("50") + && r.getRelClass().equals("produces")) + .count()); + + verificationDs.createOrReplaceTempView("temporary"); + + Assertions + .assertEquals( + 8, + spark + .sql( + "Select * from temporary where datainfo.inferenceprovenance = 'propagation'") + .count()); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java new file mode 100644 index 0000000000..d739516fc1 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java @@ -0,0 +1,323 @@ + +package eu.dnetlib.dhp.resulttocommunityfromorganization; + +import static org.apache.spark.sql.functions.*; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; +import eu.dnetlib.dhp.schema.oaf.Dataset; + +public class 
ResultToCommunityJobTest { + + private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ResultToCommunityJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(OrcidPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void testSparkResultToCommunityFromOrganizationJob() throws Exception { + final String preparedInfoPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo") + .getPath(); + SparkResultToCommunityFromOrganizationJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample") + .getPath(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-preparedInfoPath", preparedInfoPath + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'propagation'"; + + org.apache.spark.sql.Dataset resultExplodedProvenance = spark.sql(query); + Assertions.assertEquals(5, resultExplodedProvenance.count()); + Assertions + .assertEquals( + 0, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe'") + .count()); + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b'") + .count()); + Assertions + .assertEquals( + "beopen", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b")) + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + 2, + resultExplodedProvenance + .filter("id = '50|od________18::8887b1df8b563c4ea851eb9c882c9d7b'") + .count()); + Assertions + .assertEquals( + "mes", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + 
"50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "euromarine", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) + .sort(desc("community")) + .collectAsList() + .get(1) + .getString(0)); + + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") + .count()); + Assertions + .assertEquals( + "mes", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6'") + .count()); + Assertions + .assertEquals( + "mes", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); + + query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD "; + + org.apache.spark.sql.Dataset resultCommunityId = spark.sql(query); + + Assertions.assertEquals(10, resultCommunityId.count()); + + Assertions + .assertEquals( + 1, + resultCommunityId + .filter("id = '50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe'") + .count()); + Assertions + .assertEquals( + "beopen", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe")) + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + 1, + resultCommunityId + .filter("id = '50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b'") + .count()); + + Assertions + .assertEquals( + 3, + resultCommunityId + .filter("id = '50|od________18::8887b1df8b563c4ea851eb9c882c9d7b'") + .count()); + Assertions + .assertEquals( + "beopen", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) + .sort(desc("community")) + .collectAsList() + .get(2) + .getString(0)); + + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") + .count()); + Assertions + .assertEquals( + "euromarine", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) + .sort(desc("community")) + .collectAsList() + .get(1) + .getString(0)); + + Assertions + .assertEquals( + 3, + resultCommunityId + .filter("id = '50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6'") + .count()); + Assertions + .assertEquals( + "euromarine", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) + .sort(desc("community")) + .collectAsList() + .get(2) + .getString(0)); + Assertions + .assertEquals( + "ni", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) + .sort(desc("community")) + 
.collectAsList() + .get(0) + .getString(0)); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java new file mode 100644 index 0000000000..a8e1ab8414 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java @@ -0,0 +1,275 @@ + +package eu.dnetlib.dhp.resulttocommunityfromsemrel; + +import static org.apache.spark.sql.functions.desc; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; +import eu.dnetlib.dhp.schema.oaf.Dataset; + +public class ResultToCommunityJobTest { + + private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ResultToCommunityJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(ResultToCommunityJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void test1() throws Exception { + SparkResultToCommunityThroughSemRelJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample") + .getPath(), + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-preparedInfoPath", getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo") + .getPath() + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), 
Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'propagation'"; + + org.apache.spark.sql.Dataset resultExplodedProvenance = spark.sql(query); + Assertions.assertEquals(5, resultExplodedProvenance.count()); + + Assertions + .assertEquals( + 0, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'") + .count()); + + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'") + .count()); + Assertions + .assertEquals( + "dh-ch", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::0489ae524201eedaa775da282dce35e7")) + .collectAsList() + .get(0) + .getString(0)); + + Assertions + .assertEquals( + 3, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'") + .count()); + List rowList = resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("mes", rowList.get(0).getString(0)); + Assertions.assertEquals("fam", rowList.get(1).getString(0)); + Assertions.assertEquals("ee", rowList.get(2).getString(0)); + + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'") + .count()); + Assertions + .assertEquals( + "aginfra", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc")) + .collectAsList() + .get(0) + .getString(0)); + + query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD "; + + org.apache.spark.sql.Dataset resultCommunityId = spark.sql(query); + + Assertions.assertEquals(10, resultCommunityId.count()); + + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::0489ae524201eedaa775da282dce35e7")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("dh-ch", rowList.get(0).getString(0)); + Assertions.assertEquals("beopen", rowList.get(1).getString(0)); + + Assertions + .assertEquals( + 3, + resultCommunityId + .filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("mes", rowList.get(0).getString(0)); + Assertions.assertEquals("fam", rowList.get(1).getString(0)); + Assertions.assertEquals("ee", rowList.get(2).getString(0)); + + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + 
"50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("beopen", rowList.get(0).getString(0)); + Assertions.assertEquals("aginfra", rowList.get(1).getString(0)); + + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("euromarine", rowList.get(1).getString(0)); + Assertions.assertEquals("ni", rowList.get(0).getString(0)); + + Assertions + .assertEquals( + 1, + resultCommunityId + .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") + .count()); + Assertions + .assertEquals( + "euromarine", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) + .collectAsList() + .get(0) + .getString(0)); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java new file mode 100644 index 0000000000..435b766052 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java @@ -0,0 +1,270 @@ + +package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class ResultToOrganizationJobTest { + + private static final Logger log = LoggerFactory.getLogger(ResultToOrganizationJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(SparkResultToOrganizationFromIstRepoJob.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(SparkResultToOrganizationFromIstRepoJob.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(SparkResultToOrganizationFromIstRepoJob.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + 
FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + /** + * No modifications are done to the sample sets, so no possible updates are created + * + * @throws Exception + */ + @Test + public void NoUpdateTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") + .getPath(); + final String datasourceOrganizationPath = getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked") + .getPath(); + SparkResultToOrganizationFromIstRepoJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-datasourceOrganizationPath", datasourceOrganizationPath, + "-alreadyLinkedPath", alreadyLinkedPath, + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(0, tmp.count()); + } + + /** + * Testing set with a modified association between datasource and organization: some hostedby/collectedfrom values + * were copied from the software sample set. There is no intersection with the already linked relations, so all the + * possible new relations will become new relations + * + * @throws Exception + */ + @Test + public void UpdateNoMixTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") + .getPath(); + final String datasourceOrganizationPath = getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked") + .getPath(); + SparkResultToOrganizationFromIstRepoJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-datasourceOrganizationPath", datasourceOrganizationPath, + "-alreadyLinkedPath", alreadyLinkedPath, + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(20, tmp.count()); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + Assertions + .assertEquals( + 8, + verificationDs + .filter("target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("target = 
'20|opendoar____::4429502fa1936b0941f4647b69b844c8'") + .count()); + + Assertions + .assertEquals( + 2, + verificationDs + .filter( + "source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and " + + "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' " + + "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')") + .count()); + } + + @Test + public void UpdateMixTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix") + .getPath(); + final String datasourceOrganizationPath = getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked") + .getPath(); + SparkResultToOrganizationFromIstRepoJob + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-datasourceOrganizationPath", datasourceOrganizationPath, + "-alreadyLinkedPath", alreadyLinkedPath, + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + + Assertions.assertEquals(8, verificationDs.count()); + + Assertions + .assertEquals( + 2, + verificationDs + .filter("source = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("source = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523'") + .count()); + + Assertions + .assertEquals( + 1, + verificationDs + .filter("source = '20|wt__________::a72760363ca885e6bef165804770e00c'") + .count()); + + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'hasAuthorInstitution' and substring(source, 1,2) = '50'") + .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'isAuthorInstitutionOf' and substring(source, 1,2) = '20'") + .count()); + + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'hasAuthorInstitution' and " + + "substring(source, 1,2) = '50' and substring(target, 1, 2) = '20'") + .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'isAuthorInstitutionOf' and " + + "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'") + .count()); + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.json new file mode 100644 index 0000000000..d21dc4ceda --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.json @@ -0,0 +1,694 @@ +{"communities": { + "clarin": { + "id": "clarin", + "subjects": [], + "datasources": [ 
+ { + "openaireId": "re3data_____::a507cdacc5bbcc08761c92185dee5cab" + } + ], + "zenodoCommunities": [ + + ] + }, + "ee": { + "id": "ee", + "subjects": [ + "SDG13 - Climate action", + "SDG8 - Decent work and economic\n\t\t\t\t\tgrowth", + "SDG15 - Life on land", + "SDG2 - Zero hunger", + "SDG17 - Partnerships for the\n\t\t\t\t\tgoals", + "SDG10 - Reduced inequalities", + "SDG5 - Gender equality", + "SDG12 - Responsible\n\t\t\t\t\tconsumption and production", + "SDG14 - Life below water", + "SDG6 - Clean water and\n\t\t\t\t\tsanitation", + "SDG11 - Sustainable cities and communities", + "SDG1 - No poverty", + "SDG3 -\n\t\t\t\t\tGood health and well being", + "SDG7 - Affordable and clean energy", + "SDG4 - Quality\n\t\t\t\t\teducation", + "SDG9 - Industry innovation and infrastructure", + "SDG16 - Peace justice\n\t\t\t\t\tand strong institutions" + ], + "datasources": [ + + ], + "zenodoCommunities": [ + + ] + }, + "aginfra": { + "id": "aginfra", + "subjects": [ + "animal production and health", + "fisheries and aquaculture", + "food safety and human nutrition", + "information management", + "food technology", + "agri-food education and extension", + "natural resources and environment", + "food system", + "engineering technology and Research", + "agriculture", + "food safety risk assessment", + "food security", + "farming practices and systems", + "plant production and protection", + "agri-food economics and policy", + "food distribution", + "forestry" + ], + "datasources": [ + { + "openaireId": "opendoar____::1a551829d50f1400b0dab21fdd969c04" + }, + { + "openaireId": "opendoar____::49af6c4e558a7569d80eee2e035e2bd7" + }, + { + "openaireId": "opendoar____::0266e33d3f546cb5436a10798e657d97" + }, + { + "openaireId": "opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06" + }, + { + "openaireId": "opendoar____::41bfd20a38bb1b0bec75acf0845530a7" + }, + { + "openaireId": "opendoar____::87ae6fb631f7c8a627e8e28785d9992d" + } + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "edenis" + }, + { + "zenodoCommunityId": "efsa-pilot" + }, + { + "zenodoCommunityId": "egene3" + }, + { + "zenodoCommunityId": "efsa-kj" + }, + { + "zenodoCommunityId": "euromixproject" + }, + { + "zenodoCommunityId": "discardless" + }, + { + "zenodoCommunityId": "sedinstcjfst" + }, + { + "zenodoCommunityId": "afinet-kc" + }, + { + "zenodoCommunityId": "2231-4784" + }, + { + "zenodoCommunityId": "2231-0606" + }, + { + "zenodoCommunityId": "solace" + }, + { + "zenodoCommunityId": "pa17" + }, + { + "zenodoCommunityId": "smartakis" + }, + { + "zenodoCommunityId": "sedinstcjae" + }, + { + "zenodoCommunityId": "phenology_camera" + }, + { + "zenodoCommunityId": "aginfra" + }, + { + "zenodoCommunityId": "erosa" + }, + { + "zenodoCommunityId": "bigdatagrapes" + } + ] + }, + "fam": { + "id": "fam", + "subjects": [ + "Stock Assessment", + "pelagic", + "Fish farming", + "EMFF", + "Fisheries", + "Fishermen", + "maximum sustainable yield", + "trawler", + "Fishing vessel", + "Fisherman", + "Fishing gear", + "RFMO", + "Fish Aggregating Device", + "Bycatch", + "Fishery", + "common fisheries policy", + "Fishing fleet", + "Aquaculture" + ], + "datasources": [ + { + "openaireId": "doajarticles::8cec81178926caaca531afbd8eb5d64c" + }, + { + "openaireId": "doajarticles::0f7a7f30b5400615cae1829f3e743982" + }, + { + "openaireId": "doajarticles::9740f7f5af3e506d2ad2c215cdccd51a" + }, + { + "openaireId": "doajarticles::9f3fbaae044fa33cb7069b72935a3254" + }, + { + "openaireId": "doajarticles::cb67f33eb9819f5c624ce0313957f6b3" + }, + { + "openaireId": 
"doajarticles::e21c97cbb7a209afc75703681c462906" + }, + { + "openaireId": "doajarticles::554cde3be9e5c4588b4c4f9f503120cb" + }, + { + "openaireId": "tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b" + }, + { + "openaireId": "doajarticles::57c5d3837da943e93b28ec4db82ec7a5" + }, + { + "openaireId": "doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1" + }, + { + "openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906" + }, + { + "openaireId": "doajarticles::dca64612dfe0963fffc119098a319957" + }, + { + "openaireId": "doajarticles::dd70e44479f0ade25aa106aef3e87a0a" + } + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "discardless" + }, + { + "zenodoCommunityId": "farfish2020" + }, + { + "zenodoCommunityId": "facts" + }, + { + "zenodoCommunityId": "climefish" + }, + { + "zenodoCommunityId": "proeel" + }, + { + "zenodoCommunityId": "primefish" + }, + { + "zenodoCommunityId": "h2020_vicinaqua" + }, + { + "zenodoCommunityId": "meece" + }, + { + "zenodoCommunityId": "rlsadb" + } + ] + }, + "instruct": { + "id": "instruct", + "subjects": [ + + ], + "datasources": [ + + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "instruct" + }, + { + "zenodoCommunityId": "west-life" + } + ] + }, + "mes": { + "id": "mes", + "subjects": [ + "marine", + "ocean", + "fish", + "aqua", + "sea" + ], + "datasources": [ + + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "adriplan" + }, + { + "zenodoCommunityId": "devotes-project" + }, + { + "zenodoCommunityId": "euro-basin" + }, + { + "zenodoCommunityId": "naclim" + }, + { + "zenodoCommunityId": "discardless" + }, + { + "zenodoCommunityId": "assisibf" + }, + { + "zenodoCommunityId": "meece" + }, + { + "zenodoCommunityId": "facts" + }, + { + "zenodoCommunityId": "proeel" + }, + { + "zenodoCommunityId": "aquatrace" + }, + { + "zenodoCommunityId": "myfish" + }, + { + "zenodoCommunityId": "atlas" + }, + { + "zenodoCommunityId": "blue-actionh2020" + }, + { + "zenodoCommunityId": "sponges" + }, + { + "zenodoCommunityId": "merces_project" + }, + { + "zenodoCommunityId": "bigdataocean" + }, + { + "zenodoCommunityId": "columbus" + }, + { + "zenodoCommunityId": "h2020-aquainvad-ed" + }, + { + "zenodoCommunityId": "aquarius" + }, + { + "zenodoCommunityId": "southern-ocean-observing-system" + }, + { + "zenodoCommunityId": "eawag" + }, + { + "zenodoCommunityId": "mossco" + }, + { + "zenodoCommunityId": "onc" + }, + { + "zenodoCommunityId": "oceanbiogeochemistry" + }, + { + "zenodoCommunityId": "oceanliteracy" + }, + { + "zenodoCommunityId": "openearth" + }, + { + "zenodoCommunityId": "ocean" + }, + { + "zenodoCommunityId": "calcifierraman" + }, + { + "zenodoCommunityId": "bermudabream" + }, + { + "zenodoCommunityId": "brcorp1" + }, + { + "zenodoCommunityId": "mce" + }, + { + "zenodoCommunityId": "biogeochem" + }, + { + "zenodoCommunityId": "ecc2014" + }, + { + "zenodoCommunityId": "fisheries" + }, + { + "zenodoCommunityId": "sedinstcjfas" + }, + { + "zenodoCommunityId": "narmada" + }, + { + "zenodoCommunityId": "umr-entropie" + }, + { + "zenodoCommunityId": "farfish2020" + }, + { + "zenodoCommunityId": "primefish" + }, + { + "zenodoCommunityId": "zf-ilcs" + }, + { + "zenodoCommunityId": "climefish" + }, + { + "zenodoCommunityId": "afrimed_eu" + }, + { + "zenodoCommunityId": "spi-ace" + }, + { + "zenodoCommunityId": "cice-consortium" + }, + { + "zenodoCommunityId": "nemo-ocean" + }, + { + "zenodoCommunityId": "mesopp-h2020" + }, + { + "zenodoCommunityId": "marxiv" + } + ] + }, + "ni": { + "id": "ni", + "subjects": [ + "brain mapping", + "brain imaging", + 
"electroencephalography", + "arterial spin labelling", + "brain fingerprinting", + "brain", + "neuroimaging", + "Multimodal Brain Image Analysis", + "fMRI", + "neuroinformatics", + "fetal brain", + "brain ultrasonic imaging", + "topographic brain mapping", + "diffusion tensor imaging", + "computerized knowledge assessment", + "connectome mapping", + "brain magnetic resonance imaging", + "brain abnormalities" + ], + "datasources": [ + { + "openaireId": "re3data_____::5b9bf9171d92df854cf3c520692e9122" + }, + { + "openaireId": "doajarticles::c7d3de67dc77af72f6747157441252ec" + }, + { + "openaireId": "re3data_____::8515794670370f49c1d176c399c714f5" + }, + { + "openaireId": "doajarticles::d640648c84b10d425f96f11c3de468f3" + }, + { + "openaireId": "doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a" + }, + { + "openaireId": "rest________::fb1a3d4523c95e63496e3bc7ba36244b" + } + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "neuroinformatics" + }, + { + "zenodoCommunityId": "hbp" + }, + { + "zenodoCommunityId": "from_neuroscience_to_machine_learning" + }, + { + "zenodoCommunityId": "ci2c" + }, + { + "zenodoCommunityId": "opensourcebrain" + }, + { + "zenodoCommunityId": "brainspeak" + }, + { + "zenodoCommunityId": "braincom" + }, + { + "zenodoCommunityId": "nextgenvis" + }, + { + "zenodoCommunityId": "meso-brain" + }, + { + "zenodoCommunityId": "neuroplasticity-workshop" + }, + { + "zenodoCommunityId": "bionics" + }, + { + "zenodoCommunityId": "brainmattrain-676408" + }, + { + "zenodoCommunityId": "repronim" + }, + { + "zenodoCommunityId": "affectiveneuro" + }, + { + "zenodoCommunityId": "con" + }, + { + "zenodoCommunityId": "lab_neurol_sperim_irfmn_irccs_milano_it" + } + ] + }, + "dariah": { + "id": "dariah", + "subjects": [ + + ], + "datasources": [ + { + "openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a", + "sc": { + "cl": { + "criteria": [ + { + "ce": { + "constraint": [ + { + "verb": "contains", + "field": "contributor", + "value": "DARIAH" + } + ] + } + } + ] + } + } + } + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "dimpo" + } + ] + }, + "rda": { + "id": "rda", + "subjects": [ + + ], + "datasources": [ + + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "rda" + } + ] + }, + "dh-ch": { + "id": "dh-ch", + "subjects": [ + "modern art", + "metadata", + "monuments", + "sites", + "field walking", + "frescoes", + "excavation", + "ontologies", + "mapping", + "cities", + "temples", + "lithics", + "roads", + "digital cultural heritage", + "interoperability", + "archaeological reports", + "churches", + "standards", + "archaeological stratigraphy", + "buidings", + "digital humanities", + "survey", + "archaeological sites", + "CIDOC CRM", + "decorations", + "classic art", + "stratigraphy", + "digital archaeology", + "walls", + "data science", + "chapels", + "paintings", + "archaeology", + "fair data", + "mosaics", + "data visualization", + "burials", + "medieval art", + "castles", + "statues", + "natural language processing", + "inscriptions", + "vaults", + "open data", + "contemporary art", + "3D", + "pottery", + "site", + "metadata schema", + "architectural", + "vessels" + ], + "datasources": [ + { + "openaireId": "re3data_____::9ebe127e5f3a0bf401875690f3bb6b81" + }, + { + "openaireId": "doajarticles::c6cd4b532e12868c1d760a8d7cda6815" + }, + { + "openaireId": "doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b" + }, + { + "openaireId": "doajarticles::6eb31d13b12bc06bbac06aef63cf33c9" + }, + { + "openaireId": "doajarticles::0da84e9dfdc8419576169e027baa8028" + }, + { + 
"openaireId": "re3data_____::84e123776089ce3c7a33db98d9cd15a8" + }, + { + "openaireId": "openaire____::c5502a43e76feab55dd00cf50f519125" + }, + { + "openaireId": "re3data_____::a48f09c562b247a9919acfe195549b47" + }, + { + "openaireId": "opendoar____::97275a23ca44226c9964043c8462be96" + } + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "storm" + }, + { + "zenodoCommunityId": "crosscult" + }, + { + "zenodoCommunityId": "wholodance_eu" + }, + { + "zenodoCommunityId": "digcur2013" + }, + { + "zenodoCommunityId": "gravitate" + }, + { + "zenodoCommunityId": "dipp2014" + }, + { + "zenodoCommunityId": "digitalhumanities" + }, + { + "zenodoCommunityId": "dimpo" + }, + { + "zenodoCommunityId": "adho" + }, + { + "zenodoCommunityId": "chc" + }, + { + "zenodoCommunityId": "wahr" + }, + { + "zenodoCommunityId": "ibe" + }, + { + "zenodoCommunityId": "ariadne" + }, + { + "zenodoCommunityId": "parthenos-hub" + }, + { + "zenodoCommunityId": "parthenos-training" + }, + { + "zenodoCommunityId": "gandhara" + }, + { + "zenodoCommunityId": "cmsouthasia" + }, + { + "zenodoCommunityId": "nilgirihills" + }, + { + "zenodoCommunityId": "shamsa_mustecio" + }, + { + "zenodoCommunityId": "bodhgaya" + } + ] + } + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml new file mode 100644 index 0000000000..e2cc41063d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml @@ -0,0 +1,176 @@ + + + + + + + + + + + + + + + + + + + + SDG13 - Climate action + SDG8 - Decent work and economic growth + SDG15 - Life on land + SDG2 - Zero hunger + SDG17 - Partnerships for the goals + SDG10 - Reduced inequalities + SDG5 - Gender equality + SDG12 - Responsible consumption and production + SDG14 - Life below water + SDG6 - Clean water and sanitation + SDG11 - Sustainable cities and communities + SDG1 - No poverty + SDG3 - Good health and well being + SDG7 - Affordable and clean energy + SDG4 - Quality education + SDG9 - Industry innovation and infrastructure + SDG16 - Peace justice and strong institutions + + + + + 123 + + + + + + + + + + + + + + + + + brain mapping + brain imaging + electroencephalography + arterial spin labelling + brain fingerprinting + brain + neuroimaging + Multimodal Brain Image Analysis + fMRI + neuroinformatics + fetal brain + brain ultrasonic imaging + topographic brain mapping + diffusion tensor imaging + computerized knowledge assessment + connectome mapping + brain magnetic resonance imaging + brain abnormalities + + + + re3data_____::5b9bf9171d92df854cf3c520692e9122 + + + + doajarticles::c7d3de67dc77af72f6747157441252ec + + + + re3data_____::8515794670370f49c1d176c399c714f5 + + + + doajarticles::d640648c84b10d425f96f11c3de468f3 + + + + doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a + + + + + + + + marine + ocean + fish + aqua + sea + + + + re3data_____::9633d1e8c4309c833c2c442abeb0cfeb + + + + + + + + animal production and health + fisheries and aquaculture + food safety and human nutrition + information management + food technology + agri-food education and extension + natural resources and environment + food system + engineering technology and Research + agriculture + food safety risk assessment + food security + farming practices and systems + plant production and protection + 
agri-food economics and policy + food distribution + forestry + + + + opendoar____::1a551829d50f1400b0dab21fdd969c04 + + + + opendoar____::49af6c4e558a7569d80eee2e035e2bd7 + + + + opendoar____::0266e33d3f546cb5436a10798e657d97 + + + + opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06 + + + + opendoar____::41bfd20a38bb1b0bec75acf0845530a7 + + + + opendoar____::87ae6fb631f7c8a627e8e28785d9992d + + + + + + + oac_clarin + + + + re3data_____::a507cdacc5bbcc08761c92185dee5cab + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.json new file mode 100644 index 0000000000..6aa4275d6d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.json @@ -0,0 +1,37 @@ +{ + "communities": { + "dariah": { + "id": "dariah", + "subjects": [ + + ], + "datasources": [ + { + "openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a", + "sc": { + "cl": { + "criteria": [ + { + "ce": { + "constraint": [ + { + "verb": "contains", + "field": "contributor", + "value": "DARIAH" + } + ] + } + } + ] + } + } + } + ], + "zenodoCommunities": [ + { + "zenodoCommunityId": "dimpo" + } + ] + } + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml new file mode 100644 index 0000000000..cd5ea38d0e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + + + + + + + SDG13 - Climate action + SDG8 - Decent work and economic growth + SDG15 - Life on land + SDG2 - Zero hunger + SDG17 - Partnerships for the goals + SDG10 - Reduced inequalities + SDG5 - Gender equality + SDG12 - Responsible consumption and production + SDG14 - Life below water + SDG6 - Clean water and sanitation + SDG11 - Sustainable cities and communities + SDG1 - No poverty + SDG3 - Good health and well being + SDG7 - Affordable and clean energy + SDG4 - Quality education + SDG9 - Industry innovation and infrastructure + SDG16 - Peace justice and strong institutions + + + + + 123 + + + + + + + + + + + + + + + + + brain mapping + brain imaging + electroencephalography + arterial spin labelling + brain fingerprinting + brain + neuroimaging + Multimodal Brain Image Analysis + fMRI + neuroinformatics + fetal brain + brain ultrasonic imaging + topographic brain mapping + diffusion tensor imaging + computerized knowledge assessment + connectome mapping + brain magnetic resonance imaging + brain abnormalities + + + + re3data_____::5b9bf9171d92df854cf3c520692e9122 + + + + doajarticles::c7d3de67dc77af72f6747157441252ec + + + + re3data_____::8515794670370f49c1d176c399c714f5 + + + + doajarticles::d640648c84b10d425f96f11c3de468f3 + + + + doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a + + + + + + + + marine + ocean + fish + aqua + sea + + + + re3data_____::9633d1e8c4309c833c2c442abeb0cfeb + + + + + + + + animal production and health + fisheries and aquaculture + food safety and human nutrition + information management + food technology + agri-food 
education and extension + natural resources and environment + food system + engineering technology and Research + agriculture + food safety risk assessment + food security + farming practices and systems + plant production and protection + agri-food economics and policy + food distribution + forestry + + + + opendoar____::1a551829d50f1400b0dab21fdd969c04 + + + + opendoar____::49af6c4e558a7569d80eee2e035e2bd7 + + + + opendoar____::0266e33d3f546cb5436a10798e657d97 + + + + opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06 + + + + opendoar____::41bfd20a38bb1b0bec75acf0845530a7 + + + + opendoar____::87ae6fb631f7c8a627e8e28785d9992d + + + + + + + oac_clarin + + + + re3data_____::a507cdacc5bbcc08761c92185dee5cab + + + + + + + oaa_dariah + + + + openaire____::1cfdb2e14977f31a98e0118283401f32 + {"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]} + + + + + + dimpo + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.json new file mode 100644 index 0000000000..411a64fede --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.json @@ -0,0 +1,44 @@ +{"communities": + {"ee": + {"id":"ee", + "subjects":["SDG13 - Climate action","SDG8 - Decent work and economic growth","SDG15 - Life on land","SDG2 - Zero hunger","SDG17 - Partnerships for the goals","SDG10 - Reduced inequalities","SDG5 - Gender equality","SDG12 - Responsible consumption and production","SDG14 - Life below water","SDG6 - Clean water and sanitation","SDG11 - Sustainable cities and communities","SDG1 - No poverty","SDG3 - Good health and well being","SDG7 - Affordable and clean energy","SDG4 - Quality education","SDG9 - Industry innovation and infrastructure","SDG16 - Peace justice and strong institutions"], + "datasources":[], + "zenodoCommunities":[], + "organizationCommunity":[] + }, + "instruct": + {"id":"instruct", + "subjects":[], + "datasources":[], + "zenodoCommunities":[{"zenodoCommunityId":"instruct"},{"zenodoCommunityId":"west-life"}],"organizationCommunity":[]}, + "egi":{"id":"egi","subjects":[],"datasources":[],"zenodoCommunities":[{"zenodoCommunityId":"zenodo"}],"organizationCommunity":[]}, + "covid-19":{"id":"covid-19","subjects":["COVID-19","SARS-CoV-2","2019-nCoV","Severe acute respiratory syndrome coronavirus 2","2019 novel coronavirus","coronavirus disease 2019","coronavirus disease-19","HCoV-19","mesh:COVID-19","mesh:C000657245"], + "datasources":[{"openaireId":"opendoar____::358aee4cc897452c00244351e4d91f69","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"} + 
]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"driver______::bee53aa31dc2cbb538c10c2b65fa5824","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"openaire____::437f4b072b1aa198adcbc35910ff3b98","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"openaire____::081b82f96300b6a6e3d282bad31cb6e2","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"openaire____::9e3be59865b2c1c335d32dae2fe7b254","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]} + ,{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"opendoar____::8b6dd7db9af49e67306feb59a8bdc52c","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"share_______::4719356ec8d7d55d3feb384ce879ad6c","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"share_______::bbd802baad85d1fd440f32a7a3a2c2b1","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}},{"openaireId":"opendoar____::6f4922f45568161a8cdf4ad2299f6d23","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}}],"zenodoCommunities":[{"zenodoCommunityId":"chicago-covid-19"},{"zenodoCommunityId":"covid-19-senacyt-panama-sample"},{"zenodoCommunityId":"covid-19-tx-rct-stats-review"},{"zenodoCommunityId":"covid_19_senacyt_abc_panama"}],"organizationCommunity":[]}, + 
"dariah":{"id":"dariah","subjects":[],"datasources":[{"openaireId":"opendoar____::7e7757b1e12abcb736ab9a754ffb617a","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}},{"openaireId":"opendoar____::96da2f590cd7246bbde0051047b0d6f7","selectionConstraints":{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}}],"zenodoCommunities":[{"zenodoCommunityId":"dimpo"}],"organizationCommunity":[]},"rda":{"id":"rda","subjects":[],"datasources":[],"zenodoCommunities":[{"zenodoCommunityId":"rda"}],"organizationCommunity":[]},"clarin":{"id":"clarin","subjects":[],"datasources":[{"openaireId":"re3data_____::a507cdacc5bbcc08761c92185dee5cab"}],"zenodoCommunities":[],"organizationCommunity":[]},"aginfra":{"id":"aginfra","subjects":["animal production and health","fisheries and aquaculture","food safety and human nutrition","information management","food technology","agri-food education and extension","natural resources and environment","food system","engineering technology and Research","agriculture","food safety risk assessment","food security","farming practices and systems","plant production and protection","agri-food economics and policy","Agri-food","food distribution","forestry"],"datasources":[{"openaireId":"opendoar____::1a551829d50f1400b0dab21fdd969c04"},{"openaireId":"opendoar____::49af6c4e558a7569d80eee2e035e2bd7"},{"openaireId":"opendoar____::0266e33d3f546cb5436a10798e657d97"},{"openaireId":"opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06"},{"openaireId":"opendoar____::41bfd20a38bb1b0bec75acf0845530a7"},{"openaireId":"opendoar____::87ae6fb631f7c8a627e8e28785d9992d"}],"zenodoCommunities":[{"zenodoCommunityId":"edenis"},{"zenodoCommunityId":"efsa-pilot"},{"zenodoCommunityId":"egene3"},{"zenodoCommunityId":"efsa-kj"},{"zenodoCommunityId":"euromixproject"},{"zenodoCommunityId":"discardless"},{"zenodoCommunityId":"sedinstcjfst"},{"zenodoCommunityId":"afinet-kc"},{"zenodoCommunityId":"2231-4784"},{"zenodoCommunityId":"2231-0606"},{"zenodoCommunityId":"solace"},{"zenodoCommunityId":"pa17"},{"zenodoCommunityId":"smartakis"},{"zenodoCommunityId":"sedinstcjae"},{"zenodoCommunityId":"phenology_camera"},{"zenodoCommunityId":"aginfra"},{"zenodoCommunityId":"erosa"},{"zenodoCommunityId":"bigdatagrapes"}],"organizationCommunity":[]},"fam":{"id":"fam","subjects":["Stock Assessment","pelagic","Acoustic","Fish farming","Fisheries","Fishermen","maximum sustainable yield","trawler","Fishing vessel","Fisherman","Fishing gear","mackerel","RFMO","Fish Aggregating Device","Bycatch","Fishery","common fisheries policy","Fishing 
fleet","Aquaculture"],"datasources":[{"openaireId":"doajarticles::8cec81178926caaca531afbd8eb5d64c"},{"openaireId":"doajarticles::0f7a7f30b5400615cae1829f3e743982"},{"openaireId":"doajarticles::9740f7f5af3e506d2ad2c215cdccd51a"},{"openaireId":"doajarticles::9f3fbaae044fa33cb7069b72935a3254"},{"openaireId":"doajarticles::cb67f33eb9819f5c624ce0313957f6b3"},{"openaireId":"doajarticles::e21c97cbb7a209afc75703681c462906"},{"openaireId":"doajarticles::554cde3be9e5c4588b4c4f9f503120cb"},{"openaireId":"tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b"},{"openaireId":"doajarticles::57c5d3837da943e93b28ec4db82ec7a5"},{"openaireId":"doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1"},{"openaireId":"doajarticles::e21c97cbb7a209afc75703681c462906"},{"openaireId":"doajarticles::dca64612dfe0963fffc119098a319957"},{"openaireId":"doajarticles::dd70e44479f0ade25aa106aef3e87a0a"}],"zenodoCommunities":[{"zenodoCommunityId":"discardless"},{"zenodoCommunityId":"farfish2020"},{"zenodoCommunityId":"facts"},{"zenodoCommunityId":"climefish"},{"zenodoCommunityId":"proeel"},{"zenodoCommunityId":"primefish"},{"zenodoCommunityId":"h2020_vicinaqua"},{"zenodoCommunityId":"meece"},{"zenodoCommunityId":"rlsadb"},{"zenodoCommunityId":"iotc_ctoi"}],"organizationCommunity":[]},"beopen":{"id":"beopen","subjects":["Green Transport","City mobility systems","Vulnerable road users","Traffic engineering","Transport electrification","Mobility","Intermodal freight transport","Clean vehicle fleets","Intelligent mobility","Inflight refueling","District mobility systems","Navigation and control systems for optimised planning and routing","European Space Technology Platform","European Transport networks","Green cars","Inter-modality infrastructures","Advanced Take Off and Landing Ideas","Sustainable urban systems","port-area railway networks","Innovative forms of urban transport","Alliance for Logistics Innovation through Collaboration in Europe","Advisory Council for Aeronautics Research in Europe","Mobility services for people and goods","Guidance and traffic management","Passenger mobility","Smart mobility and services","transport innovation","high-speed railway","Vehicle design","Inland shipping","public transportation","aviation’s climate impact","Road transport","On-demand public transport","Personal Air Transport","Transport","transport vulnerability","Pipeline transport","European Association of Aviation Training and Education Organisations","Defrosting of railway infrastructure","Inclusive and affordable transport","River Information Services","jel:L92","Increased use of public transport","Seamless mobility","STRIA","trolleybus transport","Intelligent Transport System","Low-emission alternative energy for transport","Shared mobility for people and goods","Business model for urban mobility","Interoperability of transport systems","Cross-border train slot booking","Air transport","Transport pricing","Sustainable transport","European Rail Transport Research Advisory Council","Alternative aircraft configurations","Transport and Mobility","Railways applications","urban transport","Environmental impact of transport","urban freight delivery systems","Automated Road Transport","Alternative fuels in public transport","Active LIDAR-sensor for GHG-measurements","Autonomous logistics operations","Rational use of motorised transport","Network and traffic management systems","electrification of railway wagons","Single European Sky","Electrified road systems","transportation planning","Railway dynamics","Motorway of the Sea","smart railway 
communications","Maritime transport","Environmental- friendly transport","Combined transport","Connected automated driving technology","Innovative freight logistics services","automated and shared vehicles","Alternative Aircraft Systems","Land-use and transport interaction","Public transport system","Business plan for shared mobility","Shared mobility","Growing of mobility demand","European Road Transport Research Advisory Council","WATERBORNE ETP","Effective transport management system","Short Sea Shipping","air traffic management","Sea hubs and the motorways of the sea","Urban mobility solutions","Smart city planning","Maritime spatial planning","EUropean rail Research Network of Excellence","Transport governance","ENERGY CONSUMPTION BY THE TRANSPORT SECTOR","Integrated urban plan","inland waterway services","European Conference of Transport Research Institutes","air vehicles","E-freight","Automated Driving","Automated ships","pricing for cross-border passenger transport","Vehicle efficiency","Railway transport","Electric vehicles","Road traffic monitoring","Deep sea shipping","Circular economy in transport","Traffic congestion","air transport system","Urban logistics","Rail transport","OpenStreetMap","high speed rail","Transportation engineering","Intermodal travel information","Flight Data Recorders","Advanced driver assistance systems","long distance freight transport","Inland waterway transport","Smart mobility","Mobility integration","Personal Rapid Transit system","Safety measures \\u0026 requirements for roads","Green rail transport","Electrical","Vehicle manufacturing","Future Airport Layout","Rail technologies","European Intermodal Research Advisory Council","inland navigation","Automated urban vehicles","ECSS-standards","Traveller services","Polluting transport","Air Traffic Control","Cooperative and connected and automated transport","Innovative powertrains","Quality of transport system and services","door-to- door logistics chain","Inter-modal aspects of urban mobility","travel (and mobility)","Innovative freight delivery systems","urban freight delivery 
infrastructures"],"datasources":[{"openaireId":"doajarticles::1c5bdf8fca58937894ad1441cca99b76"},{"openaireId":"doajarticles::b37a634324a45c821687e6e80e6f53b4"},{"openaireId":"doajarticles::4bf64f2a104040e4e055cd9594b2d77c"},{"openaireId":"doajarticles::479ca537c12755d1868bbf02938a900c"},{"openaireId":"doajarticles::55f31df96a60e2309f45b7c265fcf7a2"},{"openaireId":"doajarticles::c52a09891a5301f9986ebbfe3761810c"},{"openaireId":"doajarticles::379807bc7f6c71a227ef1651462c414c"},{"openaireId":"doajarticles::36069db531a00b85a2e8fb301f4bdc19"},{"openaireId":"doajarticles::b6a898da311ded96fabf49c520b80d5d"},{"openaireId":"doajarticles::d0753d9180b35a271d8b4a31f449749f"},{"openaireId":"doajarticles::172050a92511838393a3fe237ae47e31"},{"openaireId":"doajarticles::301ed96c62abb160a3e29796efe5c95c"},{"openaireId":"doajarticles::0f4f805b3d842f2c7f1b077c3426fa59"},{"openaireId":"doajarticles::ba73728b84437b8d48ae287b867c7215"},{"openaireId":"doajarticles::86faef424d804309ccf45f692523aa48"},{"openaireId":"doajarticles::73bd758fa41671de70964c3ecba013af"},{"openaireId":"doajarticles::e661fc0bdb24af42b740a08f0ddc6cf4"},{"openaireId":"doajarticles::a6d3052047d5dbfbd43d95b4afb0f3d7"},{"openaireId":"doajarticles::ca61df07089acc53a1569bde6673d82a"},{"openaireId":"doajarticles::237dd6f1606600459d0297abd8ed9976"},{"openaireId":"doajarticles::fba6191177ede7c51ea1cdf58eae7f8b"}],"zenodoCommunities":[{"zenodoCommunityId":"jsdtl"},{"zenodoCommunityId":"utc-martrec"},{"zenodoCommunityId":"utc-uti"},{"zenodoCommunityId":"stp"},{"zenodoCommunityId":"c2smart"},{"zenodoCommunityId":"stride-utc"},{"zenodoCommunityId":"crowd4roads"},{"zenodoCommunityId":"lemo"},{"zenodoCommunityId":"imov3d"},{"zenodoCommunityId":"tra2018"},{"zenodoCommunityId":"optimum"},{"zenodoCommunityId":"stars"},{"zenodoCommunityId":"iecteim"},{"zenodoCommunityId":"iccpt2019"}],"organizationCommunity":[]},"science-innovation-policy": + {"id":"science-innovation-policy","subjects":["Sustainability-oriented science policy", "STI policies", "science—society relations", + "Science & Technology Policy", "Innovation policy", "science policy", "Policy and Law"], + "datasources":[{"openaireId":"doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d"}, + {"openaireId":"doajarticles::ae4c7286c79590f19fdca670156ce816"}, + {"openaireId":"doajarticles::0f664bce92ce953e0c7a92068c46bfb3"}, + {"openaireId":"doajarticles::00017183dc4c858fb77541985323a4ef"}, + {"openaireId":"doajarticles::93b306f458cce3d7aaaf58c0a725f4f9"}, + {"openaireId":"doajarticles::9dbf8fbf3e9fe0fe1fc01e55fbd90bfc"}, + {"openaireId":"doajarticles::a2bda8785c863279bba4b8f34827b4c9"}, + {"openaireId":"doajarticles::019a1fcb42c3fea1c1b689df76330b58"}, + {"openaireId":"doajarticles::0daa8281938831e9c82bfed8b55a2975"}, + {"openaireId":"doajarticles::f67ad6d268162079b3abd51a24468744"}, + {"openaireId":"doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d"}, + {"openaireId":"doajarticles::ad114356e196a4a3d84dda59c720dacd"}, + {"openaireId":"doajarticles::01e8a54fdecaaf354c67a2dd74ae7d4f"}, + {"openaireId":"doajarticles::449305f096b10a9464449ff2d0e10e06"}, + {"openaireId":"doajarticles::982c0c0ac378256254cce2fa6572bb6c"}, + {"openaireId":"doajarticles::49d6ed47138884566ce93cf0ccb12c02"}, + {"openaireId":"doajarticles::a98e820dbc2e8ee0fc84ab66f263267c"}, + {"openaireId":"doajarticles::50b1ce37427b36368f8f0f1317e47f83"}, + {"openaireId":"doajarticles::f0ec29b7450b2ac5d0ad45327eeb531a"}, + {"openaireId":"doajarticles::d8d421d3b0349a7aaa93758b27a54e84"}, + {"openaireId":"doajarticles::7ffc35ac5133da01d421ccf8af5b70bc"} + 
],"zenodoCommunities":[{"zenodoCommunityId":"risis"}],"organizationCommunity":[]},"mes":{"id":"mes","subjects":["marine","ocean","fish","aqua","sea"],"datasources":[],"zenodoCommunities":[{"zenodoCommunityId":"adriplan"},{"zenodoCommunityId":"devotes-project"},{"zenodoCommunityId":"euro-basin"},{"zenodoCommunityId":"naclim"},{"zenodoCommunityId":"discardless"},{"zenodoCommunityId":"assisibf"},{"zenodoCommunityId":"meece"},{"zenodoCommunityId":"facts"},{"zenodoCommunityId":"proeel"},{"zenodoCommunityId":"aquatrace"},{"zenodoCommunityId":"myfish"},{"zenodoCommunityId":"atlas"},{"zenodoCommunityId":"blue-actionh2020"},{"zenodoCommunityId":"sponges"},{"zenodoCommunityId":"merces_project"},{"zenodoCommunityId":"bigdataocean"},{"zenodoCommunityId":"columbus"},{"zenodoCommunityId":"h2020-aquainvad-ed"},{"zenodoCommunityId":"aquarius"},{"zenodoCommunityId":"southern-ocean-observing-system"},{"zenodoCommunityId":"eawag"},{"zenodoCommunityId":"mossco"},{"zenodoCommunityId":"onc"},{"zenodoCommunityId":"oceanbiogeochemistry"},{"zenodoCommunityId":"oceanliteracy"},{"zenodoCommunityId":"openearth"},{"zenodoCommunityId":"ocean"},{"zenodoCommunityId":"calcifierraman"},{"zenodoCommunityId":"bermudabream"},{"zenodoCommunityId":"brcorp1"},{"zenodoCommunityId":"mce"},{"zenodoCommunityId":"biogeochem"},{"zenodoCommunityId":"ecc2014"},{"zenodoCommunityId":"fisheries"},{"zenodoCommunityId":"sedinstcjfas"},{"zenodoCommunityId":"narmada"},{"zenodoCommunityId":"umr-entropie"},{"zenodoCommunityId":"farfish2020"},{"zenodoCommunityId":"primefish"},{"zenodoCommunityId":"zf-ilcs"},{"zenodoCommunityId":"climefish"},{"zenodoCommunityId":"afrimed_eu"},{"zenodoCommunityId":"spi-ace"},{"zenodoCommunityId":"cice-consortium"},{"zenodoCommunityId":"nemo-ocean"},{"zenodoCommunityId":"mesopp-h2020"},{"zenodoCommunityId":"marxiv"}],"organizationCommunity":[]},"ni":{"id":"ni","subjects":["brain mapping","brain imaging","electroencephalography","arterial spin labelling","brain fingerprinting","brain","neuroimaging","Multimodal Brain Image Analysis","fMRI","neuroinformatics","fetal brain","brain ultrasonic imaging","topographic brain mapping","diffusion tensor imaging","computerized knowledge assessment","connectome mapping","brain magnetic resonance imaging","brain abnormalities"],"datasources":[{"openaireId":"re3data_____::5b9bf9171d92df854cf3c520692e9122"},{"openaireId":"doajarticles::c7d3de67dc77af72f6747157441252ec"},{"openaireId":"re3data_____::8515794670370f49c1d176c399c714f5"},{"openaireId":"doajarticles::d640648c84b10d425f96f11c3de468f3"},{"openaireId":"doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"},{"openaireId":"rest________::fb1a3d4523c95e63496e3bc7ba36244b"}],"zenodoCommunities":[{"zenodoCommunityId":"neuroinformatics"},{"zenodoCommunityId":"hbp"},{"zenodoCommunityId":"from_neuroscience_to_machine_learning"},{"zenodoCommunityId":"ci2c"},{"zenodoCommunityId":"opensourcebrain"},{"zenodoCommunityId":"brainspeak"},{"zenodoCommunityId":"braincom"},{"zenodoCommunityId":"nextgenvis"},{"zenodoCommunityId":"meso-brain"},{"zenodoCommunityId":"neuroplasticity-workshop"},{"zenodoCommunityId":"bionics"},{"zenodoCommunityId":"brainmattrain-676408"},{"zenodoCommunityId":"repronim"},{"zenodoCommunityId":"affectiveneuro"},{"zenodoCommunityId":"con"},{"zenodoCommunityId":"lab_neurol_sperim_irfmn_irccs_milano_it"}],"organizationCommunity":[]},"dh-ch":{"id":"dh-ch","subjects":["modern art","monuments","europeana data model","sites","field walking","frescoes","LIDO metadata schema","art history","excavation","Arts and Humanities 
General","cities","coins","temples","numismatics","lithics","roads","environmental archaeology","digital cultural heritage","archaeological reports","history","CRMba","churches","cultural heritage","archaeological stratigraphy","religious art","buidings","digital humanities","survey","archaeological sites","linguistic studies","bioarchaeology","architectural orders","palaeoanthropology","fine arts","europeana","CIDOC CRM","decorations","classic art","stratigraphy","digital archaeology","intangible cultural heritage","walls","humanities","chapels","CRMtex","Language and Literature","paintings","archaeology","fair data","mosaics","burials","architecture","medieval art","castles","CARARE metadata schema","statues","natural language processing","inscriptions","CRMsci","vaults","contemporary art","Arts and Humanities","CRMarchaeo","pottery","site","architectural","vessels"],"datasources":[{"openaireId":"re3data_____::9ebe127e5f3a0bf401875690f3bb6b81"},{"openaireId":"doajarticles::c6cd4b532e12868c1d760a8d7cda6815"},{"openaireId":"doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b"},{"openaireId":"doajarticles::6eb31d13b12bc06bbac06aef63cf33c9"},{"openaireId":"doajarticles::0da84e9dfdc8419576169e027baa8028"},{"openaireId":"re3data_____::84e123776089ce3c7a33db98d9cd15a8"},{"openaireId":"openaire____::c5502a43e76feab55dd00cf50f519125"},{"openaireId":"re3data_____::a48f09c562b247a9919acfe195549b47"},{"openaireId":"opendoar____::97275a23ca44226c9964043c8462be96"}],"zenodoCommunities":[{"zenodoCommunityId":"storm"},{"zenodoCommunityId":"crosscult"},{"zenodoCommunityId":"wholodance_eu"},{"zenodoCommunityId":"digcur2013"},{"zenodoCommunityId":"gravitate"},{"zenodoCommunityId":"dipp2014"},{"zenodoCommunityId":"digitalhumanities"},{"zenodoCommunityId":"dimpo"},{"zenodoCommunityId":"adho"},{"zenodoCommunityId":"chc"},{"zenodoCommunityId":"wahr"},{"zenodoCommunityId":"ibe"},{"zenodoCommunityId":"ariadne"},{"zenodoCommunityId":"parthenos-hub"},{"zenodoCommunityId":"parthenos-training"},{"zenodoCommunityId":"gandhara"},{"zenodoCommunityId":"cmsouthasia"},{"zenodoCommunityId":"nilgirihills"},{"zenodoCommunityId":"shamsa_mustecio"},{"zenodoCommunityId":"bodhgaya"}],"organizationCommunity":[]}}} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml new file mode 100644 index 0000000000..a44372e4de --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml @@ -0,0 +1,1411 @@ + + + + + + + zenodo + + + + + + + + + + + + + + + + + + + + + + re3data_____::a507cdacc5bbcc08761c92185dee5cab + + + + + + + + + + + + rda + + + + + + + + SDG13 - Climate action + SDG8 - Decent work and economic growth + SDG15 - Life on land + SDG2 - Zero hunger + SDG17 - Partnerships for the goals + SDG10 - Reduced inequalities + SDG5 - Gender equality + SDG12 - Responsible consumption and production + SDG14 - Life below water + SDG6 - Clean water and sanitation + SDG11 - Sustainable cities and communities + SDG1 - No poverty + SDG3 - Good health and well being + SDG7 - Affordable and clean energy + SDG4 - Quality education + SDG9 - Industry innovation and infrastructure + SDG16 - Peace justice and strong institutions + + + + + + + + modern art + monuments + europeana data model + sites + field walking + frescoes + LIDO metadata schema + art history + excavation + 
Arts and Humanities General + cities + coins + temples + numismatics + lithics + roads + environmental archaeology + digital cultural heritage + archaeological reports + history + CRMba + churches + cultural heritage + archaeological stratigraphy + religious art + buidings + digital humanities + survey + archaeological sites + linguistic studies + bioarchaeology + architectural orders + palaeoanthropology + fine arts + europeana + CIDOC CRM + decorations + classic art + stratigraphy + digital archaeology + intangible cultural heritage + walls + humanities + chapels + CRMtex + Language and Literature + paintings + archaeology + fair data + mosaics + burials + architecture + medieval art + castles + CARARE metadata schema + statues + natural language processing + inscriptions + CRMsci + vaults + contemporary art + Arts and Humanities + CRMarchaeo + pottery + site + architectural + vessels + + + + re3data_____::9ebe127e5f3a0bf401875690f3bb6b81 + + + + doajarticles::c6cd4b532e12868c1d760a8d7cda6815 + + + + doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b + + + + doajarticles::6eb31d13b12bc06bbac06aef63cf33c9 + + + + doajarticles::0da84e9dfdc8419576169e027baa8028 + + + + re3data_____::84e123776089ce3c7a33db98d9cd15a8 + + + + openaire____::c5502a43e76feab55dd00cf50f519125 + + + + re3data_____::a48f09c562b247a9919acfe195549b47 + + + + opendoar____::97275a23ca44226c9964043c8462be96 + + + + + + storm + + + + crosscult + + + + wholodance_eu + + + + digcur2013 + + + + gravitate + + + + dipp2014 + + + + digitalhumanities + + + + dimpo + + + + adho + + + + chc + + + + wahr + + + + ibe + + + + ariadne + + + + parthenos-hub + + + + parthenos-training + + + + gandhara + + + + cmsouthasia + + + + nilgirihills + + + + shamsa_mustecio + + + + bodhgaya + + + + + + + + Stock Assessment + pelagic + Acoustic + Fish farming + Fisheries + Fishermen + maximum sustainable yield + trawler + Fishing vessel + Fisherman + Fishing gear + mackerel + RFMO + Fish Aggregating Device + Bycatch + Fishery + common fisheries policy + Fishing fleet + Aquaculture + + + + doajarticles::8cec81178926caaca531afbd8eb5d64c + + + + doajarticles::0f7a7f30b5400615cae1829f3e743982 + + + + doajarticles::9740f7f5af3e506d2ad2c215cdccd51a + + + + doajarticles::9f3fbaae044fa33cb7069b72935a3254 + + + + doajarticles::cb67f33eb9819f5c624ce0313957f6b3 + + + + doajarticles::e21c97cbb7a209afc75703681c462906 + + + + doajarticles::554cde3be9e5c4588b4c4f9f503120cb + + + + tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b + + + + doajarticles::57c5d3837da943e93b28ec4db82ec7a5 + + + + doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1 + + + + doajarticles::e21c97cbb7a209afc75703681c462906 + + + + doajarticles::dca64612dfe0963fffc119098a319957 + + + + doajarticles::dd70e44479f0ade25aa106aef3e87a0a + + + + + + discardless + + + + farfish2020 + + + + facts + + + + climefish + + + + proeel + + + + primefish + + + + h2020_vicinaqua + + + + meece + + + + rlsadb + + + + iotc_ctoi + + + + + + + + brain mapping + brain imaging + electroencephalography + arterial spin labelling + brain fingerprinting + brain + neuroimaging + Multimodal Brain Image Analysis + fMRI + neuroinformatics + fetal brain + brain ultrasonic imaging + topographic brain mapping + diffusion tensor imaging + computerized knowledge assessment + connectome mapping + brain magnetic resonance imaging + brain abnormalities + + + + re3data_____::5b9bf9171d92df854cf3c520692e9122 + + + + doajarticles::c7d3de67dc77af72f6747157441252ec + + + + re3data_____::8515794670370f49c1d176c399c714f5 + + + + 
doajarticles::d640648c84b10d425f96f11c3de468f3 + + + + doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a + + + + rest________::fb1a3d4523c95e63496e3bc7ba36244b + + + + + + neuroinformatics + + + + hbp + + + + from_neuroscience_to_machine_learning + + + + ci2c + + + + opensourcebrain + + + + brainspeak + + + + braincom + + + + nextgenvis + + + + meso-brain + + + + neuroplasticity-workshop + + + + bionics + + + + brainmattrain-676408 + + + + repronim + + + + affectiveneuro + + + + con + + + + lab_neurol_sperim_irfmn_irccs_milano_it + + + + + + + + marine + ocean + fish + aqua + sea + + + + + adriplan + + + + devotes-project + + + + euro-basin + + + + naclim + + + + discardless + + + + assisibf + + + + meece + + + + facts + + + + proeel + + + + aquatrace + + + + myfish + + + + atlas + + + + blue-actionh2020 + + + + sponges + + + + merces_project + + + + bigdataocean + + + + columbus + + + + h2020-aquainvad-ed + + + + aquarius + + + + southern-ocean-observing-system + + + + eawag + + + + mossco + + + + onc + + + + oceanbiogeochemistry + + + + oceanliteracy + + + + openearth + + + + ocean + + + + calcifierraman + + + + bermudabream + + + + brcorp1 + + + + mce + + + + biogeochem + + + + ecc2014 + + + + fisheries + + + + sedinstcjfas + + + + narmada + + + + umr-entropie + + + + farfish2020 + + + + primefish + + + + zf-ilcs + + + + climefish + + + + afrimed_eu + + + + spi-ace + + + + cice-consortium + + + + nemo-ocean + + + + mesopp-h2020 + + + + marxiv + + + + + + + + + + + instruct + + + + west-life + + + + + + + + + + + + + + animal production and health + fisheries and aquaculture + food safety and human nutrition + information management + food technology + agri-food education and extension + natural resources and environment + food system + engineering technology and Research + agriculture + food safety risk assessment + food security + farming practices and systems + plant production and protection + agri-food economics and policy + Agri-food + food distribution + forestry + + + + opendoar____::1a551829d50f1400b0dab21fdd969c04 + + + + opendoar____::49af6c4e558a7569d80eee2e035e2bd7 + + + + opendoar____::0266e33d3f546cb5436a10798e657d97 + + + + opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06 + + + + opendoar____::41bfd20a38bb1b0bec75acf0845530a7 + + + + opendoar____::87ae6fb631f7c8a627e8e28785d9992d + + + + + + edenis + + + + efsa-pilot + + + + egene3 + + + + efsa-kj + + + + euromixproject + + + + discardless + + + + sedinstcjfst + + + + afinet-kc + + + + 2231-4784 + + + + 2231-0606 + + + + solace + + + + pa17 + + + + smartakis + + + + sedinstcjae + + + + phenology_camera + + + + aginfra + + + + erosa + + + + bigdatagrapes + + + + + + + + + + opendoar____::7e7757b1e12abcb736ab9a754ffb617a + {"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]} + + + opendoar____::96da2f590cd7246bbde0051047b0d6f7 + {"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]} + + + + + dimpo + + + + + + + + + + + + + + + + + + + + Green Transport + City mobility systems + Vulnerable road users + Traffic engineering + Transport electrification + Mobility + Intermodal freight transport + Clean vehicle fleets + Intelligent mobility + Inflight refueling + District mobility systems + Navigation and control systems for optimised planning and routing + European Space Technology Platform + European Transport networks + Green cars + Inter-modality infrastructures + Advanced Take Off and Landing Ideas + Sustainable urban systems + port-area railway networks + 
Innovative forms of urban transport + Alliance for Logistics Innovation through Collaboration in Europe + Advisory Council for Aeronautics Research in Europe + Mobility services for people and goods + Guidance and traffic management + Passenger mobility + Smart mobility and services + transport innovation + high-speed railway + Vehicle design + Inland shipping + public transportation + aviation’s climate impact + Road transport + On-demand public transport + Personal Air Transport + Transport + transport vulnerability + Pipeline transport + European Association of Aviation Training and Education Organisations + Defrosting of railway infrastructure + Inclusive and affordable transport + River Information Services + jel:L92 + Increased use of public transport + Seamless mobility + STRIA + trolleybus transport + Intelligent Transport System + Low-emission alternative energy for transport + Shared mobility for people and goods + Business model for urban mobility + Interoperability of transport systems + Cross-border train slot booking + Air transport + Transport pricing + Sustainable transport + European Rail Transport Research Advisory Council + Alternative aircraft configurations + Transport and Mobility + Railways applications + urban transport + Environmental impact of transport + urban freight delivery systems + Automated Road Transport + Alternative fuels in public transport + Active LIDAR-sensor for GHG-measurements + Autonomous logistics operations + Rational use of motorised transport + Network and traffic management systems + electrification of railway wagons + Single European Sky + Electrified road systems + transportation planning + Railway dynamics + Motorway of the Sea + smart railway communications + Maritime transport + Environmental- friendly transport + Combined transport + Connected automated driving technology + Innovative freight logistics services + automated and shared vehicles + Alternative Aircraft Systems + Land-use and transport interaction + Public transport system + Business plan for shared mobility + Shared mobility + Growing of mobility demand + European Road Transport Research Advisory Council + WATERBORNE ETP + Effective transport management system + Short Sea Shipping + air traffic management + Sea hubs and the motorways of the sea + Urban mobility solutions + Smart city planning + Maritime spatial planning + EUropean rail Research Network of Excellence + Transport governance + ENERGY CONSUMPTION BY THE TRANSPORT SECTOR + Integrated urban plan + inland waterway services + European Conference of Transport Research Institutes + air vehicles + E-freight + Automated Driving + Automated ships + pricing for cross-border passenger transport + Vehicle efficiency + Railway transport + Electric vehicles + Road traffic monitoring + Deep sea shipping + Circular economy in transport + Traffic congestion + air transport system + Urban logistics + Rail transport + OpenStreetMap + high speed rail + Transportation engineering + Intermodal travel information + Flight Data Recorders + Advanced driver assistance systems + long distance freight transport + Inland waterway transport + Smart mobility + Mobility integration + Personal Rapid Transit system + Safety measures & requirements for roads + Green rail transport + Electrical + Vehicle manufacturing + Future Airport Layout + Rail technologies + European Intermodal Research Advisory Council + inland navigation + Automated urban vehicles + ECSS-standards + Traveller services + Polluting transport + Air Traffic Control + 
Cooperative and connected and automated transport + Innovative powertrains + Quality of transport system and services + door-to- door logistics chain + Inter-modal aspects of urban mobility + travel (and mobility) + Innovative freight delivery systems + urban freight delivery infrastructures + + + + doajarticles::1c5bdf8fca58937894ad1441cca99b76 + + + + doajarticles::b37a634324a45c821687e6e80e6f53b4 + + + + doajarticles::4bf64f2a104040e4e055cd9594b2d77c + + + + doajarticles::479ca537c12755d1868bbf02938a900c + + + + doajarticles::55f31df96a60e2309f45b7c265fcf7a2 + + + + doajarticles::c52a09891a5301f9986ebbfe3761810c + + + + doajarticles::379807bc7f6c71a227ef1651462c414c + + + + doajarticles::36069db531a00b85a2e8fb301f4bdc19 + + + + doajarticles::b6a898da311ded96fabf49c520b80d5d + + + + doajarticles::d0753d9180b35a271d8b4a31f449749f + + + + doajarticles::172050a92511838393a3fe237ae47e31 + + + + doajarticles::301ed96c62abb160a3e29796efe5c95c + + + + doajarticles::0f4f805b3d842f2c7f1b077c3426fa59 + + + + doajarticles::ba73728b84437b8d48ae287b867c7215 + + + + doajarticles::86faef424d804309ccf45f692523aa48 + + + + doajarticles::73bd758fa41671de70964c3ecba013af + + + + doajarticles::e661fc0bdb24af42b740a08f0ddc6cf4 + + + + doajarticles::a6d3052047d5dbfbd43d95b4afb0f3d7 + + + + doajarticles::ca61df07089acc53a1569bde6673d82a + + + + doajarticles::237dd6f1606600459d0297abd8ed9976 + + + + doajarticles::fba6191177ede7c51ea1cdf58eae7f8b + + + + + + jsdtl + + + + utc-martrec + + + + utc-uti + + + + stp + + + + c2smart + + + + stride-utc + + + + crowd4roads + + + + lemo + + + + imov3d + + + + tra2018 + + + + optimum + + + + stars + + + + iecteim + + + + iccpt2019 + + + + + + + + + + + + + + + + + + + + + + + + + + Sustainability-oriented science policy + STI policies + science—society relations + Science & Technology Policy + Innovation policy + science policy + Policy and Law + + + + doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d + + + + doajarticles::ae4c7286c79590f19fdca670156ce816 + + + + doajarticles::0f664bce92ce953e0c7a92068c46bfb3 + + + + doajarticles::00017183dc4c858fb77541985323a4ef + + + + doajarticles::93b306f458cce3d7aaaf58c0a725f4f9 + + + + doajarticles::9dbf8fbf3e9fe0fe1fc01e55fbd90bfc + + + + doajarticles::a2bda8785c863279bba4b8f34827b4c9 + + + + doajarticles::019a1fcb42c3fea1c1b689df76330b58 + + + + doajarticles::0daa8281938831e9c82bfed8b55a2975 + + + + doajarticles::f67ad6d268162079b3abd51a24468744 + + + + doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d + + + + doajarticles::ad114356e196a4a3d84dda59c720dacd + + + + doajarticles::01e8a54fdecaaf354c67a2dd74ae7d4f + + + + doajarticles::449305f096b10a9464449ff2d0e10e06 + + + + doajarticles::982c0c0ac378256254cce2fa6572bb6c + + + + doajarticles::49d6ed47138884566ce93cf0ccb12c02 + + + + doajarticles::a98e820dbc2e8ee0fc84ab66f263267c + + + + doajarticles::50b1ce37427b36368f8f0f1317e47f83 + + + + doajarticles::f0ec29b7450b2ac5d0ad45327eeb531a + + + + doajarticles::d8d421d3b0349a7aaa93758b27a54e84 + + + + doajarticles::7ffc35ac5133da01d421ccf8af5b70bc + + + + + + risis + + + + + + + + COVID-19 + Severe acute respiratory syndrome coronavirus 2 + SARS-CoV-2 + COVID19 + 2019 novel coronavirus + coronavirus disease 2019 + HCoV-19 + mesh:C000657245 + 2019-nCoV + coronavirus disease-19 + mesh:COVID-19 + COVID2019 + + + + opendoar____::358aee4cc897452c00244351e4d91f69 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + 
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + re3data_____::7b0ad08687b2c960d5aeef06f811d5e6 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + driver______::bee53aa31dc2cbb538c10c2b65fa5824 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + openaire____::437f4b072b1aa198adcbc35910ff3b98 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + openaire____::081b82f96300b6a6e3d282bad31cb6e2 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + openaire____::9e3be59865b2c1c335d32dae2fe7b254 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + opendoar____::8b6dd7db9af49e67306feb59a8bdc52c + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + share_______::4719356ec8d7d55d3feb384ce879ad6c + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + share_______::bbd802baad85d1fd440f32a7a3a2c2b1 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + opendoar____::6f4922f45568161a8cdf4ad2299f6d23 + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + + + + re3data_____::7980778c78fb4cf0fab13ce2159030dc + {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]} + + + re3data_____::978378def740bbf2bfb420de868c460b + 
{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]} + + + + + chicago-covid-19 + + + + covid-19-senacyt-panama-sample + + + + covid-19-tx-rct-stats-review + + + + covid_19_senacyt_abc_panama + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates/dataset_10.json.gz new file mode 100644 index 0000000000..bd29d59ae9 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz new file mode 100644 index 0000000000..2eb33c5a49 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz new file mode 100644 index 0000000000..ee62cd7913 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext/dataset_10.json.gz new file mode 100644 index 0000000000..cf3c3aa7b6 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/update_subject_datasource/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/update_subject_datasource/dataset_10.json.gz new file mode 100644 index 0000000000..fdc76a04c8 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/update_subject_datasource/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz new file mode 100644 index 0000000000..fdc76a04c8 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz differ diff --git 
a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz new file mode 100644 index 0000000000..ea9e212bd4 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/publication_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/publication_10.json.gz new file mode 100644 index 0000000000..99c4015e71 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/publication_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/software/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/software/software_10.json.gz new file mode 100644 index 0000000000..3dcadf41d7 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/software/software_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz new file mode 100644 index 0000000000..d838bee992 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz new file mode 100644 index 0000000000..547ddee05b Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz new file mode 100644 index 0000000000..118291ec57 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz new file mode 100644 index 0000000000..778f722624 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz 
b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz new file mode 100644 index 0000000000..f4630a7a46 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz new file mode 100644 index 0000000000..141bed5f35 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz new file mode 100644 index 0000000000..9dc35adae6 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz new file mode 100644 index 0000000000..bd0c4f29bf Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz new file mode 100644 index 0000000000..90de4ac35f Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz new file mode 100644 index 0000000000..400e2dc93e Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz new file mode 100644 index 0000000000..8b452d0e17 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz 
b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz new file mode 100644 index 0000000000..dccc28c879 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz new file mode 100644 index 0000000000..371427e5f1 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz new file mode 100644 index 0000000000..bf77a6eba9 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz new file mode 100644 index 0000000000..ee822e372a Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz new file mode 100644 index 0000000000..074789c0bf Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz new file mode 100644 index 0000000000..d97498cddc Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz new file mode 100644 index 0000000000..c0b27d2aa3 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz differ diff --git 
a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz new file mode 100644 index 0000000000..d9b92debaa Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz new file mode 100644 index 0000000000..9ab6f1fa02 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz new file mode 100644 index 0000000000..ee822e372a Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz differ diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz new file mode 100644 index 0000000000..323d66d6e7 Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 503e4c5047..739c7a4629 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 
e96c410669..e5e348642c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -30,8 +30,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; -import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Dataset; @@ -94,7 +94,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication log.info("Processing orgs..."); smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); - log.info("Processing relations ds <-> orgs ..."); + log.info("Processing relationsNoRemoval ds <-> orgs ..."); smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); log.info("Processing projects <-> orgs ..."); @@ -370,6 +370,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final DataInfo info = dataInfo( false, null, false, false, + qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9"); final List collectedFrom = listKeyValues( @@ -460,7 +461,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); return dataInfo( + deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); + } private Qualifier prepareQualifierSplitting(final String s) { @@ -516,6 +519,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null; final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null; + final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2].trim() : null; if (issn != null || eissn != null || lissn != null) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz index c2389b7677..a5b8c8774c 100644 Binary files a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz differ diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml index 0467e618f6..7c918a0d70 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml @@ -4,7 +4,7 @@ - + Data Provision [OCEAN] @@ -131,6 +131,16 @@ + + Set the target path to store the blacklisted graph + + blacklistedGraphPath + /tmp/beta_provision/graph/12_graph_blacklisted + + + + + Set the lookup address @@ -155,64 +165,8 @@ Set the map of associations organization, community list for the propagation of community to result through organization propagationOrganizationCommunityMap - - { - "20|corda__h2020::3fb05a9524c3f790391261347852f638":["mes","euromarine"], - "20|corda__h2020::e8dbe14cca9bf6fce09d468872f813f8":["mes","euromarine"], - "20|snsf________::9b253f265e3bef5cae6d881fdf61aceb":["mes","euromarine"], - "20|rcuk________::e054eea0a47665af8c3656b5785ccf76":["mes","euromarine"], - "20|corda__h2020::edc18d67c9b11fb616ca9f6e1db1b151":["mes","euromarine"], - "20|rcuk________::d5736d9da90521ddcdc7828a05a85e9a":["mes","euromarine"], - "20|corda__h2020::f5d418d3aa1cf817ddefcc3fdc039f27":["mes","euromarine"], - "20|snsf________::8fa091f8f25a846779acb4ea97b50aef":["mes","euromarine"], - "20|corda__h2020::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"], - "20|corda_______::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"], - "20|snsf________::31d0a100e54e3cdb3c6f52d91e638c78":["mes","euromarine"], - "20|corda__h2020::ea379ef91b8cc86f9ac5edc4169292db":["mes","euromarine"], - "20|corda__h2020::f75ee2ee48e5cb0ec8c8d30aaa8fef70":["mes","euromarine"], - "20|rcuk________::e16010089551a1a9182a94604fc0ea59":["mes","euromarine"], - "20|corda__h2020::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"], - "20|corda_______::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"], - "20|grid________::b2cbbf5eadbbf87d534b022bad3191d7":["mes","euromarine"], - "20|snsf________::74730ef1439d7f7636a8be58a6b471b8":["mes","euromarine"], - "20|nsf_________::ad72e19043a5a467e35f9b444d11563e":["mes","euromarine"], - "20|rcuk________::0fc3e92500290902a2d38ec2445e74c3":["mes","euromarine"], - "20|grid________::ad2c29905da0eb3c06b3fa80cacd89ea":["mes","euromarine"], - "20|corda__h2020::30b53e4d63d3724f00acb9cbaca40860":["mes","euromarine"], - "20|corda__h2020::f60f84bee14ad93f0db0e49af1d5c317":["mes","euromarine"], - "20|corda__h2020::7bf251ac3765b5e89d82270a1763d09f":["mes","euromarine"], - "20|corda__h2020::65531bd11be9935948c7f2f4db1c1832":["mes","euromarine"], - "20|corda__h2020::e0e98f86bbc76638bbb72a8fe2302946":["mes","euromarine"], - "20|snsf________::3eb43582ac27601459a8d8b3e195724b":["mes","euromarine"], - 
"20|corda__h2020::af2481dab65d06c8ea0ae02b5517b9b6":["mes","euromarine"], - "20|corda__h2020::c19d05cfde69a50d3ebc89bd0ee49929":["mes","euromarine"], - "20|corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0":["mes","euromarine"], - "20|rcuk________::f33c02afb0dc66c49d0ed97ca5dd5cb0":["beopen"], - "20|grid________::a867f78acdc5041b34acfe4f9a349157":["beopen"], - "20|grid________::7bb116a1a9f95ab812bf9d2dea2be1ff":["beopen"], - "20|corda__h2020::6ab0e0739dbe625b99a2ae45842164ad":["beopen"], - "20|corda__h2020::8ba50792bc5f4d51d79fca47d860c602":["beopen"], - "20|corda_______::8ba50792bc5f4d51d79fca47d860c602":["beopen"], - "20|corda__h2020::e70e9114979e963eef24666657b807c3":["beopen"], - "20|corda_______::e70e9114979e963eef24666657b807c3":["beopen"], - "20|corda_______::15911e01e9744d57205825d77c218737":["beopen"], - "20|opendoar____::056a41e24e2a9a67215e87bbee6a80ab":["beopen"], - "20|opendoar____::7f67f2e6c6fbb0628f8160fcd3d92ae3":["beopen"], - "20|grid________::a8ecfd7c084e561168bcbe6bf0daf3e3":["beopen"], - "20|corda_______::7bbe6cc5d8ec1864739a04b0d020c9e9":["beopen"], - "20|corda_______::3ff558e30c2e434d688539548300b050":["beopen"], - "20|corda__h2020::5ffee5b3b83b33a8cf0e046877bd3a39":["beopen"], - "20|corda__h2020::5187217e2e806a6df3579c46f82401bc":["beopen"], - "20|grid________::5fa7e2709bcd945e26bfa18689adeec1":["beopen"], - "20|corda_______::d8696683c53027438031a96ad27c3c07":["beopen"], - "20|corda__h2020::d8696683c53027438031a96ad27c3c07":["beopen"], - "20|rcuk________::23a79ebdfa59790864e4a485881568c1":["beopen"], - "20|corda__h2020::b76cf8fe49590a966953c37e18608af9":["beopen"], - "20|grid________::d2f0204126ee709244a488a4cd3b91c2":["beopen"], - "20|corda__h2020::05aba9d2ed17533d15221e5655ac11e6":["beopen"], - "20|grid________::802401579481dc32062bdee69f5e6a34":["beopen"], - "20|corda__h2020::3f6d9d54cac975a517ba6b252c81582d":["beopen"] - } + {"20|corda__h2020::3fb05a9524c3f790391261347852f638":["mes","euromarine"], "20|corda__h2020::e8dbe14cca9bf6fce09d468872f813f8":["mes","euromarine"], 
"20|snsf________::9b253f265e3bef5cae6d881fdf61aceb":["mes","euromarine"],"20|rcuk________::e054eea0a47665af8c3656b5785ccf76":["mes","euromarine"],"20|corda__h2020::edc18d67c9b11fb616ca9f6e1db1b151":["mes","euromarine"],"20|rcuk________::d5736d9da90521ddcdc7828a05a85e9a":["mes","euromarine"],"20|corda__h2020::f5d418d3aa1cf817ddefcc3fdc039f27":["mes","euromarine"],"20|snsf________::8fa091f8f25a846779acb4ea97b50aef":["mes","euromarine"],"20|corda__h2020::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"],"20|corda_______::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"],"20|snsf________::31d0a100e54e3cdb3c6f52d91e638c78":["mes","euromarine"],"20|corda__h2020::ea379ef91b8cc86f9ac5edc4169292db":["mes","euromarine"],"20|corda__h2020::f75ee2ee48e5cb0ec8c8d30aaa8fef70":["mes","euromarine"],"20|rcuk________::e16010089551a1a9182a94604fc0ea59":["mes","euromarine"],"20|corda__h2020::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"],"20|corda_______::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"],"20|grid________::b2cbbf5eadbbf87d534b022bad3191d7":["mes","euromarine"],"20|snsf________::74730ef1439d7f7636a8be58a6b471b8":["mes","euromarine"],"20|nsf_________::ad72e19043a5a467e35f9b444d11563e":["mes","euromarine"],"20|rcuk________::0fc3e92500290902a2d38ec2445e74c3":["mes","euromarine"],"20|grid________::ad2c29905da0eb3c06b3fa80cacd89ea":["mes","euromarine"],"20|corda__h2020::30b53e4d63d3724f00acb9cbaca40860":["mes","euromarine"],"20|corda__h2020::f60f84bee14ad93f0db0e49af1d5c317":["mes","euromarine"], "20|corda__h2020::7bf251ac3765b5e89d82270a1763d09f":["mes","euromarine"], "20|corda__h2020::65531bd11be9935948c7f2f4db1c1832":["mes","euromarine"], "20|corda__h2020::e0e98f86bbc76638bbb72a8fe2302946":["mes","euromarine"], "20|snsf________::3eb43582ac27601459a8d8b3e195724b":["mes","euromarine"], "20|corda__h2020::af2481dab65d06c8ea0ae02b5517b9b6":["mes","euromarine"], "20|corda__h2020::c19d05cfde69a50d3ebc89bd0ee49929":["mes","euromarine"], "20|corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0":["mes","euromarine"], "20|rcuk________::f33c02afb0dc66c49d0ed97ca5dd5cb0":["beopen"], + "20|grid________::a867f78acdc5041b34acfe4f9a349157":["beopen"], "20|grid________::7bb116a1a9f95ab812bf9d2dea2be1ff":["beopen"], "20|corda__h2020::6ab0e0739dbe625b99a2ae45842164ad":["beopen"], "20|corda__h2020::8ba50792bc5f4d51d79fca47d860c602":["beopen"], "20|corda_______::8ba50792bc5f4d51d79fca47d860c602":["beopen"], "20|corda__h2020::e70e9114979e963eef24666657b807c3":["beopen"], "20|corda_______::e70e9114979e963eef24666657b807c3":["beopen"], "20|corda_______::15911e01e9744d57205825d77c218737":["beopen"], "20|opendoar____::056a41e24e2a9a67215e87bbee6a80ab":["beopen"], "20|opendoar____::7f67f2e6c6fbb0628f8160fcd3d92ae3":["beopen"], "20|grid________::a8ecfd7c084e561168bcbe6bf0daf3e3":["beopen"], "20|corda_______::7bbe6cc5d8ec1864739a04b0d020c9e9":["beopen"], "20|corda_______::3ff558e30c2e434d688539548300b050":["beopen"], "20|corda__h2020::5ffee5b3b83b33a8cf0e046877bd3a39":["beopen"], "20|corda__h2020::5187217e2e806a6df3579c46f82401bc":["beopen"], "20|grid________::5fa7e2709bcd945e26bfa18689adeec1":["beopen"], "20|corda_______::d8696683c53027438031a96ad27c3c07":["beopen"], "20|corda__h2020::d8696683c53027438031a96ad27c3c07":["beopen"], "20|rcuk________::23a79ebdfa59790864e4a485881568c1":["beopen"], "20|corda__h2020::b76cf8fe49590a966953c37e18608af9":["beopen"], "20|grid________::d2f0204126ee709244a488a4cd3b91c2":["beopen"], "20|corda__h2020::05aba9d2ed17533d15221e5655ac11e6":["beopen"], 
"20|grid________::802401579481dc32062bdee69f5e6a34":["beopen"], "20|corda__h2020::3f6d9d54cac975a517ba6b252c81582d":["beopen"]} @@ -273,8 +227,8 @@ 'mongoDb' : 'mdstore', 'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus', 'postgresUser' : 'dnet', - 'postgresPassword' : '*****', - 'reuseContent' : 'false', + 'postgresPassword' : 'dnetPwd', + 'reuseContent' : 'true', 'contentPath' : '/tmp/beta_provision/aggregator', 'workingDir' : '/tmp/beta_provision/working_dir/aggregator' } @@ -403,7 +357,6 @@ - propagates ORCID among results linked by allowedsemrels semantic relationships @@ -429,7 +382,6 @@ - mark results respecting some rules as belonging to communities @@ -440,7 +392,7 @@ 'sourcePath' : 'orcidGraphPath', 'outputPath': 'bulkTaggingGraphPath', 'isLookUpUrl' : 'isLookUpUrl', - 'pathMap' : 'bulkTaggingPathMap', + 'pathMap' : 'bulkTaggingPathMap' } @@ -455,7 +407,6 @@ - creates relashionships between results and organizations when the organizations are associated to institutional repositories @@ -464,14 +415,14 @@ { 'sourcePath' : 'bulkTaggingGraphPath', - 'outputPath': 'affiliationGraphPath', - 'saveGraph' : 'true' + 'outputPath': 'affiliationGraphPath' } { 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/affiliation/oozie_app', - 'workingDir' : '/tmp/beta_provision/working_dir/affiliation' + 'workingDir' : '/tmp/beta_provision/working_dir/affiliation', + 'saveGraph' : 'true' } build-report @@ -480,7 +431,6 @@ - marks as belonging to communities the result collected from datasources related to the organizations specified in the organizationCommunityMap @@ -506,7 +456,6 @@ - created relation between projects and results linked to other results trough allowedsemrel semantic relations linked to projects @@ -532,7 +481,6 @@ - tag as belonging to communitites result in in allowedsemrels relation with other result already linked to communities @@ -542,14 +490,15 @@ { 'sourcePath' : 'fundingGraphPath', 'outputPath': 'communitySemRelGraphPath', - 'isLookupUrl' : 'isLookUpUrl' + 'isLookUpUrl' : 'isLookUpUrl' } { 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/community_semrel/oozie_app', 'workingDir' : '/tmp/beta_provision/working_dir/community_semrel', - 'allowedsemrels' : 'isSupplementedBy;isSupplementTo' + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo', + 'saveGraph' : 'true' } build-report @@ -558,7 +507,6 @@ - associated to results colleced from allowedtypes and those in the whithelist the country of the organization(s) handling the datasource it is collected from @@ -581,16 +529,42 @@ build-report + + + + + + removes blacklisted relations + + executeOozieJob + IIS + + { + 'sourcePath' : 'countryGraphPath', + 'outputPath': 'blacklistedGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/blacklist/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/blacklist', + 'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus', + 'postgresUser' : 'dnet', + 'postgresPassword' : 'dnetPwd' + } + + build-report + - wf_20200428_155848_495 - 2020-04-28T16:53:23+00:00 + wf_20200509_100941_857 + 2020-05-09T13:26:09+00:00 FAILURE - + eu.dnetlib.data.hadoop.rmi.HadoopServiceException: hadoop job: 0002933-200403132837156-oozie-oozi-W failed with status: KILLED, oozie log: 2020-05-09 13:23:31,194 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] 
ACTION[] No results found 2020-05-09 13:23:31,216 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@:start:] Start action [0002933-200403132837156-oozie-oozi-W@:start:] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:31,216 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@:start:] [***0002933-200403132837156-oozie-oozi-W@:start:***]Action status=DONE 2020-05-09 13:23:31,216 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@:start:] [***0002933-200403132837156-oozie-oozi-W@:start:***]Action updated in DB! 2020-05-09 13:23:31,257 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@:start:] No results found 2020-05-09 13:23:31,275 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@:start:] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@:start: 2020-05-09 13:23:31,275 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W 2020-05-09 13:23:31,314 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@reset-outputpath] Start action [0002933-200403132837156-oozie-oozi-W@reset-outputpath] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:33,897 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@reset-outputpath] [***0002933-200403132837156-oozie-oozi-W@reset-outputpath***]Action status=DONE 2020-05-09 13:23:33,897 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@reset-outputpath] [***0002933-200403132837156-oozie-oozi-W@reset-outputpath***]Action updated in DB! 
2020-05-09 13:23:33,947 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@reset-outputpath] No results found 2020-05-09 13:23:33,966 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] Start action [0002933-200403132837156-oozie-oozi-W@copy_entities] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:33,966 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] [***0002933-200403132837156-oozie-oozi-W@copy_entities***]Action status=DONE 2020-05-09 13:23:33,966 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] [***0002933-200403132837156-oozie-oozi-W@copy_entities***]Action updated in DB! 2020-05-09 13:23:34,012 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] No results found 2020-05-09 13:23:34,018 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] No results found 2020-05-09 13:23:34,023 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] No results found 2020-05-09 13:23:34,029 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] No results found 2020-05-09 13:23:34,124 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] Start action [0002933-200403132837156-oozie-oozi-W@copy_relation] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:34,130 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] Start action [0002933-200403132837156-oozie-oozi-W@copy_projects] 
with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:34,130 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] Start action [0002933-200403132837156-oozie-oozi-W@copy_datasources] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:34,140 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] Start action [0002933-200403132837156-oozie-oozi-W@copy_organization] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:23:35,010 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] checking action, hadoop job ID [job_1585920557248_14569] status [RUNNING] 2020-05-09 13:23:35,018 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] [***0002933-200403132837156-oozie-oozi-W@copy_projects***]Action status=RUNNING 2020-05-09 13:23:35,018 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] [***0002933-200403132837156-oozie-oozi-W@copy_projects***]Action updated in DB! 2020-05-09 13:23:35,022 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] checking action, hadoop job ID [job_1585920557248_14568] status [RUNNING] 2020-05-09 13:23:35,027 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_projects 2020-05-09 13:23:35,028 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] [***0002933-200403132837156-oozie-oozi-W@copy_relation***]Action status=RUNNING 2020-05-09 13:23:35,028 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] [***0002933-200403132837156-oozie-oozi-W@copy_relation***]Action updated in DB! 2020-05-09 13:23:35,031 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] checking action, hadoop job ID [job_1585920557248_14570] status [RUNNING] 2020-05-09 13:23:35,035 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] [***0002933-200403132837156-oozie-oozi-W@copy_datasources***]Action status=RUNNING 2020-05-09 13:23:35,035 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] [***0002933-200403132837156-oozie-oozi-W@copy_datasources***]Action updated in DB! 2020-05-09 13:23:35,037 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_relation 2020-05-09 13:23:35,048 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_datasources 2020-05-09 13:23:35,072 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] checking action, hadoop job ID [job_1585920557248_14571] status [RUNNING] 2020-05-09 13:23:35,076 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] [***0002933-200403132837156-oozie-oozi-W@copy_organization***]Action status=RUNNING 2020-05-09 13:23:35,076 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] [***0002933-200403132837156-oozie-oozi-W@copy_organization***]Action updated in DB! 2020-05-09 13:23:35,084 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_organization 2020-05-09 13:23:35,090 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_entities] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_entities 2020-05-09 13:23:35,090 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@reset-outputpath] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@reset-outputpath 2020-05-09 13:23:58,926 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] callback for action [0002933-200403132837156-oozie-oozi-W@copy_datasources] 2020-05-09 13:23:59,085 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] checking action, hadoop job ID [job_1585920557248_14570] status [RUNNING] 2020-05-09 13:23:59,242 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] callback for action [0002933-200403132837156-oozie-oozi-W@copy_projects] 2020-05-09 13:23:59,386 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] checking action, hadoop job ID [job_1585920557248_14569] status [RUNNING] 2020-05-09 13:24:01,343 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] callback for action [0002933-200403132837156-oozie-oozi-W@copy_datasources] 2020-05-09 13:24:01,418 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] Hadoop Jobs launched : [job_1585920557248_14573] 2020-05-09 13:24:01,418 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] action completed, external ID [job_1585920557248_14570] 2020-05-09 13:24:01,493 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_datasources] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_datasources 2020-05-09 13:24:01,935 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] callback for action [0002933-200403132837156-oozie-oozi-W@copy_projects] 2020-05-09 13:24:02,012 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] Hadoop Jobs launched : [job_1585920557248_14572] 2020-05-09 13:24:02,012 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] action completed, external ID [job_1585920557248_14569] 2020-05-09 13:24:02,076 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_projects] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_projects 2020-05-09 13:25:03,172 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] callback for action [0002933-200403132837156-oozie-oozi-W@copy_organization] 2020-05-09 13:25:03,336 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] checking action, hadoop job ID [job_1585920557248_14571] status [RUNNING] 2020-05-09 13:25:05,598 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] callback for action [0002933-200403132837156-oozie-oozi-W@copy_organization] 2020-05-09 13:25:05,688 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] Hadoop Jobs launched : [job_1585920557248_14574] 2020-05-09 13:25:05,691 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] action completed, external ID [job_1585920557248_14571] 2020-05-09 13:25:05,748 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_organization] No Notification URL is 
defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_organization 2020-05-09 13:25:23,274 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] callback for action [0002933-200403132837156-oozie-oozi-W@copy_relation] 2020-05-09 13:25:23,409 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] checking action, hadoop job ID [job_1585920557248_14568] status [RUNNING] 2020-05-09 13:25:25,419 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] callback for action [0002933-200403132837156-oozie-oozi-W@copy_relation] 2020-05-09 13:25:25,510 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] Hadoop Jobs launched : [job_1585920557248_14575] 2020-05-09 13:25:25,511 INFO org.apache.oozie.action.hadoop.DistcpActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] action completed, external ID [job_1585920557248_14568] 2020-05-09 13:25:25,565 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] No results found 2020-05-09 13:25:25,585 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_wait] Start action [0002933-200403132837156-oozie-oozi-W@copy_wait] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:25,585 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_wait] [***0002933-200403132837156-oozie-oozi-W@copy_wait***]Action status=DONE 2020-05-09 13:25:25,585 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_wait] [***0002933-200403132837156-oozie-oozi-W@copy_wait***]Action updated in DB! 
2020-05-09 13:25:25,627 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_wait] No results found 2020-05-09 13:25:25,648 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] Start action [0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:25,648 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] [***0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1***]Action status=DONE 2020-05-09 13:25:25,648 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] [***0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1***]Action updated in DB! 2020-05-09 13:25:25,694 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] No results found 2020-05-09 13:25:25,700 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] No results found 2020-05-09 13:25:25,706 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] No results found 2020-05-09 13:25:25,711 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] No results found 2020-05-09 13:25:25,801 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] Start action [0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:25,825 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] 
ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] Start action [0002933-200403132837156-oozie-oozi-W@join_prepare_software] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:25,825 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] Start action [0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:25,828 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] Start action [0002933-200403132837156-oozie-oozi-W@join_prepare_publication] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:27,165 INFO org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] checking action, hadoop job ID [job_1585920557248_14578] status [RUNNING] 2020-05-09 13:25:27,170 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] [***0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct***]Action status=RUNNING 2020-05-09 13:25:27,170 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] [***0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct***]Action updated in DB! 2020-05-09 13:25:27,179 INFO org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] checking action, hadoop job ID [job_1585920557248_14577] status [RUNNING] 2020-05-09 13:25:27,181 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct 2020-05-09 13:25:27,183 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] [***0002933-200403132837156-oozie-oozi-W@join_prepare_software***]Action status=RUNNING 2020-05-09 13:25:27,183 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] [***0002933-200403132837156-oozie-oozi-W@join_prepare_software***]Action updated in DB! 2020-05-09 13:25:27,188 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_software 2020-05-09 13:25:27,617 INFO org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] checking action, hadoop job ID [job_1585920557248_14576] status [RUNNING] 2020-05-09 13:25:27,622 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] [***0002933-200403132837156-oozie-oozi-W@join_prepare_publication***]Action status=RUNNING 2020-05-09 13:25:27,622 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] [***0002933-200403132837156-oozie-oozi-W@join_prepare_publication***]Action updated in DB! 2020-05-09 13:25:27,625 INFO org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] checking action, hadoop job ID [job_1585920557248_14579] status [RUNNING] 2020-05-09 13:25:27,628 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_publication 2020-05-09 13:25:27,629 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] [***0002933-200403132837156-oozie-oozi-W@join_prepare_dataset***]Action status=RUNNING 2020-05-09 13:25:27,629 INFO org.apache.oozie.command.wf.ForkedActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] [***0002933-200403132837156-oozie-oozi-W@join_prepare_dataset***]Action updated in DB! 2020-05-09 13:25:27,634 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_dataset 2020-05-09 13:25:27,639 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@fork_prepare_assoc_step1 2020-05-09 13:25:27,639 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_wait] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_wait 2020-05-09 13:25:27,640 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@copy_relation] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@copy_relation 2020-05-09 13:25:41,416 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] callback for action [0002933-200403132837156-oozie-oozi-W@join_prepare_software] 2020-05-09 13:25:41,490 INFO org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] action completed, external ID [job_1585920557248_14577] 2020-05-09 13:25:41,495 WARN org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] Launcher ERROR, reason: Main class [org.apache.oozie.action.hadoop.SparkMain], main() threw exception, File file:/data/3/yarn/nm/usercache/dnet.beta/appcache/application_1585920557248_14577/container_e68_1585920557248_14577_01_000002/dhp-propagation-1.1.8-SNAPSHOT.jar does not exist 2020-05-09 13:25:41,495 WARN org.apache.oozie.action.hadoop.SparkActionExecutor: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] Launcher exception: File file:/data/3/yarn/nm/usercache/dnet.beta/appcache/application_1585920557248_14577/container_e68_1585920557248_14577_01_000002/dhp-propagation-1.1.8-SNAPSHOT.jar does not exist java.io.FileNotFoundException: File file:/data/3/yarn/nm/usercache/dnet.beta/appcache/application_1585920557248_14577/container_e68_1585920557248_14577_01_000002/dhp-propagation-1.1.8-SNAPSHOT.jar does not exist at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:598) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:811) at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:588) at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:432) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:340) at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:292) at org.apache.spark.deploy.yarn.Client.copyFileToRemote(Client.scala:404) at org.apache.spark.deploy.yarn.Client.org$apache$spark$deploy$yarn$Client$$distribute$1(Client.scala:496) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$9.apply(Client.scala:595) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$9.apply(Client.scala:594) at scala.Option.foreach(Option.scala:257) at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:594) at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:886) at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:180) at org.apache.spark.deploy.yarn.Client.run(Client.scala:1156) at org.apache.spark.deploy.yarn.YarnClusterApplication.start(Client.scala:1608) at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:849) at 
org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:167) at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:195) at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86) at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:924) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:933) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) at org.apache.oozie.action.hadoop.SparkMain.runSpark(SparkMain.java:178) at org.apache.oozie.action.hadoop.SparkMain.run(SparkMain.java:90) at org.apache.oozie.action.hadoop.LauncherMain.run(LauncherMain.java:81) at org.apache.oozie.action.hadoop.SparkMain.main(SparkMain.java:57) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.oozie.action.hadoop.LauncherMapper.map(LauncherMapper.java:235) at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54) at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:459) at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343) at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1924) at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158) 2020-05-09 13:25:41,514 INFO org.apache.oozie.command.wf.ActionEndXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] ERROR is considered as FAILED for SLA 2020-05-09 13:25:41,541 INFO org.apache.oozie.service.JPAService: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] No results found 2020-05-09 13:25:41,580 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@Kill] Start action [0002933-200403132837156-oozie-oozi-W@Kill] with user-retry state : userRetryCount [0], userRetryMax [0], userRetryInterval [10] 2020-05-09 13:25:41,580 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@Kill] [***0002933-200403132837156-oozie-oozi-W@Kill***]Action status=DONE 2020-05-09 13:25:41,580 INFO org.apache.oozie.command.wf.ActionStartXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@Kill] [***0002933-200403132837156-oozie-oozi-W@Kill***]Action updated in DB! 
2020-05-09 13:25:41,692 WARN org.apache.oozie.workflow.lite.LiteWorkflowInstance: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@Kill] Workflow completed [KILLED], killing [3] running nodes 2020-05-09 13:25:41,760 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@Kill] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@Kill 2020-05-09 13:25:41,766 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_software] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_software 2020-05-09 13:25:41,852 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct 2020-05-09 13:25:41,914 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] callback for action [0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] 2020-05-09 13:25:41,920 ERROR org.apache.oozie.command.wf.CompletedActionXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] XException, org.apache.oozie.command.CommandException: E0800: Action it is not running its in [KILLED] state, action [0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] at org.apache.oozie.command.wf.CompletedActionXCommand.eagerVerifyPrecondition(CompletedActionXCommand.java:92) at org.apache.oozie.command.XCommand.call(XCommand.java:257) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at org.apache.oozie.service.CallableQueueService$CallableWrapper.run(CallableQueueService.java:179) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) 2020-05-09 13:25:41,938 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_publication 2020-05-09 13:25:42,005 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] callback for action [0002933-200403132837156-oozie-oozi-W@join_prepare_publication] 2020-05-09 13:25:42,010 ERROR org.apache.oozie.command.wf.CompletedActionXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_publication] XException, org.apache.oozie.command.CommandException: E0800: Action it is not running its in [KILLED] state, action [0002933-200403132837156-oozie-oozi-W@join_prepare_publication] at org.apache.oozie.command.wf.CompletedActionXCommand.eagerVerifyPrecondition(CompletedActionXCommand.java:92) at org.apache.oozie.command.XCommand.call(XCommand.java:257) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at org.apache.oozie.service.CallableQueueService$CallableWrapper.run(CallableQueueService.java:179) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) 2020-05-09 13:25:42,028 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[dnet.beta] GROUP[-] TOKEN[] APP[orcid_to_result_from_semrel_propagation] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[] No Notification URL is defined. Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W 2020-05-09 13:25:42,028 INFO org.apache.oozie.command.wf.WorkflowNotificationXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_dataset] No Notification URL is defined. 
Therefore nothing to notify for job 0002933-200403132837156-oozie-oozi-W@join_prepare_dataset 2020-05-09 13:25:42,113 INFO org.apache.oozie.servlet.CallbackServlet: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[-] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] callback for action [0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] 2020-05-09 13:25:42,116 ERROR org.apache.oozie.command.wf.CompletedActionXCommand: SERVER[iis-cdh5-test-m3.ocean.icm.edu.pl] USER[-] GROUP[-] TOKEN[] APP[-] JOB[0002933-200403132837156-oozie-oozi-W] ACTION[0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] XException, org.apache.oozie.command.CommandException: E0800: Action it is not running its in [KILLED] state, action [0002933-200403132837156-oozie-oozi-W@join_prepare_otherresearchproduct] at org.apache.oozie.command.wf.CompletedActionXCommand.eagerVerifyPrecondition(CompletedActionXCommand.java:92) at org.apache.oozie.command.XCommand.call(XCommand.java:257) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at org.apache.oozie.service.CallableQueueService$CallableWrapper.run(CallableQueueService.java:179) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) \ No newline at end of file diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index ff392aa6f5..b0a55a05c5 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -22,9 +22,11 @@ dhp-actionmanager dhp-graph-mapper dhp-dedup-openaire + dhp-enrichment dhp-graph-provision dhp-dedup-scholexplorer dhp-graph-provision-scholexplorer + dhp-blacklist dhp-stats-update dhp-broker-events dhp-doiboost
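
Note on the new blacklist step: the workflow node added in the diff above ("removes blacklisted relations") runs the dhp-blacklist Oozie application with the Postgres connection parameters shown. What follows is a minimal, hypothetical sketch of the kind of filtering such a step performs; it is not the actual dhp-blacklist implementation. It assumes a blacklist table with source, target and relationship columns in the dnet_openaireplus database, and relations stored as JSON with source, target and relClass fields; the class, table and argument names are illustrative only.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;

/** Hypothetical sketch of a "remove blacklisted relations" Spark job (not the dhp-blacklist code). */
public class RemoveBlacklistedRelationsSketch {

    public static void main(String[] args) throws Exception {
        // Illustrative positional arguments; the real job would receive named workflow parameters.
        final String postgresUrl = args[0];      // e.g. jdbc:postgresql://host:5432/dnet_openaireplus
        final String postgresUser = args[1];
        final String postgresPassword = args[2];
        final String sourcePath = args[3];       // relations of the input graph
        final String outputPath = args[4];       // relations of the filtered (blacklisted) graph

        // 1) Load the blacklisted triples from Postgres (assumed table and column names).
        //    Requires the org.postgresql driver on the classpath.
        final List<String> blacklisted = new ArrayList<>();
        try (Connection con = DriverManager.getConnection(postgresUrl, postgresUser, postgresPassword);
                Statement stm = con.createStatement();
                ResultSet rs = stm.executeQuery("SELECT source, target, relationship FROM blacklist")) {
            while (rs.next()) {
                blacklisted.add(rs.getString("source") + "|" + rs.getString("target") + "|" + rs.getString("relationship"));
            }
        }

        // 2) Keep only the relations whose (source, target, relClass) triple is not blacklisted.
        final SparkSession spark = SparkSession.builder().appName("remove_blacklisted_relations").getOrCreate();
        final Dataset<Row> relations = spark.read().json(sourcePath);
        final Dataset<Row> kept = relations.filter(
            functions.not(
                functions.concat_ws("|", relations.col("source"), relations.col("target"), relations.col("relClass"))
                    .isin(blacklisted.toArray())));
        kept.write().mode("overwrite").json(outputPath);
        spark.stop();
    }
}

In the real workflow the equivalent parameters ('sourcePath', 'outputPath', 'postgresURL', 'postgresUser', 'postgresPassword') are injected by the executeOozieJob node shown in the diff rather than passed on the command line, and reading the whole blacklist on the driver is reasonable only as long as the blacklist stays small.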