forked from D-Net/dnet-hadoop
Merge branch 'doiboost' of https://code-repo.d4science.org/D-Net/dnet-hadoop into doiboost
commit d4e9075f22
pom.xml:
@@ -83,6 +83,10 @@
 			<groupId>com.jayway.jsonpath</groupId>
 			<artifactId>json-path</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>org.postgresql</groupId>
+			<artifactId>postgresql</artifactId>
+		</dependency>
 	</dependencies>
 
 </project>

DbClient.java:
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common;
 
 import java.io.Closeable;
 import java.io.IOException;

@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
 
 	private static final Log log = LogFactory.getLog(DbClient.class);
 
-	private final Connection connection;
+	private Connection connection;
 
 	public DbClient(final String address, final String login, final String password) {

ModelSupport.java:
@@ -13,7 +13,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
 public class ModelSupport {
 
 	/** Defines the mapping between the actual entity type and the main entity type */
-	private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
+	private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
 
 	static {
 		entityMapping.put(EntityType.publication, MainEntityType.result);

@@ -53,6 +53,232 @@ public class ModelSupport {
 		oafTypes.put("relation", Relation.class);
 	}
 
+	public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
+
+	static {
+		entityIdPrefix.put("datasource", "10");
+		entityIdPrefix.put("organization", "20");
+		entityIdPrefix.put("project", "40");
+		entityIdPrefix.put("result", "50");
+	}
+
+	public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
+
+	static {
+		relationInverseMap.put("personResult_authorship_isAuthorOf", new RelationInverse()
+			.setRelation("isAuthorOf").setInverse("hasAuthor")
+			.setRelType("personResult").setSubReltype("authorship"));
+		relationInverseMap.put("personResult_authorship_hasAuthor", new RelationInverse()
+			.setRelation("hasAuthor").setInverse("isAuthorOf")
+			.setRelType("personResult").setSubReltype("authorship"));
+		relationInverseMap.put("projectOrganization_participation_isParticipant", new RelationInverse()
+			.setRelation("isParticipant").setInverse("hasParticipant")
+			.setRelType("projectOrganization").setSubReltype("participation"));
+		relationInverseMap.put("projectOrganization_participation_hasParticipant", new RelationInverse()
+			.setRelation("hasParticipant").setInverse("isParticipant")
+			.setRelType("projectOrganization").setSubReltype("participation"));
+		relationInverseMap.put("resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
+			.setRelation("hasAuthorInstitution").setInverse("isAuthorInstitutionOf")
+			.setRelType("resultOrganization").setSubReltype("affiliation"));
+		relationInverseMap.put("resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
+			.setRelation("isAuthorInstitutionOf").setInverse("hasAuthorInstitution")
+			.setRelType("resultOrganization").setSubReltype("affiliation"));
+		relationInverseMap.put("organizationOrganization_dedup_merges", new RelationInverse()
+			.setRelation("merges").setInverse("isMergedIn")
+			.setRelType("organizationOrganization").setSubReltype("dedup"));
+		relationInverseMap.put("organizationOrganization_dedup_isMergedIn", new RelationInverse()
+			.setRelation("isMergedIn").setInverse("merges")
+			.setRelType("organizationOrganization").setSubReltype("dedup"));
+		relationInverseMap.put("organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
+			.setRelation("isSimilarTo").setInverse("isSimilarTo")
+			.setRelType("organizationOrganization").setSubReltype("dedupSimilarity"));
+
+		relationInverseMap.put("resultProject_outcome_isProducedBy", new RelationInverse()
+			.setRelation("isProducedBy").setInverse("produces")
+			.setRelType("resultProject").setSubReltype("outcome"));
+		relationInverseMap.put("resultProject_outcome_produces", new RelationInverse()
+			.setRelation("produces").setInverse("isProducedBy")
+			.setRelType("resultProject").setSubReltype("outcome"));
+		relationInverseMap.put("projectPerson_contactPerson_isContact", new RelationInverse()
+			.setRelation("isContact").setInverse("hasContact")
+			.setRelType("projectPerson").setSubReltype("contactPerson"));
+		relationInverseMap.put("projectPerson_contactPerson_hasContact", new RelationInverse()
+			.setRelation("hasContact").setInverse("isContact")
+			.setRelType("personPerson").setSubReltype("coAuthorship"));
+		relationInverseMap.put("personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
+			.setRelation("isCoAuthorOf").setInverse("isCoAuthorOf")
+			.setRelType("personPerson").setSubReltype("coAuthorship"));
+		relationInverseMap.put("personPerson_dedup_merges", new RelationInverse()
+			.setRelation("merges").setInverse("isMergedIn")
+			.setRelType("personPerson").setSubReltype("dedup"));
+		relationInverseMap.put("personPerson_dedup_isMergedIn", new RelationInverse()
+			.setRelation("isMergedIn").setInverse("merges")
+			.setRelType("personPerson").setSubReltype("dedup"));
+		relationInverseMap.put("personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
+			.setRelation("isSimilarTo").setInverse("isSimilarTo")
+			.setRelType("personPerson").setSubReltype("dedupSimilarity"));
+		relationInverseMap.put("datasourceOrganization_provision_isProvidedBy", new RelationInverse()
+			.setRelation("isProvidedBy").setInverse("provides")
+			.setRelType("datasourceOrganization").setSubReltype("provision"));
+		relationInverseMap.put("datasourceOrganization_provision_provides", new RelationInverse()
+			.setRelation("provides").setInverse("isProvidedBy")
+			.setRelType("datasourceOrganization").setSubReltype("provision"));
+		relationInverseMap.put("resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
+			.setRelation("hasAmongTopNSimilarDocuments").setInverse("isAmongTopNSimilarDocuments")
+			.setRelType("resultResult").setSubReltype("similarity"));
+		relationInverseMap.put("resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
+			.setRelation("isAmongTopNSimilarDocuments").setInverse("hasAmongTopNSimilarDocuments")
+			.setRelType("resultResult").setSubReltype("similarity"));
+		relationInverseMap.put("resultResult_relationship_isRelatedTo", new RelationInverse()
+			.setRelation("isRelatedTo").setInverse("isRelatedTo")
+			.setRelType("resultResult").setSubReltype("relationship"));
+		relationInverseMap.put("resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
+			.setRelation("isAmongTopNSimilarDocuments").setInverse("hasAmongTopNSimilarDocuments")
+			.setRelType("resultResult").setSubReltype("similarity"));
+		relationInverseMap.put("resultResult_supplement_isSupplementTo", new RelationInverse()
+			.setRelation("isSupplementTo").setInverse("isSupplementedBy")
+			.setRelType("resultResult").setSubReltype("supplement"));
+		relationInverseMap.put("resultResult_supplement_isSupplementedBy", new RelationInverse()
+			.setRelation("isSupplementedBy").setInverse("isSupplementTo")
+			.setRelType("resultResult").setSubReltype("supplement"));
+		relationInverseMap.put("resultResult_part_isPartOf", new RelationInverse()
+			.setRelation("isPartOf").setInverse("hasPart")
+			.setRelType("resultResult").setSubReltype("part"));
+		relationInverseMap.put("resultResult_part_hasPart", new RelationInverse()
+			.setRelation("hasPart").setInverse("isPartOf")
+			.setRelType("resultResult").setSubReltype("part"));
+		relationInverseMap.put("resultResult_dedup_merges", new RelationInverse()
+			.setRelation("merges").setInverse("isMergedIn")
+			.setRelType("resultResult").setSubReltype("dedup"));
+		relationInverseMap.put("resultResult_dedup_isMergedIn", new RelationInverse()
+			.setRelation("isMergedIn").setInverse("merges")
+			.setRelType("resultResult").setSubReltype("dedup"));
+		relationInverseMap.put("resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
+			.setRelation("isSimilarTo").setInverse("isSimilarTo")
+			.setRelType("resultResult").setSubReltype("dedupSimilarity"));
+	}
+
 	private static final String schemeTemplate = "dnet:%s_%s_relations";
 
 	private ModelSupport() {

RelationInverse.java (new file):
@@ -0,0 +1,46 @@

package eu.dnetlib.dhp.schema.common;

public class RelationInverse {
	private String relation;
	private String inverse;
	private String relType;
	private String subReltype;

	public String getRelType() {
		return relType;
	}

	public RelationInverse setRelType(String relType) {
		this.relType = relType;
		return this;
	}

	public String getSubReltype() {
		return subReltype;
	}

	public RelationInverse setSubReltype(String subReltype) {
		this.subReltype = subReltype;
		return this;
	}

	public String getRelation() {
		return relation;
	}

	public RelationInverse setRelation(String relation) {
		this.relation = relation;
		return this;
	}

	public String getInverse() {
		return inverse;
	}

	public RelationInverse setInverse(String inverse) {
		this.inverse = inverse;
		return this;
	}

}

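Taken together, the entityIdPrefix and relationInverseMap additions above drive the blacklist import implemented further down in ReadBlacklistFromDB.processBlacklistEntry. A minimal usage sketch, assuming only the classes shown in this change set; the identifier values are hypothetical:

	// Illustration only (not part of the commit): derive a relation and its inverse
	// from a blacklist encoding such as "resultProject_outcome_isProducedBy".
	RelationInverse ri = ModelSupport.relationInverseMap.get("resultProject_outcome_isProducedBy");

	// OpenAIRE ids are composed of the entity type prefix, a '|' separator and the original id
	String sourceId = ModelSupport.entityIdPrefix.get("result") + "|" + "someOriginalResultId";   // hypothetical id
	String targetId = ModelSupport.entityIdPrefix.get("project") + "|" + "someOriginalProjectId"; // hypothetical id

	Relation direct = new Relation();
	direct.setSource(sourceId);
	direct.setTarget(targetId);
	direct.setRelClass(ri.getRelation());      // isProducedBy
	direct.setRelType(ri.getRelType());        // resultProject
	direct.setSubRelType(ri.getSubReltype());  // outcome

	Relation inverse = new Relation();
	inverse.setSource(targetId);
	inverse.setTarget(sourceId);
	inverse.setRelClass(ri.getInverse());      // produces
	inverse.setRelType(ri.getRelType());
	inverse.setSubRelType(ri.getSubReltype());
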
Author.java:
@@ -2,8 +2,7 @@
 package eu.dnetlib.dhp.schema.oaf;
 
 import java.io.Serializable;
-import java.util.List;
-import java.util.Objects;
+import java.util.*;
 
 public class Author implements Serializable {

@@ -86,4 +85,5 @@ public class Author implements Serializable {
 	public int hashCode() {
 		return Objects.hash(fullname, name, surname, rank, pid, affiliation);
 	}
 
 }

dhp-blacklist/pom.xml (new file):
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.2.1-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>dhp-blacklist</artifactId>
    <dependencies>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>
    </dependencies>

</project>

PrepareMergedRelationJob.java (new file):
@@ -0,0 +1,87 @@

package eu.dnetlib.dhp.blacklist;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class PrepareMergedRelationJob {

	private static final Logger log = LoggerFactory.getLogger(PrepareMergedRelationJob.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareMergedRelationJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		String outputPath = parser.get("outputPath");
		log.info("outputPath: {} ", outputPath);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				selectMergesRelations(
					spark,
					inputPath,
					outputPath);
			});
	}

	private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {

		Dataset<Relation> relation = readRelations(spark, inputPath);

		relation
			.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	public static org.apache.spark.sql.Dataset<Relation> readRelations(
		SparkSession spark, String inputPath) {
		return spark
			.read()
			.textFile(inputPath)
			.map(
				(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
				Encoders.bean(Relation.class));
	}
}

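PrepareMergedRelationJob.readRelations parses newline-delimited Relation JSON into a typed Dataset, and the filter above keeps only the non-deleted dedup "merges" edges. A short usage sketch, not part of the commit, assuming an existing SparkSession named spark and a hypothetical input directory:

	// Illustration only: load graph relations and keep the dedup 'merges' edges,
	// mirroring what selectMergesRelations does before writing them out.
	Dataset<Relation> relations = PrepareMergedRelationJob.readRelations(spark, "/some/graph/relation"); // hypothetical path
	Dataset<Relation> merges = relations.filter("relclass = 'merges' and datainfo.deletedbyinference=false");
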
ReadBlacklistFromDB.java (new file):
@@ -0,0 +1,141 @@

package eu.dnetlib.dhp.blacklist;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.common.RelationInverse;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class ReadBlacklistFromDB implements Closeable {

	private final DbClient dbClient;
	private static final Log log = LogFactory.getLog(ReadBlacklistFromDB.class);
	private final Configuration conf;
	private final BufferedWriter writer;
	private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private final static String query = "SELECT source_type, unnest(original_source_objects) as source, " +
		"target_type, unnest(original_target_objects) as target, " +
		"relationship FROM blacklist WHERE status = 'ACCEPTED'";

	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					ReadBlacklistFromDB.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/blacklist/blacklist_parameters.json")));

		parser.parseArgument(args);

		final String dbUrl = parser.get("postgresUrl");
		final String dbUser = parser.get("postgresUser");
		final String dbPassword = parser.get("postgresPassword");
		final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
		final String hdfsNameNode = parser.get("hdfsNameNode");

		try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
			dbPassword)) {

			log.info("Processing blacklist...");
			rbl.execute(query, rbl::processBlacklistEntry);

		}
	}

	public void execute(final String sql, final Function<ResultSet, List<Relation>> producer) throws Exception {

		final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(r -> writeRelation(r));

		dbClient.processResults(sql, consumer);
	}

	public List<Relation> processBlacklistEntry(ResultSet rs) {
		try {
			Relation direct = new Relation();
			Relation inverse = new Relation();

			String source_prefix = ModelSupport.entityIdPrefix.get(rs.getString("source_type"));
			String target_prefix = ModelSupport.entityIdPrefix.get(rs.getString("target_type"));

			String source_direct = source_prefix + "|" + rs.getString("source");
			direct.setSource(source_direct);
			inverse.setTarget(source_direct);

			String target_direct = target_prefix + "|" + rs.getString("target");
			direct.setTarget(target_direct);
			inverse.setSource(target_direct);

			String encoding = rs.getString("relationship");
			RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
			direct.setRelClass(ri.getRelation());
			inverse.setRelClass(ri.getInverse());
			direct.setRelType(ri.getRelType());
			inverse.setRelType(ri.getRelType());
			direct.setSubRelType(ri.getSubReltype());
			inverse.setSubRelType(ri.getSubReltype());

			return Arrays.asList(direct, inverse);

		} catch (final Exception e) {
			throw new RuntimeException(e);
		}
	}

	@Override
	public void close() throws IOException {
		dbClient.close();
		writer.close();
	}

	public ReadBlacklistFromDB(
		final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
		throws Exception {

		this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
		this.conf = new Configuration();
		this.conf.set("fs.defaultFS", hdfsNameNode);
		FileSystem fileSystem = FileSystem.get(this.conf);
		Path hdfsWritePath = new Path(hdfsPath);
		FSDataOutputStream fsDataOutputStream = null;
		if (fileSystem.exists(hdfsWritePath)) {
			fsDataOutputStream = fileSystem.append(hdfsWritePath);
		} else {
			fsDataOutputStream = fileSystem.create(hdfsWritePath);
		}

		this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
	}

	protected void writeRelation(final Relation r) {
		try {
			writer.write(OBJECT_MAPPER.writeValueAsString(r));
			writer.newLine();
		} catch (final Exception e) {
			throw new RuntimeException(e);
		}
	}

}

SparkRemoveBlacklistedRelationJob.java (new file):
@@ -0,0 +1,147 @@

package eu.dnetlib.dhp.blacklist;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class SparkRemoveBlacklistedRelationJob {
	private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				SparkRemoveBlacklistedRelationJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath {}: ", outputPath);

		final String blacklistPath = parser.get("hdfsPath");
		log.info("blacklistPath {}: ", blacklistPath);

		final String mergesPath = parser.get("mergesPath");
		log.info("mergesPath {}: ", mergesPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				removeBlacklistedRelations(
					spark,
					blacklistPath,
					inputPath,
					outputPath,
					mergesPath);
			});

	}

	private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
		String outputPath, String mergesPath) {
		Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
		Dataset<Relation> inputRelation = readRelations(spark, inputPath);
		Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);

		log.info("InputRelationCount: {}", inputRelation.count());

		Dataset<Relation> dedupSource = blackListed
			.joinWith(
				mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
				"left_outer")
			.map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
				Optional
					.ofNullable(c._2())
					.ifPresent(mr -> c._1().setSource(mr.getSource()));
				return c._1();
			}, Encoders.bean(Relation.class));

		Dataset<Relation> dedupBL = dedupSource
			.joinWith(
				mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")),
				"left_outer")
			.map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
				Optional
					.ofNullable(c._2())
					.ifPresent(mr -> c._1().setTarget(mr.getSource()));
				return c._1();
			}, Encoders.bean(Relation.class));

		dedupBL
			.write()
			.mode(SaveMode.Overwrite)
			.json(blacklistPath + "/deduped");

		inputRelation
			.joinWith(
				dedupBL, (inputRelation
					.col("source")
					.equalTo(dedupBL.col("source"))
					.and(
						inputRelation
							.col("target")
							.equalTo(dedupBL.col("target")))),
				"left_outer")
			.map((MapFunction<Tuple2<Relation, Relation>, Relation>) c -> {
				Relation ir = c._1();
				Optional<Relation> obl = Optional.ofNullable(c._2());
				if (obl.isPresent()) {
					if (ir.equals(obl.get())) {
						return null;
					}
				}
				return ir;
			}, Encoders.bean(Relation.class))
			.filter(Objects::nonNull)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	public static org.apache.spark.sql.Dataset<Relation> readRelations(
		SparkSession spark, String inputPath) {
		return spark
			.read()
			.textFile(inputPath)
			.map(
				(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
				Encoders.bean(Relation.class));
	}

}

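The joins above amount to a simple rule: blacklist endpoints are first replaced by their dedup representatives (via the "merges" relations), and an input relation is dropped only when it is identical to one of the resulting blacklisted relations. A sketch of that effect on a single relation, not part of the commit and relying on Relation.equals exactly as the code above does:

	// Illustration only: a relation survives unless the deduped blacklist contains an identical relation;
	// nulls are filtered out afterwards, mirroring the .filter(Objects::nonNull) step above.
	static Relation applyBlacklist(Relation input, List<Relation> dedupedBlacklist) {
		return dedupedBlacklist.stream().anyMatch(input::equals) ? null : input;
	}
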
blacklist_parameters.json (new file):
@@ -0,0 +1,32 @@
[
  {
    "paramName": "p",
    "paramLongName": "hdfsPath",
    "paramDescription": "the path where storing the sequential file",
    "paramRequired": true
  },
  {
    "paramName": "nn",
    "paramLongName": "hdfsNameNode",
    "paramDescription": "the name node on hdfs",
    "paramRequired": true
  },
  {
    "paramName": "pgurl",
    "paramLongName": "postgresUrl",
    "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
    "paramRequired": true
  },
  {
    "paramName": "pguser",
    "paramLongName": "postgresUser",
    "paramDescription": "postgres user",
    "paramRequired": false
  },
  {
    "paramName": "pgpasswd",
    "paramLongName": "postgresPassword",
    "paramDescription": "postgres password",
    "paramRequired": false
  }
]

input_preparerelation_parameters.json (new file):
@@ -0,0 +1,26 @@
[
  {
    "paramName": "s",
    "paramLongName": "sourcePath",
    "paramDescription": "the path to the graph used to remove the relations ",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path where to store the temporary result ",
    "paramRequired": true
  },
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed",
    "paramRequired": false
  },
  {
    "paramName": "h",
    "paramLongName": "hive_metastore_uris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": true
  }
]

Oozie default configuration (new file):
@@ -0,0 +1,54 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>15G</value>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>6G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>1</value>
    </property>
</configuration>

Oozie workflow definition (new file):
@@ -0,0 +1,195 @@
<workflow-app name="blacklist_relations" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>postgresURL</name>
            <description>the url of the postgress server to query</description>
        </property>
        <property>
            <name>postgresUser</name>
            <description>the username to access the postgres db</description>
        </property>
        <property>
            <name>postgresPassword</name>
            <description>the postgres password</description>
        </property>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the graph output path</description>
        </property>
    </parameters>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
        <ok to="copy_entities"/>
        <error to="Kill"/>
    </action>

    <fork name="copy_entities">
        <path start="copy_publication"/>
        <path start="copy_dataset"/>
        <path start="copy_orp"/>
        <path start="copy_software"/>
        <path start="copy_datasource"/>
        <path start="copy_project"/>
        <path start="copy_organization"/>
    </fork>

    <action name="copy_publication">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/publication</arg>
            <arg>${nameNode}/${outputPath}/publication</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_dataset">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/dataset</arg>
            <arg>${nameNode}/${outputPath}/dataset</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_orp">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
            <arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_software">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/software</arg>
            <arg>${nameNode}/${outputPath}/software</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_organization">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/organization</arg>
            <arg>${nameNode}/${outputPath}/organization</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_project">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/project</arg>
            <arg>${nameNode}/${outputPath}/project</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_datasource">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/datasource</arg>
            <arg>${nameNode}/${outputPath}/datasource</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <join name="wait" to="read_blacklist"/>

    <action name="read_blacklist">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
            <arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--postgresUrl</arg><arg>${postgresURL}</arg>
            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
        </java>
        <ok to="prepare_merged_relation"/>
        <error to="Kill"/>
    </action>

    <action name="prepare_merged_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>PrepareMergedRelation</name>
            <class>eu.dnetlib.dhp.blacklist.PrepareMergedRelationJob</class>
            <jar>dhp-blacklist-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
            <arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
        </spark>
        <ok to="apply_blacklist"/>
        <error to="Kill"/>
    </action>

    <action name="apply_blacklist">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ApplyBlacklist</name>
            <class>eu.dnetlib.dhp.blacklist.SparkRemoveBlacklistedRelationJob</class>
            <jar>dhp-blacklist-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
            <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
            <arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
            <arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>

</workflow-app>

sparkblacklist_parameters.json (new file):
@@ -0,0 +1,33 @@
[
  {
    "paramName": "p",
    "paramLongName": "hdfsPath",
    "paramDescription": "the path where storing the sequential file",
    "paramRequired": true
  },
  {
    "paramName": "s",
    "paramLongName": "sourcePath",
    "paramDescription": "the path to the graph used to remove the relations ",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path where to store the temporary result ",
    "paramRequired": true
  },
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed",
    "paramRequired": false
  },
  {
    "paramName": "m",
    "paramLongName": "mergesPath",
    "paramDescription": "true if the spark session is managed",
    "paramRequired": true
  }
]

BlackListTest.java (new file):
@@ -0,0 +1,167 @@

package eu.dnetlib.dhp.blacklist;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.oaf.Relation;

public class BlackListTest {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static final ClassLoader cl = eu.dnetlib.dhp.blacklist.BlackListTest.class.getClassLoader();

	private static SparkSession spark;

	private static Path workingDir;
	private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.blacklist.BlackListTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(eu.dnetlib.dhp.blacklist.BlackListTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(BlackListTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	/*
	 * String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String outputPath =
	 * parser.get("outputPath"); log.info("outputPath {}: ", outputPath); final String blacklistPath =
	 * parser.get("hdfsPath"); log.info("blacklistPath {}: ", blacklistPath); final String mergesPath =
	 * parser.get("mergesPath"); log.info("mergesPath {}: ", mergesPath);
	 */
	@Test
	public void noRemoveTest() throws Exception {
		SparkRemoveBlacklistedRelationJob
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsNoRemoval").getPath(),
					"-outputPath", workingDir.toString() + "/relation",
					"-hdfsPath", getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
					"-mergesPath", getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(),
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		JavaRDD<Relation> tmp = sc
			.textFile(workingDir.toString() + "/relation")
			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));

		Assertions.assertEquals(13, tmp.count());

	}

	@Test
	public void removeNoMergeMatchTest() throws Exception {
		SparkRemoveBlacklistedRelationJob
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", getClass().getResource("/eu/dnetlib/dhp/blacklist/relationsOneRemoval").getPath(),
					"-outputPath", workingDir.toString() + "/relation",
					"-hdfsPath", getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
					"-mergesPath", getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRel").getPath(),
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		JavaRDD<Relation> tmp = sc
			.textFile(workingDir.toString() + "/relation")
			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));

		Assertions.assertEquals(12, tmp.count());

		org.apache.spark.sql.Dataset<eu.dnetlib.dhp.schema.oaf.Relation> verificationDataset = spark
			.createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));

		Assertions
			.assertEquals(
				0, verificationDataset
					.filter(
						"source = '40|corda__h2020::5161f53ab205d803c36b4c888fe7deef' and " +
							"target = '20|dedup_wf_001::157af406bc653aa4d9749318b644de43'")
					.count());

		Assertions.assertEquals(0, verificationDataset.filter("relClass = 'hasParticipant'").count());
	}

	@Test
	public void removeMergeMatchTest() throws Exception {
		SparkRemoveBlacklistedRelationJob
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", getClass().getResource("/eu/dnetlib/dhp/blacklist/relationOneRemovalWithMatch").getPath(),
					"-outputPath", workingDir.toString() + "/relation",
					"-hdfsPath", getClass().getResource("/eu/dnetlib/dhp/blacklist/blacklist").getPath(),
					"-mergesPath", getClass().getResource("/eu/dnetlib/dhp/blacklist/mergesRelOneMerge").getPath(),
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		JavaRDD<Relation> tmp = sc
			.textFile(workingDir.toString() + "/relation")
			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));

		Assertions.assertEquals(12, tmp.count());

		org.apache.spark.sql.Dataset<eu.dnetlib.dhp.schema.oaf.Relation> verificationDataset = spark
			.createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));

		Assertions.assertEquals(12, verificationDataset.filter("relClass = 'isProvidedBy'").count());

	}
}

Test resource: relation fixture (new file):
@@ -0,0 +1,20 @@
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"hasParticipant","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"isParticipant","source":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43","target":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::a727cc288016db7132ef9a799aa83350","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::a727cc288016db7132ef9a799aa83350"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::062cf091d5c7a7d730001c34177042e3","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::062cf091d5c7a7d730001c34177042e3"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::1b172ab34639e7935e2357119cf20830","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|od_______908::1b172ab34639e7935e2357119cf20830"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021"}
Test resource: merges relations fixture (new file):
@@ -0,0 +1,14 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______177::67c1385662f2fa0bde310bec15427646"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
|
|
@ -0,0 +1,14 @@
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
|
||||||
|
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}
|
|
@ -0,0 +1,13 @@
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProducedBy","relType":"resultProject","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"outcome","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
|
|
@ -0,0 +1,13 @@
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::018cb61ed43c01704decc66183ce5d60","subRelType":"provision","target":"20|dedup_wf_001::b9fff055ce5efacecbe4ef918c127f86"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
|
|
@ -0,0 +1,13 @@
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","subRelType":"participation","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}
|
|
@ -0,0 +1,53 @@
package eu.dnetlib.doiboost.mag

import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse


// Row layout of the MAG "Papers" table.
case class Papers(PaperId: Long, Rank: Integer, Doi: String,
                  DocType: String, PaperTitle: String, OriginalTitle: String,
                  BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
                  JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
                  Volume: String, Issue: String, FirstPage: String, LastPage: String,
                  ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
                  OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {}

// Row layout of the MAG "PaperAbstractsInvertedIndex" table.
case class PaperAbstract(PaperId: Long, IndexedAbstract: String) {}


case object ConversionUtil {

  // Replaces the inverted-index JSON with the reconstructed plain-text abstract.
  def transformPaperAbstract(input: PaperAbstract): PaperAbstract = {
    PaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
  }

  // Rebuilds the abstract from MAG's inverted index: every token is written back at each of its positions.
  def convertInvertedIndexString(json_input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(json_input)

    val idl = (json \ "IndexLength").extract[Int]

    if (idl > 0) {
      val res = Array.ofDim[String](idl)

      val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]

      for { (k: String, v: List[Int]) <- iid } {
        v.foreach(item => res(item) = k)
      }
      return res.mkString(" ")
    }
    ""
  }
}
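A minimal usage sketch of the converter above (the inverted-index payload here is a made-up example, not taken from a real MAG dump):

object ConversionUtilExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical MAG-style abstract: each token maps to the list of positions it occupies.
    val sample = """{"IndexLength": 4, "InvertedIndex": {"The": [0], "quick": [1], "brown": [2], "fox": [3]}}"""
    // Prints the reconstructed text: "The quick brown fox"
    println(ConversionUtil.convertInvertedIndexString(sample))
  }
}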
@ -63,7 +63,7 @@ object SparkImportMagIntoDataset {
 	def main(args: Array[String]): Unit = {
 		val logger: Logger = LoggerFactory.getLogger(getClass)
 		val conf: SparkConf = new SparkConf()
-		val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_mag_to_oaf_params.json")))
+		val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
 		parser.parseArgument(args)
 		val spark: SparkSession =
 			SparkSession
@ -0,0 +1,63 @@
package eu.dnetlib.doiboost.mag

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions._


object SparkPreProcessMAG {

  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    import spark.implicits._

    logger.info("Phase 1) make unique DOI in Papers:")

    val d: Dataset[Papers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[Papers]

    // Keep only papers with a DOI; the same DOI can appear under several PaperIds, so reduce to a single
    // record per DOI, preferring the one with the earlier CreatedDate when both dates are available.
    val result: RDD[Papers] = d.where(col("Doi").isNotNull).rdd.map { p: Papers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: Papers, p2: Papers) =>
      var r = if (p1 == null) p2 else p1
      if (p1 != null && p2 != null) {
        if (p1.CreatedDate != null && p2.CreatedDate != null) {
          if (p1.CreatedDate.before(p2.CreatedDate))
            r = p1
          else
            r = p2
        } else {
          r = if (p1.CreatedDate == null) p2 else p1
        }
      }
      r
    }.map(_._2)

    val distinctPaper: Dataset[Papers] = spark.createDataset(result)
    distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
    logger.info(s"Total number of element: ${result.count()}")

    logger.info("Phase 2) convert InvertedIndex abstracts to strings")
    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[PaperAbstract]
    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")

    // Join each distinct paper with its converted abstract on PaperId (the joined dataset is not persisted yet).
    distinctPaper.joinWith(pa, distinctPaper("PaperId").equalTo(pa("PaperId")))

  }

}
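A rough local-run sketch (mirroring the invocation used in MAGMappingTest further below; the dataset paths are placeholders for a local copy of the MAG dump):

object SparkPreProcessMAGLocalRun {
  def main(args: Array[String]): Unit = {
    // -m: Spark master, -s: directory holding the MAG tables, -t: working/output directory
    SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))
  }
}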
@ -34,7 +34,7 @@
             <delete path='${targetPath}'/>
             <mkdir path='${targetPath}'/>
         </fs>
-        <ok to="ConvertMagToDataset"/>
+        <ok to="PreprocessMag"/>
         <error to="Kill"/>
     </action>

@ -59,5 +59,28 @@
         <error to="Kill"/>
     </action>

+    <action name="PreprocessMag">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert Mag to Dataset</name>
+            <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
     <end name="End"/>
 </workflow-app>
@ -0,0 +1,6 @@
[
  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
  {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
  {"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]
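For reference, a small sketch of how these parameters surface once parsed (the same pattern used by SparkPreProcessMAG above; the argument values are placeholders):

object PreprocessMagParamsCheck {
  def main(args: Array[String]): Unit = {
    val parser = new eu.dnetlib.dhp.application.ArgumentApplicationParser(
      org.apache.commons.io.IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
    // The short names (-s, -t, -m) map to the long names declared above.
    parser.parseArgument("-s /tmp/mag/input -t /tmp/mag/output -m local[*]".split(" "))
    println(parser.get("sourcePath")) // /tmp/mag/input
    println(parser.get("master"))     // local[*]
  }
}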
@ -1,20 +1,15 @@
 package eu.dnetlib.doiboost

-import com.fasterxml.jackson.databind.SerializationFeature
-import eu.dnetlib.dhp.schema.oaf.{Dataset, KeyValue, Oaf, Publication, Relation, Result}
+import eu.dnetlib.dhp.schema.oaf._
 import eu.dnetlib.dhp.utils.DHPUtils
-import eu.dnetlib.doiboost.crossref.{Crossref2Oaf, SparkMapDumpIntoOAF}
+import eu.dnetlib.doiboost.crossref.Crossref2Oaf
-import eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset
-import org.apache.spark.{SparkConf, sql}
-import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.codehaus.jackson.map.ObjectMapper
-import org.junit.jupiter.api.Test
-
-import scala.io.Source
 import org.junit.jupiter.api.Assertions._
+import org.junit.jupiter.api.Test
 import org.slf4j.{Logger, LoggerFactory}

 import scala.collection.JavaConverters._
+import scala.io.Source
 import scala.util.matching.Regex

@ -24,12 +19,6 @@ class CrossrefMappingTest {
 	val mapper = new ObjectMapper()

-
-	def testMAGCSV() :Unit = {
-		SparkImportMagIntoDataset.main(null)
-	}
-
-
 	@Test
 	def testFunderRelationshipsMapping(): Unit = {
 		val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString
@ -1,14 +0,0 @@
package eu.dnetlib.doiboost.mag


case class Papers(PaperId:Long, Rank:Integer, Doi:String,
                  DocType:String, PaperTitle:String, OriginalTitle:String,
                  BookTitle:String, Year:Option[Integer], Date:Option[java.sql.Timestamp], Publisher:String,
                  JournalId:Option[Long], ConferenceSeriesId:Option[Long], ConferenceInstanceId:Option[Long],
                  Volume:String, Issue:String, FirstPage:String, LastPage:String,
                  ReferenceCount:Option[Long], CitationCount:Option[Long], EstimatedCitation:Option[Long],
                  OriginalVenue:String, FamilyId:Option[Long], CreatedDate:java.sql.Timestamp) {}
@ -1,13 +1,10 @@
 package eu.dnetlib.doiboost.mag

-import org.apache.spark.SparkConf
-import org.apache.spark.api.java.function.ReduceFunction
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession}
 import org.codehaus.jackson.map.ObjectMapper
 import org.junit.jupiter.api.Test
 import org.slf4j.{Logger, LoggerFactory}
-import org.apache.spark.sql.functions._
+import org.junit.jupiter.api.Assertions._
+import scala.io.Source


 class MAGMappingTest {

@ -18,34 +15,18 @@ class MAGMappingTest {

 	//@Test
 	def testMAGCSV(): Unit = {
-		val conf: SparkConf = new SparkConf()
-		val spark: SparkSession =
-			SparkSession
-				.builder()
-				.config(conf)
-				.appName(getClass.getSimpleName)
-				.master("local[*]").getOrCreate()
-
-		import spark.implicits._
-		val d: Dataset[Papers] = spark.read.load("/data/doiboost/mag/datasets/Papers").as[Papers]
-		logger.info(s"Total number of element: ${d.where(col("Doi").isNotNull).count()}")
-		//implicit val mapEncoder = org.apache.spark.sql.Encoders.bean[Papers]
-		val result: RDD[Papers] = d.where(col("Doi").isNotNull).rdd.map { p: Papers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: Papers, p2: Papers) =>
-			var r = if (p1 == null) p2 else p1
-			if (p1 != null && p2 != null) if (p1.CreatedDate.before(p2.CreatedDate))
-				r = p1
-			else
-				r = p2
-			r
-		}.map(_._2)
-
-		val distinctPaper: Dataset[Papers] = spark.createDataset(result)
-		distinctPaper.write.mode(SaveMode.Overwrite).save("/data/doiboost/mag/datasets/Papers_d")
-		logger.info(s"Total number of element: ${result.count()}")
-	}
+		SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))
+	}
+
+	@Test
+	def buildInvertedIndexTest(): Unit = {
+		val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString
+		val description = ConversionUtil.convertInvertedIndexString(json_input)
+		assertNotNull(description)
+		assertTrue(description.nonEmpty)
+		logger.debug(description)
+	}
 }
@ -0,0 +1,334 @@
|
||||||
|
{
|
||||||
|
"IndexLength": 139,
|
||||||
|
"InvertedIndex": {
|
||||||
|
"The": [
|
||||||
|
0,
|
||||||
|
23,
|
||||||
|
47
|
||||||
|
],
|
||||||
|
"invention": [
|
||||||
|
1,
|
||||||
|
53
|
||||||
|
],
|
||||||
|
"discloses": [
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"a": [
|
||||||
|
3,
|
||||||
|
10,
|
||||||
|
71,
|
||||||
|
81,
|
||||||
|
121
|
||||||
|
],
|
||||||
|
"treatment": [
|
||||||
|
4,
|
||||||
|
69,
|
||||||
|
85,
|
||||||
|
96
|
||||||
|
],
|
||||||
|
"method": [
|
||||||
|
5,
|
||||||
|
24,
|
||||||
|
49
|
||||||
|
],
|
||||||
|
"of": [
|
||||||
|
6,
|
||||||
|
9,
|
||||||
|
19,
|
||||||
|
57,
|
||||||
|
84,
|
||||||
|
117,
|
||||||
|
120
|
||||||
|
],
|
||||||
|
"waste": [
|
||||||
|
7,
|
||||||
|
118
|
||||||
|
],
|
||||||
|
"mash": [
|
||||||
|
8,
|
||||||
|
119
|
||||||
|
],
|
||||||
|
"cane": [
|
||||||
|
11,
|
||||||
|
122
|
||||||
|
],
|
||||||
|
"sugar": [
|
||||||
|
12,
|
||||||
|
123
|
||||||
|
],
|
||||||
|
"factory,": [
|
||||||
|
13
|
||||||
|
],
|
||||||
|
"belonging": [
|
||||||
|
14
|
||||||
|
],
|
||||||
|
"to": [
|
||||||
|
15
|
||||||
|
],
|
||||||
|
"the": [
|
||||||
|
16,
|
||||||
|
26,
|
||||||
|
52,
|
||||||
|
55,
|
||||||
|
66,
|
||||||
|
93,
|
||||||
|
115,
|
||||||
|
135
|
||||||
|
],
|
||||||
|
"technical": [
|
||||||
|
17,
|
||||||
|
48
|
||||||
|
],
|
||||||
|
"field": [
|
||||||
|
18
|
||||||
|
],
|
||||||
|
"industrial": [
|
||||||
|
20
|
||||||
|
],
|
||||||
|
"wastewater": [
|
||||||
|
21
|
||||||
|
],
|
||||||
|
"treatment.": [
|
||||||
|
22
|
||||||
|
],
|
||||||
|
"comprises": [
|
||||||
|
25
|
||||||
|
],
|
||||||
|
"following": [
|
||||||
|
27
|
||||||
|
],
|
||||||
|
"steps": [
|
||||||
|
28
|
||||||
|
],
|
||||||
|
"of:": [
|
||||||
|
29
|
||||||
|
],
|
||||||
|
"(1)": [
|
||||||
|
30
|
||||||
|
],
|
||||||
|
"pretreatment;": [
|
||||||
|
31
|
||||||
|
],
|
||||||
|
"(2)": [
|
||||||
|
32
|
||||||
|
],
|
||||||
|
"primary": [
|
||||||
|
33
|
||||||
|
],
|
||||||
|
"concentration;": [
|
||||||
|
34
|
||||||
|
],
|
||||||
|
"(3)": [
|
||||||
|
35
|
||||||
|
],
|
||||||
|
"cooling": [
|
||||||
|
36
|
||||||
|
],
|
||||||
|
"sedimentation": [
|
||||||
|
37
|
||||||
|
],
|
||||||
|
"and": [
|
||||||
|
38,
|
||||||
|
45,
|
||||||
|
62,
|
||||||
|
80,
|
||||||
|
86,
|
||||||
|
114,
|
||||||
|
134
|
||||||
|
],
|
||||||
|
"dense": [
|
||||||
|
39
|
||||||
|
],
|
||||||
|
"slurry": [
|
||||||
|
40
|
||||||
|
],
|
||||||
|
"drying;": [
|
||||||
|
41
|
||||||
|
],
|
||||||
|
"(4)": [
|
||||||
|
42
|
||||||
|
],
|
||||||
|
"secondary": [
|
||||||
|
43
|
||||||
|
],
|
||||||
|
"concentration": [
|
||||||
|
44
|
||||||
|
],
|
||||||
|
"drying.": [
|
||||||
|
46
|
||||||
|
],
|
||||||
|
"disclosed": [
|
||||||
|
50
|
||||||
|
],
|
||||||
|
"by": [
|
||||||
|
51
|
||||||
|
],
|
||||||
|
"has": [
|
||||||
|
54
|
||||||
|
],
|
||||||
|
"advantages": [
|
||||||
|
56
|
||||||
|
],
|
||||||
|
"small": [
|
||||||
|
58
|
||||||
|
],
|
||||||
|
"investment,": [
|
||||||
|
59
|
||||||
|
],
|
||||||
|
"simple": [
|
||||||
|
60
|
||||||
|
],
|
||||||
|
"equipment": [
|
||||||
|
61
|
||||||
|
],
|
||||||
|
"easiness": [
|
||||||
|
63
|
||||||
|
],
|
||||||
|
"in": [
|
||||||
|
64,
|
||||||
|
132
|
||||||
|
],
|
||||||
|
"popularization;": [
|
||||||
|
65
|
||||||
|
],
|
||||||
|
"product": [
|
||||||
|
67
|
||||||
|
],
|
||||||
|
"after": [
|
||||||
|
68
|
||||||
|
],
|
||||||
|
"is": [
|
||||||
|
70,
|
||||||
|
91,
|
||||||
|
98,
|
||||||
|
102,
|
||||||
|
112,
|
||||||
|
130,
|
||||||
|
137
|
||||||
|
],
|
||||||
|
"high-quality": [
|
||||||
|
72
|
||||||
|
],
|
||||||
|
"high": [
|
||||||
|
73
|
||||||
|
],
|
||||||
|
"value-added": [
|
||||||
|
74
|
||||||
|
],
|
||||||
|
"(fully": [
|
||||||
|
75
|
||||||
|
],
|
||||||
|
"water-soluble)": [
|
||||||
|
76
|
||||||
|
],
|
||||||
|
"potassium": [
|
||||||
|
77
|
||||||
|
],
|
||||||
|
"humate": [
|
||||||
|
78
|
||||||
|
],
|
||||||
|
"product,": [
|
||||||
|
79
|
||||||
|
],
|
||||||
|
"new": [
|
||||||
|
82
|
||||||
|
],
|
||||||
|
"mode": [
|
||||||
|
83
|
||||||
|
],
|
||||||
|
"profit": [
|
||||||
|
87
|
||||||
|
],
|
||||||
|
"enabling": [
|
||||||
|
88
|
||||||
|
],
|
||||||
|
"sustainable": [
|
||||||
|
89
|
||||||
|
],
|
||||||
|
"development": [
|
||||||
|
90
|
||||||
|
],
|
||||||
|
"realized;": [
|
||||||
|
92
|
||||||
|
],
|
||||||
|
"environmental": [
|
||||||
|
94
|
||||||
|
],
|
||||||
|
"protection": [
|
||||||
|
95
|
||||||
|
],
|
||||||
|
"effect": [
|
||||||
|
97
|
||||||
|
],
|
||||||
|
"good,": [
|
||||||
|
99
|
||||||
|
],
|
||||||
|
"water": [
|
||||||
|
100,
|
||||||
|
106
|
||||||
|
],
|
||||||
|
"balance": [
|
||||||
|
101
|
||||||
|
],
|
||||||
|
"realized": [
|
||||||
|
103
|
||||||
|
],
|
||||||
|
"through": [
|
||||||
|
104
|
||||||
|
],
|
||||||
|
"final": [
|
||||||
|
105
|
||||||
|
],
|
||||||
|
"quality": [
|
||||||
|
107
|
||||||
|
],
|
||||||
|
"treatment,": [
|
||||||
|
108
|
||||||
|
],
|
||||||
|
"real": [
|
||||||
|
109
|
||||||
|
],
|
||||||
|
"zero": [
|
||||||
|
110
|
||||||
|
],
|
||||||
|
"emission": [
|
||||||
|
111
|
||||||
|
],
|
||||||
|
"realized,": [
|
||||||
|
113
|
||||||
|
],
|
||||||
|
"problem": [
|
||||||
|
116
|
||||||
|
],
|
||||||
|
"factory": [
|
||||||
|
124
|
||||||
|
],
|
||||||
|
"can": [
|
||||||
|
125
|
||||||
|
],
|
||||||
|
"be": [
|
||||||
|
126
|
||||||
|
],
|
||||||
|
"solved": [
|
||||||
|
127
|
||||||
|
],
|
||||||
|
"fundamentally;": [
|
||||||
|
128
|
||||||
|
],
|
||||||
|
"energy": [
|
||||||
|
129
|
||||||
|
],
|
||||||
|
"saved": [
|
||||||
|
131
|
||||||
|
],
|
||||||
|
"operation,": [
|
||||||
|
133
|
||||||
|
],
|
||||||
|
"feasibility": [
|
||||||
|
136
|
||||||
|
],
|
||||||
|
"high.": [
|
||||||
|
138
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.2.1-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>dhp-enrichment</artifactId>
    <dependencies>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>

        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
        </dependency>
        <dependency>
            <groupId>jaxen</groupId>
            <artifactId>jaxen</artifactId>
        </dependency>
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
        </dependency>

        <dependency>
            <groupId>io.github.classgraph</groupId>
            <artifactId>classgraph</artifactId>
            <version>4.8.71</version>
        </dependency>

    </dependencies>

</project>
@ -0,0 +1,169 @@

package eu.dnetlib.dhp;

import java.util.List;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.*;

public class PropagationConstant {

	public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional";

	public static final String PROPAGATION_DATA_INFO_TYPE = "propagation";

	public static final String TRUE = "true";

	public static final String DNET_COUNTRY_SCHEMA = "dnet:countries";
	public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
	public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";

	public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
	public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";

	public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo";
	public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository";

	public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel";
	public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation";

	public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel";
	public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation";

	public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization";
	public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization";

	public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
	public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";

	public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy";

	public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization";
	public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation";
	public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf";
	public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution";

	public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult";

	public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject";
	public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome";
	public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy";
	public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces";

	public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges";

	public static final String PROPAGATION_AUTHOR_PID = "ORCID";

	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb " +
		"from result r " +
		"lateral view explode(instance) i as inst " +
		"where r.datainfo.deletedbyinference=false";

	public static Country getCountry(String classid, String classname) {
		Country nc = new Country();
		nc.setClassid(classid);
		nc.setClassname(classname);
		nc.setSchemename(DNET_COUNTRY_SCHEMA);
		nc.setSchemeid(DNET_COUNTRY_SCHEMA);
		nc
			.setDataInfo(
				getDataInfo(
					PROPAGATION_DATA_INFO_TYPE,
					PROPAGATION_COUNTRY_INSTREPO_CLASS_ID,
					PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME));
		return nc;
	}

	public static DataInfo getDataInfo(
		String inference_provenance, String inference_class_id, String inference_class_name) {
		DataInfo di = new DataInfo();
		di.setInferred(true);
		di.setDeletedbyinference(false);
		di.setTrust("0.85");
		di.setInferenceprovenance(inference_provenance);
		di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
		return di;
	}

	public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
		Qualifier pa = new Qualifier();
		pa.setClassid(inference_class_id);
		pa.setClassname(inference_class_name);
		pa.setSchemeid(DNET_SCHEMA_ID);
		pa.setSchemename(DNET_SCHEMA_NAME);
		return pa;
	}

	public static Relation getRelation(
		String source,
		String target,
		String rel_class,
		String rel_type,
		String subrel_type,
		String inference_provenance,
		String inference_class_id,
		String inference_class_name) {
		Relation r = new Relation();
		r.setSource(source);
		r.setTarget(target);
		r.setRelClass(rel_class);
		r.setRelType(rel_type);
		r.setSubRelType(subrel_type);
		r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name));
		return r;
	}

	public static String getConstraintList(String text, List<String> constraints) {
		String ret = " and (" + text + constraints.get(0) + "'";
		for (int i = 1; i < constraints.size(); i++) {
			ret += " OR " + text + constraints.get(i) + "'";
		}
		ret += ")";
		return ret;
	}

	public static void removeOutputDir(SparkSession spark, String path) {
		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
	}

	public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
		return Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
	}

	public static Boolean isTest(ArgumentApplicationParser parser) {
		return Optional
			.ofNullable(parser.get("isTest"))
			.map(Boolean::valueOf)
			.orElse(Boolean.FALSE);
	}

	public static void createCfHbforResult(SparkSession spark) {
		org.apache.spark.sql.Dataset<Row> cfhb = spark.sql(cfHbforResultQuery);
		cfhb.createOrReplaceTempView("cfhb");
	}

	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		return spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
	}

}
@ -0,0 +1,120 @@

package eu.dnetlib.dhp.bulktag;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.oaf.Result;

public class SparkBulkTagJob {

	private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob.class);
	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkBulkTagJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		Boolean isTest = Optional
			.ofNullable(parser.get("isTest"))
			.map(Boolean::valueOf)
			.orElse(Boolean.FALSE);
		log.info("isTest: {} ", isTest);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class);
		log.info("pathMap: {}", new Gson().toJson(protoMappingParams));

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final Boolean saveGraph = Optional
			.ofNullable(parser.get("saveGraph"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("saveGraph: {}", saveGraph);

		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
		CommunityConfiguration cc;

		String taggingConf = parser.get("taggingConf");

		if (isTest) {
			cc = CommunityConfigurationFactory.newInstance(taggingConf);
		} else {
			cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookUpUrl"));
		}

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
			});
	}

	private static <R extends Result> void execBulkTag(
		SparkSession spark,
		String inputPath,
		String outputPath,
		ProtoMap protoMappingParams,
		Class<R> resultClazz,
		CommunityConfiguration communityConfiguration) {

		ResultTagger resultTagger = new ResultTagger();
		readPath(spark, inputPath, resultClazz)
			.map(
				(MapFunction<R, R>) value -> resultTagger
					.enrichContextCriteria(
						value, communityConfiguration, protoMappingParams),
				Encoders.bean(resultClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		return spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
	}

}
@ -0,0 +1,65 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.gson.Gson;

/** Created by miriam on 01/08/2018. */
public class Community implements Serializable {

	private static final Log log = LogFactory.getLog(Community.class);

	private String id;
	private List<String> subjects = new ArrayList<>();
	private List<Provider> providers = new ArrayList<>();
	private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();

	public String toJson() {
		final Gson g = new Gson();
		return g.toJson(this);
	}

	public boolean isValid() {
		return !getSubjects().isEmpty()
			|| !getProviders().isEmpty()
			|| !getZenodoCommunities().isEmpty();
	}

	public String getId() {
		return id;
	}

	public void setId(String id) {
		this.id = id;
	}

	public List<String> getSubjects() {
		return subjects;
	}

	public void setSubjects(List<String> subjects) {
		this.subjects = subjects;
	}

	public List<Provider> getProviders() {
		return providers;
	}

	public void setProviders(List<Provider> providers) {
		this.providers = providers;
	}

	public List<ZenodoCommunity> getZenodoCommunities() {
		return zenodoCommunities;
	}

	public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
		this.zenodoCommunities = zenodoCommunities;
	}
}
@ -0,0 +1,196 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
import eu.dnetlib.dhp.bulktag.criteria.Selection;

/** Created by miriam on 02/08/2018. */
public class CommunityConfiguration implements Serializable {

	private static final Log log = LogFactory.getLog(CommunityConfiguration.class);

	private Map<String, Community> communities;

	// map subject -> communityid
	private Map<String, List<Pair<String, SelectionConstraints>>> subjectMap = new HashMap<>();
	// map datasourceid -> communityid
	private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>();
	// map zenodocommunityid -> communityid
	private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap = new HashMap<>();

	public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() {
		return subjectMap;
	}

	public void setSubjectMap(Map<String, List<Pair<String, SelectionConstraints>>> subjectMap) {
		this.subjectMap = subjectMap;
	}

	public Map<String, List<Pair<String, SelectionConstraints>>> getDatasourceMap() {
		return datasourceMap;
	}

	public void setDatasourceMap(
		Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap) {
		this.datasourceMap = datasourceMap;
	}

	public Map<String, List<Pair<String, SelectionConstraints>>> getZenodocommunityMap() {
		return zenodocommunityMap;
	}

	public void setZenodocommunityMap(
		Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap) {
		this.zenodocommunityMap = zenodocommunityMap;
	}

	CommunityConfiguration(final Map<String, Community> communities) {
		this.communities = communities;
		init();
	}

	void init() {

		if (subjectMap == null) {
			subjectMap = Maps.newHashMap();
		}
		if (datasourceMap == null) {
			datasourceMap = Maps.newHashMap();
		}
		if (zenodocommunityMap == null) {
			zenodocommunityMap = Maps.newHashMap();
		}

		for (Community c : getCommunities().values()) {
			// get subjects
			final String id = c.getId();
			for (String sbj : c.getSubjects()) {
				Pair<String, SelectionConstraints> p = new Pair<>(id, new SelectionConstraints());
				add(sbj.toLowerCase().trim(), p, subjectMap);
			}
			// get datasources
			for (Provider d : c.getProviders()) {
				add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
			}
			// get zenodo communities
			for (ZenodoCommunity zc : c.getZenodoCommunities()) {
				add(
					zc.getZenodoCommunityId(),
					new Pair<>(id, zc.getSelCriteria()),
					zenodocommunityMap);
			}
		}
	}

	private void add(
		String key,
		Pair<String, SelectionConstraints> value,
		Map<String, List<Pair<String, SelectionConstraints>>> map) {
		List<Pair<String, SelectionConstraints>> values = map.get(key);

		if (values == null) {
			values = new ArrayList<>();
			map.put(key, values);
		}
		values.add(value);
	}

	public List<Pair<String, SelectionConstraints>> getCommunityForSubject(String sbj) {
		return subjectMap.get(sbj);
	}

	public List<Pair<String, SelectionConstraints>> getCommunityForDatasource(String dts) {
		return datasourceMap.get(dts);
	}

	public List<String> getCommunityForDatasource(
		final String dts, final Map<String, List<String>> param) {
		List<Pair<String, SelectionConstraints>> lp = datasourceMap.get(dts);
		if (lp == null)
			return Lists.newArrayList();

		return lp
			.stream()
			.map(
				p -> {
					if (p.getSnd() == null)
						return p.getFst();
					if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
						return p.getFst();
					else
						return null;
				})
			.filter(st -> (st != null))
			.collect(Collectors.toList());
	}

	public List<Pair<String, SelectionConstraints>> getCommunityForZenodoCommunity(String zc) {
		return zenodocommunityMap.get(zc);
	}

	public List<String> getCommunityForSubjectValue(String value) {
		return getContextIds(subjectMap.get(value));
	}

	public List<String> getCommunityForDatasourceValue(String value) {
		return getContextIds(datasourceMap.get(value.toLowerCase()));
	}

	public List<String> getCommunityForZenodoCommunityValue(String value) {
		return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
	}

	private List<String> getContextIds(List<Pair<String, SelectionConstraints>> list) {
		if (list != null) {
			return list.stream().map(p -> p.getFst()).collect(Collectors.toList());
		}
		return Lists.newArrayList();
	}

	public Map<String, Community> getCommunities() {
		return communities;
	}

	public void setCommunities(Map<String, Community> communities) {
		this.communities = communities;
	}

	public String toJson() {
		GsonBuilder builder = new GsonBuilder();
		builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
		Gson gson = builder.create();

		return gson.toJson(this);
	}

	public int size() {
		return communities.keySet().size();
	}

	public Community getCommunityById(String id) {
		return communities.get(id);
	}

	public List<Community> getCommunityList() {
		return Lists.newLinkedList(communities.values());
	}
}
@ -0,0 +1,138 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
import eu.dnetlib.dhp.bulktag.criteria.Selection;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;

/** Created by miriam on 03/08/2018. */
public class CommunityConfigurationFactory {

	private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);

	private static VerbResolver resolver = VerbResolverFactory.newInstance();

	public static CommunityConfiguration newInstance(final String xml) throws DocumentException {

		log.debug(String.format("parsing community configuration from:\n%s", xml));

		final Document doc = new SAXReader().read(new StringReader(xml));

		final Map<String, Community> communities = Maps.newHashMap();

		for (final Object o : doc.selectNodes("//community")) {

			final Node node = (Node) o;

			final Community community = parseCommunity(node);

			if (community.isValid()) {
				communities.put(community.getId(), community);
			}
		}

		log.info(String.format("loaded %s community configuration profiles", communities.size()));
		log.debug(String.format("loaded community configuration:\n%s", communities.toString()));

		return new CommunityConfiguration(communities);
	}

	public static CommunityConfiguration fromJson(final String json) {
		GsonBuilder builder = new GsonBuilder();
		builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
		Gson gson = builder.create();
		final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class);
		log.info(String.format("loaded %s community configuration profiles", conf.size()));
		conf.init();
		log.info("created inverse maps");

		return conf;
	}

	private static Community parseCommunity(final Node node) {

		final Community c = new Community();

		c.setId(node.valueOf("./@id"));

		log.info(String.format("community id: %s", c.getId()));

		c.setSubjects(parseSubjects(node));
		c.setProviders(parseDatasources(node));
		c.setZenodoCommunities(parseZenodoCommunities(node));
		return c;
	}

	private static List<String> parseSubjects(final Node node) {

		final List<String> subjects = Lists.newArrayList();

		final List<Node> list = node.selectNodes("./subjects/subject");

		for (Node n : list) {
			log.debug("text of the node " + n.getText());
			subjects.add(StringUtils.trim(n.getText()));
		}
		log.info("size of the subject list " + subjects.size());
		return subjects;
	}

	private static List<Provider> parseDatasources(final Node node) {
		final List<Node> list = node.selectNodes("./datasources/datasource");
		final List<Provider> providerList = new ArrayList<>();
		for (Node n : list) {
			Provider d = new Provider();
			d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
			d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
			providerList.add(d);
		}
		log.info("size of the datasource list " + providerList.size());
		return providerList;
	}

	private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
		final Node oacommunitynode = node.selectSingleNode("./oacommunity");
		String oacommunity = null;
		if (oacommunitynode != null) {
			String tmp = oacommunitynode.getText();
			if (StringUtils.isNotBlank(tmp))
				oacommunity = tmp;
		}

		final List<Node> list = node.selectNodes("./zenodocommunities/zenodocommunity");
		final List<ZenodoCommunity> zenodoCommunityList = new ArrayList<>();
		for (Node n : list) {
			ZenodoCommunity zc = new ZenodoCommunity();
			zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText());
			zc.setSelCriteria(n.selectSingleNode("./selcriteria"));

			zenodoCommunityList.add(zc);
		}
		if (oacommunity != null) {
			ZenodoCommunity zc = new ZenodoCommunity();
			zc.setZenodoCommunityId(oacommunity);
			zenodoCommunityList.add(zc);
		}
		log.info("size of the zenodo community list " + zenodoCommunityList.size());
		return zenodoCommunityList;
	}
}
@ -0,0 +1,56 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;

import eu.dnetlib.dhp.bulktag.criteria.Selection;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

public class Constraint implements Serializable {
	private String verb;
	private String field;
	private String value;
	private Selection selection;

	public Constraint() {
	}

	public String getVerb() {
		return verb;
	}

	public void setVerb(String verb) {
		this.verb = verb;
	}

	public String getField() {
		return field;
	}

	public void setField(String field) {
		this.field = field;
	}

	public String getValue() {
		return value;
	}

	public void setValue(String value) {
		this.value = value;
	}

	public void setSelection(Selection sel) {
		selection = sel;
	}

	public void setSelection(VerbResolver resolver)
		throws InvocationTargetException, NoSuchMethodException, InstantiationException,
		IllegalAccessException {
		selection = resolver.getSelectionCriteria(verb, value);
	}

	public boolean verifyCriteria(String metadata) {
		return selection.apply(metadata);
	}
}
@ -0,0 +1,74 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Type;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

/** Created by miriam on 02/08/2018. */
public class Constraints implements Serializable {
	private static final Log log = LogFactory.getLog(Constraints.class);
	// private ConstraintEncapsulator ce;
	private List<Constraint> constraint;

	public Constraints() {
	}

	public List<Constraint> getConstraint() {
		return constraint;
	}

	public void setConstraint(List<Constraint> constraint) {
		this.constraint = constraint;
	}

	public void setSc(String json) {
		Type collectionType = new TypeToken<Collection<Constraint>>() {
		}.getType();
		constraint = new Gson().fromJson(json, collectionType);
	}

	void setSelection(VerbResolver resolver) {
		for (Constraint st : constraint) {

			try {
				st.setSelection(resolver);
			} catch (NoSuchMethodException e) {
				log.error(e.getMessage());
			} catch (IllegalAccessException e) {
				log.error(e.getMessage());
			} catch (InvocationTargetException e) {
				log.error(e.getMessage());
			} catch (InstantiationException e) {
				log.error(e.getMessage());
			}
		}
	}

	// Constraint in and
	public boolean verifyCriteria(final Map<String, List<String>> param) {

		for (Constraint sc : constraint) {
			boolean verified = false;
			for (String value : param.get(sc.getField())) {
				if (sc.verifyCriteria(value.trim())) {
					verified = true;
				}
			}
			if (!verified)
				return verified;
		}
		return true;
	}
}
@ -0,0 +1,39 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;

import com.google.gson.Gson;

/** Created by miriam on 03/08/2018. */
public class Pair<A, B> implements Serializable {
	private A fst;
	private B snd;

	public A getFst() {
		return fst;
	}

	public Pair setFst(A fst) {
		this.fst = fst;
		return this;
	}

	public B getSnd() {
		return snd;
	}

	public Pair setSnd(B snd) {
		this.snd = snd;
		return this;
	}

	public Pair(A a, B b) {
		fst = a;
		snd = b;
	}

	public String toJson() {
		return new Gson().toJson(this);
	}
}
@ -0,0 +1,12 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;
import java.util.HashMap;

public class ProtoMap extends HashMap<String, String> implements Serializable {

	public ProtoMap() {
		super();
	}
}
@ -0,0 +1,61 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Node;

import com.google.gson.Gson;

import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

/** Created by miriam on 01/08/2018. */
public class Provider implements Serializable {
	private static final Log log = LogFactory.getLog(Provider.class);

	private String openaireId;

	private SelectionConstraints selectionConstraints;

	public SelectionConstraints getSelCriteria() {
		return selectionConstraints;
	}

	public SelectionConstraints getSelectionConstraints() {
		return selectionConstraints;
	}

	public void setSelectionConstraints(SelectionConstraints selectionConstraints) {
		this.selectionConstraints = selectionConstraints;
	}

	public void setSelCriteria(SelectionConstraints selCriteria) {
		this.selectionConstraints = selCriteria;
	}

	public String getOpenaireId() {
		return openaireId;
	}

	public void setOpenaireId(String openaireId) {
		this.openaireId = openaireId;
	}

	private void setSelCriteria(String json, VerbResolver resolver) {
		log.info("Selection constraints for datasource = " + json);
		selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class);

		selectionConstraints.setSelection(resolver);
	}

	public void setSelCriteria(Node n, VerbResolver resolver) {
		try {
			setSelCriteria(n.getText(), resolver);
		} catch (Exception e) {
			log.info("not set selection criteria... ");
			selectionConstraints = null;
		}
	}
}
@ -0,0 +1,65 @@

package eu.dnetlib.dhp.bulktag.community;

import java.util.List;

import org.dom4j.DocumentException;

import com.google.common.base.Joiner;

import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class QueryInformationSystem {
	private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
		+ " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
		+ " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
		+ " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
		+ " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
		+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] "
		+ " return "
		+ " <community> "
		+ " { $x//CONFIGURATION/context/@id} "
		+ " <subjects> "
		+ " {for $y in tokenize($subj,',') "
		+ " return "
		+ " <subject>{$y}</subject>} "
		+ " </subjects> "
		+ " <datasources> "
		+ " {for $d in $datasources "
		+ " where $d/param[./@name='enabled']/text()='true' "
		+ " return "
		+ " <datasource> "
		+ " <openaireId> "
		+ " {$d//param[./@name='openaireId']/text()} "
		+ " </openaireId> "
		+ " <selcriteria> "
		+ " {$d/param[./@name='selcriteria']/text()} "
		+ " </selcriteria> "
		+ " </datasource> } "
		+ " </datasources> "
		+ " <zenodocommunities> "
		+ " {for $zc in $communities "
		+ " return "
		+ " <zenodocommunity> "
		+ " <zenodoid> "
		+ " {$zc/param[./@name='zenodoid']/text()} "
		+ " </zenodoid> "
		+ " <selcriteria> "
		+ " {$zc/param[./@name='selcriteria']/text()} "
		+ " </selcriteria> "
		+ " </zenodocommunity>} "
		+ " </zenodocommunities> "
		+ " </community>";

	public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
		throws ISLookUpException, DocumentException {
		ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
		final List<String> res = isLookUp.quickSearchProfile(XQUERY);

		final String xmlConf = "<communities>" + Joiner.on(" ").join(res) + "</communities>";

		return CommunityConfigurationFactory.newInstance(xmlConf);
	}
}
@ -0,0 +1,247 @@

package eu.dnetlib.dhp.bulktag.community;

import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;

import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;

import eu.dnetlib.dhp.schema.oaf.*;

/** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable {

	private String trust = "0.8";

	private boolean clearContext(Result result) {
		int tmp = result.getContext().size();
		List<Context> clist = result
			.getContext()
			.stream()
			.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
			.collect(Collectors.toList());
		result.setContext(clist);
		return (tmp != clist.size());
	}

	private Map<String, List<String>> getParamMap(final Result result, Map<String, String> params) {
		Map<String, List<String>> param = new HashMap<>();
		String json = new Gson().toJson(result, Result.class);
		DocumentContext jsonContext = JsonPath.parse(json);
		if (params == null) {
			params = new HashMap<>();
		}
		for (String key : params.keySet()) {
			try {
				param.put(key, jsonContext.read(params.get(key)));
			} catch (com.jayway.jsonpath.PathNotFoundException e) {
				param.put(key, new ArrayList<>());
				// throw e;
			}
		}
		return param;
	}

	public <R extends Result> R enrichContextCriteria(
		final R result, final CommunityConfiguration conf, final Map<String, String> criteria) {

		// }
		// public Result enrichContextCriteria(final Result result, final CommunityConfiguration
		// conf, final Map<String,String> criteria) {
		final Map<String, List<String>> param = getParamMap(result, criteria);

		// Verify if the entity is deletedbyinference. In case verify if to clean the context list
		// from all the zenodo communities
		if (result.getDataInfo().getDeletedbyinference()) {
			clearContext(result);
			return result;
		}

		// communities contains all the communities to be added as context for the result
		final Set<String> communities = new HashSet<>();

		// tagging for Subject
		final Set<String> subjects = new HashSet<>();
		Optional<List<StructuredProperty>> oresultsubj = Optional.ofNullable(result.getSubject());
		if (oresultsubj.isPresent()) {
			oresultsubj
				.get()
				.stream()
				.map(subject -> subject.getValue())
				.filter(StringUtils::isNotBlank)
				.map(String::toLowerCase)
				.map(String::trim)
				.collect(Collectors.toCollection(HashSet::new))
				.forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
		}

		communities.addAll(subjects);

		// Tagging for datasource
		final Set<String> datasources = new HashSet<>();
		final Set<String> tmp = new HashSet<>();

		Optional<List<Instance>> oresultinstance = Optional.ofNullable(result.getInstance());
		if (oresultinstance.isPresent()) {
			for (Instance i : oresultinstance.get()) {
				tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
				tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
			}

			oresultinstance
				.get()
				.stream()
				.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
				.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
				.map(s -> StringUtils.substringAfter(s, "|"))
				.collect(Collectors.toCollection(HashSet::new))
				.forEach(dsId -> datasources.addAll(conf.getCommunityForDatasource(dsId, param)));
		}

		communities.addAll(datasources);

		/* Tagging for Zenodo Communities */
		final Set<String> czenodo = new HashSet<>();

		Optional<List<Context>> oresultcontext = Optional.ofNullable(result.getContext());
		if (oresultcontext.isPresent()) {
			oresultcontext
				.get()
				.stream()
				.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
				.collect(Collectors.toList())
				.forEach(
					c -> czenodo
						.addAll(
							conf
								.getCommunityForZenodoCommunityValue(
									c.getId().substring(c.getId().lastIndexOf("/") + 1).trim())));
		}

		communities.addAll(czenodo);

		clearContext(result);

		/* Verify if there is something to bulktag */
		if (communities.isEmpty()) {
			return result;
		}

		result
			.getContext()
			.stream()
			.map(
				c -> {
					if (communities.contains(c.getId())) {
						Optional<List<DataInfo>> opt_dataInfoList = Optional.ofNullable(c.getDataInfo());
						List<DataInfo> dataInfoList;
						if (opt_dataInfoList.isPresent())
							dataInfoList = opt_dataInfoList.get();
						else {
							dataInfoList = new ArrayList<>();
							c.setDataInfo(dataInfoList);
						}
						if (subjects.contains(c.getId()))
							dataInfoList
								.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT));
						if (datasources.contains(c.getId()))
							dataInfoList
								.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE));
						if (czenodo.contains(c.getId()))
							dataInfoList
								.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO));
					}
					return c;
				})
			.collect(Collectors.toList());

		communities
			.removeAll(
				result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));

		if (communities.isEmpty())
			return result;

		List<Context> toaddcontext = communities
			.stream()
			.map(
				c -> {
					Context context = new Context();
					context.setId(c);
					List<DataInfo> dataInfoList = new ArrayList<>();
					if (subjects.contains(c))
						dataInfoList
							.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT));
					if (datasources.contains(c))
						dataInfoList
							.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE));
					if (czenodo.contains(c))
						dataInfoList
							.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO));
					context.setDataInfo(dataInfoList);
					return context;
				})
			.collect(Collectors.toList());

		result.getContext().addAll(toaddcontext);
		return result;
	}

	public static DataInfo getDataInfo(
		String inference_provenance, String inference_class_id, String inference_class_name) {
		DataInfo di = new DataInfo();
		di.setInferred(true);
		di.setInferenceprovenance(inference_provenance);
		di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
		return di;
	}

	public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
		Qualifier pa = new Qualifier();
		pa.setClassid(inference_class_id);
		pa.setClassname(inference_class_name);
		pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
		pa.setSchemename(DNET_PROVENANCE_ACTIONS);
		return pa;
	}
}
@ -0,0 +1,51 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;
import java.lang.reflect.Type;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

public class SelectionConstraints implements Serializable {
	private List<Constraints> criteria;

	public SelectionConstraints() {
	}

	public List<Constraints> getCriteria() {
		return criteria;
	}

	public void setCriteria(List<Constraints> criteria) {
		this.criteria = criteria;
	}

	public void setSc(String json) {
		Type collectionType = new TypeToken<Collection<Constraints>>() {
		}.getType();
		criteria = new Gson().fromJson(json, collectionType);
	}

	// Constraints in or
	public boolean verifyCriteria(final Map<String, List<String>> param) {
		for (Constraints selc : criteria) {
			if (selc.verifyCriteria(param)) {
				return true;
			}
		}
		return false;
	}

	public void setSelection(VerbResolver resolver) {

		for (Constraints cs : criteria) {
			cs.setSelection(resolver);
		}
	}
}
@ -0,0 +1,17 @@

package eu.dnetlib.dhp.bulktag.community;

public class TaggingConstants {

	public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";

	public static final String CLASS_ID_SUBJECT = "community:subject";
	public static final String CLASS_ID_DATASOURCE = "community:datasource";
	public static final String CLASS_ID_CZENODO = "community:zenodocommunity";

	public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";

	public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
	public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
	public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
}
@ -0,0 +1,45 @@

package eu.dnetlib.dhp.bulktag.community;

import java.io.Serializable;

import org.dom4j.Node;

import com.google.gson.Gson;

/** Created by miriam on 01/08/2018. */
public class ZenodoCommunity implements Serializable {

	private String zenodoCommunityId;

	private SelectionConstraints selCriteria;

	public String getZenodoCommunityId() {
		return zenodoCommunityId;
	}

	public void setZenodoCommunityId(String zenodoCommunityId) {
		this.zenodoCommunityId = zenodoCommunityId;
	}

	public SelectionConstraints getSelCriteria() {
		return selCriteria;
	}

	public void setSelCriteria(SelectionConstraints selCriteria) {
		this.selCriteria = selCriteria;
	}

	private void setSelCriteria(String json) {
		// Type collectionType = new TypeToken<Collection<Constraints>>(){}.getType();
		selCriteria = new Gson().fromJson(json, SelectionConstraints.class);
	}

	public void setSelCriteria(Node n) {
		if (n == null) {
			selCriteria = null;
		} else {
			setSelCriteria(n.getText());
		}
	}
}
@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("contains")
public class ContainsVerb implements Selection, Serializable {

	private String param;

	public ContainsVerb() {
	}

	public ContainsVerb(final String param) {
		this.param = param;
	}

	@Override
	public boolean apply(String value) {
		return value.contains(param);
	}

	public String getParam() {
		return param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}
@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("contains_ignorecase")
public class ContainsVerbIgnoreCase implements Selection, Serializable {

	private String param;

	public ContainsVerbIgnoreCase() {
	}

	public ContainsVerbIgnoreCase(final String param) {
		this.param = param;
	}

	@Override
	public boolean apply(String value) {
		return value.toLowerCase().contains(param.toLowerCase());
	}

	public String getParam() {
		return param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}
@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("equals")
public class EqualVerb implements Selection, Serializable {

	private String param;

	public EqualVerb() {
	}

	public EqualVerb(final String param) {
		this.param = param;
	}

	@Override
	public boolean apply(String value) {
		return value.equals(param);
	}

	public String getParam() {
		return param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}
@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("equals_ignorecase")
public class EqualVerbIgnoreCase implements Selection, Serializable {

	private String param;

	public EqualVerbIgnoreCase() {
	}

	public EqualVerbIgnoreCase(final String param) {
		this.param = param;
	}

	@Override
	public boolean apply(String value) {
		return value.equalsIgnoreCase(param);
	}

	public String getParam() {
		return param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}
@ -0,0 +1,43 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.lang.reflect.Type;

import com.google.gson.*;

public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {

	private static final String CLASSNAME = "CLASSNAME";
	private static final String DATA = "DATA";

	public Object deserialize(
		JsonElement jsonElement,
		Type type,
		JsonDeserializationContext jsonDeserializationContext)
		throws JsonParseException {

		JsonObject jsonObject = jsonElement.getAsJsonObject();
		JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME);
		String className = prim.getAsString();
		Class klass = getObjectClass(className);
		return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass);
	}

	public JsonElement serialize(
		Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) {
		JsonObject jsonObject = new JsonObject();
		jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName());
		jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement));
		return jsonObject;
	}

	/** **** Helper method to get the className of the object to be deserialized **** */
	public Class getObjectClass(String className) {
		try {
			return Class.forName(className);
		} catch (ClassNotFoundException e) {
			// e.printStackTrace();
			throw new JsonParseException(e.getMessage());
		}
	}
}
@@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("not_contains")
public class NotContainsVerb implements Selection, Serializable {

    private String param;

    public NotContainsVerb() {
    }

    public NotContainsVerb(final String param) {
        this.param = param;
    }

    @Override
    public boolean apply(String value) {
        return !value.contains(param);
    }

    public String getParam() {
        return param;
    }

    public void setParam(String param) {
        this.param = param;
    }
}

@@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("not_contains_ignorecase")
public class NotContainsVerbIgnoreCase implements Selection, Serializable {

    private String param;

    public NotContainsVerbIgnoreCase() {
    }

    public NotContainsVerbIgnoreCase(final String param) {
        this.param = param;
    }

    @Override
    public boolean apply(String value) {
        return !(value.toLowerCase().contains(param.toLowerCase()));
    }

    public String getParam() {
        return param;
    }

    public void setParam(String param) {
        this.param = param;
    }
}

@@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("not_equals")
public class NotEqualVerb implements Selection, Serializable {

    private String param;

    public NotEqualVerb(final String param) {
        this.param = param;
    }

    public NotEqualVerb() {
    }

    public String getParam() {
        return param;
    }

    public void setParam(String param) {
        this.param = param;
    }

    @Override
    public boolean apply(String value) {
        return !value.equals(param);
    }
}

@@ -0,0 +1,30 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;

@VerbClass("not_equals_ignorecase")
public class NotEqualVerbIgnoreCase implements Selection, Serializable {

    private String param;

    public NotEqualVerbIgnoreCase(final String param) {
        this.param = param;
    }

    public NotEqualVerbIgnoreCase() {
    }

    public String getParam() {
        return param;
    }

    public void setParam(String param) {
        this.param = param;
    }

    @Override
    public boolean apply(String value) {
        return !value.equalsIgnoreCase(param);
    }
}

@@ -0,0 +1,7 @@

package eu.dnetlib.dhp.bulktag.criteria;

public interface Selection {

    boolean apply(String value);
}

@@ -0,0 +1,14 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
@interface VerbClass {

    String value();
}

@@ -0,0 +1,56 @@

package eu.dnetlib.dhp.bulktag.criteria;

import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;

import io.github.classgraph.ClassGraph;
import io.github.classgraph.ClassInfo;
import io.github.classgraph.ClassInfoList;
import io.github.classgraph.ScanResult;

public class VerbResolver implements Serializable {

    private Map<String, Class<Selection>> map = null; // = new HashMap<>();
    private final ClassGraph classgraph = new ClassGraph();

    public VerbResolver() {

        try (ScanResult scanResult = // Assign scanResult in try-with-resources
            classgraph // Create a new ClassGraph instance
                .verbose() // If you want to enable logging to stderr
                .enableAllInfo() // Scan classes, methods, fields, annotations
                .whitelistPackages(
                    "eu.dnetlib.dhp.bulktag.criteria") // Scan the criteria package and its subpackages
                .scan()) { // Perform the scan and return a ScanResult

            ClassInfoList routeClassInfoList = scanResult
                .getClassesWithAnnotation(
                    "eu.dnetlib.dhp.bulktag.criteria.VerbClass");

            this.map = routeClassInfoList
                .stream()
                .collect(
                    Collectors
                        .toMap(
                            value -> (String) ((ClassInfo) value)
                                .getAnnotationInfo()
                                .get(0)
                                .getParameterValues()
                                .get(0)
                                .getValue(),
                            value -> (Class<Selection>) ((ClassInfo) value).loadClass()));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public Selection getSelectionCriteria(String name, String param)
        throws NoSuchMethodException, IllegalAccessException, InvocationTargetException,
        InstantiationException {

        // return Class.forName(tmp_map.get(name)).
        return map.get(name).getDeclaredConstructor((String.class)).newInstance(param);
    }
}

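A minimal usage sketch (hypothetical class name, assuming the classes above are on the classpath together with the classgraph dependency) of how a verb name and parameter resolve to a Selection:

package eu.dnetlib.dhp.bulktag.criteria;

// Hypothetical example, not part of the module: resolves a verb name to its
// @VerbClass-annotated Selection implementation and applies it to a value.
public class VerbResolverUsageExample {

    public static void main(String[] args) throws Exception {
        // The resolver scans this package for @VerbClass-annotated Selection implementations
        VerbResolver resolver = VerbResolverFactory.newInstance();

        // "not_contains" maps to NotContainsVerb through its @VerbClass annotation
        Selection criteria = resolver.getSelectionCriteria("not_contains", "closed");

        System.out.println(criteria.apply("open access"));   // true: value does not contain "closed"
        System.out.println(criteria.apply("closed access")); // false
    }
}
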
@@ -0,0 +1,10 @@

package eu.dnetlib.dhp.bulktag.criteria;

public class VerbResolverFactory {

    public static VerbResolver newInstance() {

        return new VerbResolver();
    }
}

@@ -0,0 +1,25 @@

package eu.dnetlib.dhp.countrypropagation;

import java.io.Serializable;

public class CountrySbs implements Serializable {
    private String classid;
    private String classname;

    public String getClassid() {
        return classid;
    }

    public void setClassid(String classid) {
        this.classid = classid;
    }

    public String getClassname() {
        return classname;
    }

    public void setClassname(String classname) {
        this.classname = classname;
    }
}

@@ -0,0 +1,25 @@

package eu.dnetlib.dhp.countrypropagation;

import java.io.Serializable;

public class DatasourceCountry implements Serializable {
    private String dataSourceId;
    private CountrySbs country;

    public String getDataSourceId() {
        return dataSourceId;
    }

    public void setDataSourceId(String dataSourceId) {
        this.dataSourceId = dataSourceId;
    }

    public CountrySbs getCountry() {
        return country;
    }

    public void setCountry(CountrySbs country) {
        this.country = country;
    }
}

@@ -0,0 +1,121 @@

package eu.dnetlib.dhp.countrypropagation;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;

/**
 * Computes the association between a country and a datasource. The association is computed only for datasources of
 * specific types or with whitelisted ids. The country is registered in the Organization associated to the Datasource,
 * so the relation between Datasource and Organization is exploited to get the country for the datasource.
 */
public class PrepareDatasourceCountryAssociation {

    private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);

    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                PrepareDatasourceCountryAssociation.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                removeOutputDir(spark, outputPath);
                prepareDatasourceCountryAssociation(
                    spark,
                    Arrays.asList(parser.get("whitelist").split(";")),
                    Arrays.asList(parser.get("allowedtypes").split(";")),
                    inputPath,
                    outputPath);
            });
    }

    private static void prepareDatasourceCountryAssociation(
        SparkSession spark,
        List<String> whitelist,
        List<String> allowedtypes,
        String inputPath,
        String outputPath) {
        String whitelisted = "";
        for (String i : whitelist) {
            whitelisted += " OR id = '" + i + "'";
        }

        Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
        Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
        Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);

        datasource.createOrReplaceTempView("datasource");
        relation.createOrReplaceTempView("relation");
        organization.createOrReplaceTempView("organization");

        String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
            + "FROM ( SELECT id "
            + " FROM datasource "
            + " WHERE (datainfo.deletedbyinference = false "
            + whitelisted
            + ") "
            + getConstraintList("datasourcetype.classid = '", allowedtypes)
            + ") d "
            + "JOIN ( SELECT source, target "
            + " FROM relation "
            + " WHERE relclass = '"
            + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
            + "' "
            + " AND datainfo.deletedbyinference = false ) rel "
            + "ON d.id = rel.source "
            + "JOIN (SELECT id, country "
            + " FROM organization "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND length(country.classid) > 0) o "
            + "ON o.id = rel.target";

        spark
            .sql(query)
            .as(Encoders.bean(DatasourceCountry.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputPath);
    }
}

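To make the string assembly above concrete, here is a small stand-alone sketch (with made-up datasource ids) of the clause the whitelist loop produces before it is appended to the datasource subquery:

package eu.dnetlib.dhp.countrypropagation;

import java.util.Arrays;
import java.util.List;

// Hypothetical example: shows the OR-clause composed from a whitelist of datasource ids.
public class WhitelistClauseSketch {

    public static void main(String[] args) {
        List<String> whitelist = Arrays.asList("10|opendoar____::abc", "10|re3data_____::xyz"); // made-up ids

        String whitelisted = "";
        for (String i : whitelist) {
            whitelisted += " OR id = '" + i + "'";
        }

        // prints:  OR id = '10|opendoar____::abc' OR id = '10|re3data_____::xyz'
        System.out.println(whitelisted);
    }
}
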
@@ -0,0 +1,98 @@

package eu.dnetlib.dhp.countrypropagation;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;

public class PrepareResultCountrySet {
    private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);

    private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
        + "FROM ( SELECT id, country "
        + "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
        + "UNION ALL "
        + "SELECT id, country FROM datasource_country "
        + "JOIN cfhb ON hb = dataSourceId ) tmp "
        + "GROUP BY id";

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareResultCountrySet.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String datasourcecountrypath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", datasourcecountrypath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                getPotentialResultToUpdate(
                    spark,
                    inputPath,
                    outputPath,
                    datasourcecountrypath,
                    resultClazz);
            });
    }

    private static <R extends Result> void getPotentialResultToUpdate(
        SparkSession spark,
        String inputPath,
        String outputPath,
        String datasourcecountrypath,
        Class<R> resultClazz) {

        Dataset<R> result = readPath(spark, inputPath, resultClazz);
        result.createOrReplaceTempView("result");
        // log.info("number of results: {}", result.count());
        createCfHbforResult(spark);

        Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);

        datasource_country.createOrReplaceTempView("datasource_country");
        // log.info("datasource_country number : {}", datasource_country.count());

        spark
            .sql(RESULT_COUNTRYSET_QUERY)
            .as(Encoders.bean(ResultCountrySet.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Append)
            .json(outputPath);
    }

}

@@ -0,0 +1,26 @@

package eu.dnetlib.dhp.countrypropagation;

import java.io.Serializable;
import java.util.ArrayList;

public class ResultCountrySet implements Serializable {
    private String resultId;
    private ArrayList<CountrySbs> countrySet;

    public String getResultId() {
        return resultId;
    }

    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    public ArrayList<CountrySbs> getCountrySet() {
        return countrySet;
    }

    public void setCountrySet(ArrayList<CountrySbs> countrySet) {
        this.countrySet = countrySet;
    }
}

@@ -0,0 +1,132 @@

package eu.dnetlib.dhp.countrypropagation;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;

public class SparkCountryPropagationJob {

    private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                SparkCountryPropagationJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String sourcePath = parser.get("sourcePath");
        log.info("sourcePath: {}", sourcePath);

        String preparedInfoPath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", preparedInfoPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final Boolean saveGraph = Optional
            .ofNullable(parser.get("saveGraph"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("saveGraph: {}", saveGraph);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> execPropagation(
                spark,
                sourcePath,
                preparedInfoPath,
                outputPath,
                resultClazz,
                saveGraph));
    }

    private static <R extends Result> void execPropagation(
        SparkSession spark,
        String sourcePath,
        String preparedInfoPath,
        String outputPath,
        Class<R> resultClazz,
        boolean saveGraph) {

        if (saveGraph) {
            // updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
            log.info("Reading Graph table from: {}", sourcePath);
            Dataset<R> res = readPath(spark, sourcePath, resultClazz);

            log.info("Reading prepared info: {}", preparedInfoPath);
            Dataset<ResultCountrySet> prepared = spark
                .read()
                .json(preparedInfoPath)
                .as(Encoders.bean(ResultCountrySet.class));

            res
                .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
                .map(getCountryMergeFn(), Encoders.bean(resultClazz))
                .write()
                .option("compression", "gzip")
                .mode(SaveMode.Overwrite)
                .json(outputPath);
        }
    }

    private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
        return (MapFunction<Tuple2<R, ResultCountrySet>, R>) t -> {
            Optional.ofNullable(t._2()).ifPresent(r -> {
                t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
            });
            return t._1();
        };
    }

    private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
        HashSet<String> countries = c1
            .stream()
            .map(c -> c.getClassid())
            .collect(Collectors.toCollection(HashSet::new));

        return c2
            .stream()
            .filter(c -> !countries.contains(c.getClassid()))
            .map(c -> getCountry(c.getClassid(), c.getClassname()))
            .collect(Collectors.toList());
    }

}

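For clarity, a stand-alone sketch of the deduplication performed by merge(): only prepared countries whose classid is not already on the result are added. The class below is hypothetical and only uses CountrySbs from this module.

package eu.dnetlib.dhp.countrypropagation;

import java.util.*;
import java.util.stream.Collectors;

// Hypothetical example: mirrors the classid-based dedup used by getCountryMergeFn()/merge().
public class CountryMergeSketch {

    public static void main(String[] args) {
        List<String> existingClassids = Arrays.asList("NL"); // classids already on the result
        List<CountrySbs> prepared = new ArrayList<>();
        prepared.add(country("NL", "Netherlands"));
        prepared.add(country("IT", "Italy"));

        Set<String> seen = new HashSet<>(existingClassids);
        List<CountrySbs> toAdd = prepared
            .stream()
            .filter(c -> !seen.contains(c.getClassid()))
            .collect(Collectors.toList());

        toAdd.forEach(c -> System.out.println(c.getClassid())); // prints only IT
    }

    private static CountrySbs country(String classid, String classname) {
        CountrySbs c = new CountrySbs();
        c.setClassid(classid);
        c.setClassname(classname);
        return c;
    }
}
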
@@ -0,0 +1,43 @@

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

public class AutoritativeAuthor {

    private String name;
    private String surname;
    private String fullname;
    private String orcid;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSurname() {
        return surname;
    }

    public void setSurname(String surname) {
        this.surname = surname;
    }

    public String getFullname() {
        return fullname;
    }

    public void setFullname(String fullname) {
        this.fullname = fullname;
    }

    public String getOrcid() {
        return orcid;
    }

    public void setOrcid(String orcid) {
        this.orcid = orcid;
    }

}

@@ -0,0 +1,125 @@

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;

public class PrepareResultOrcidAssociationStep1 {
    private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);

    public static void main(String[] args) throws Exception {
        String jsonConf = IOUtils
            .toString(
                PrepareResultOrcidAssociationStep1.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
        log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

        final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
        log.info("resultType: {}", resultType);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        String inputRelationPath = inputPath + "/relation";
        log.info("inputRelationPath: {}", inputRelationPath);

        String inputResultPath = inputPath + "/" + resultType;
        log.info("inputResultPath: {}", inputResultPath);

        String outputResultPath = outputPath + "/" + resultType;
        log.info("outputResultPath: {}", outputResultPath);

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                prepareInfo(
                    spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
            });
    }

    private static <R extends Result> void prepareInfo(
        SparkSession spark,
        String inputRelationPath,
        String inputResultPath,
        String outputResultPath,
        Class<R> resultClazz,
        List<String> allowedsemrel) {

        Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class);
        relation.createOrReplaceTempView("relation");

        log.info("Reading Graph table from: {}", inputResultPath);
        Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
        result.createOrReplaceTempView("result");

        String query = " select target resultId, author authorList"
            + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
            + " from ( "
            + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
            + " from result "
            + " lateral view explode (author) a as MyT "
            + " lateral view explode (MyT.pid) p as MyP "
            + " where MyP.qualifier.classid = 'ORCID') tmp "
            + " group by id) r_t "
            + " join ("
            + " select source, target "
            + " from relation "
            + " where datainfo.deletedbyinference = false "
            + getConstraintList(" relclass = '", allowedsemrel)
            + ") rel_rel "
            + " on source = id";
        spark
            .sql(query)
            .as(Encoders.bean(ResultOrcidList.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputResultPath);
    }

}

@@ -0,0 +1,97 @@

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import scala.Tuple2;

public class PrepareResultOrcidAssociationStep2 {
    private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareResultOrcidAssociationStep2.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                mergeInfo(spark, inputPath, outputPath);
            });
    }

    private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {

        Dataset<ResultOrcidList> resultOrcidAssoc = readPath(spark, inputPath + "/publication", ResultOrcidList.class)
            .union(readPath(spark, inputPath + "/dataset", ResultOrcidList.class))
            .union(readPath(spark, inputPath + "/otherresearchproduct", ResultOrcidList.class))
            .union(readPath(spark, inputPath + "/software", ResultOrcidList.class));

        resultOrcidAssoc
            .toJavaRDD()
            .mapToPair(r -> new Tuple2<>(r.getResultId(), r))
            .reduceByKey(
                (a, b) -> {
                    if (a == null) {
                        return b;
                    }
                    if (b == null) {
                        return a;
                    }
                    Set<String> orcid_set = new HashSet<>();
                    a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
                    b
                        .getAuthorList()
                        .stream()
                        .forEach(
                            aa -> {
                                if (!orcid_set.contains(aa.getOrcid())) {
                                    a.getAuthorList().add(aa);
                                    orcid_set.add(aa.getOrcid());
                                }
                            });
                    return a;
                })
            .map(c -> c._2())
            .map(r -> OBJECT_MAPPER.writeValueAsString(r))
            .saveAsTextFile(outputPath, GzipCodec.class);
    }

}

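A stand-alone sketch (hypothetical class and ids) of the reduce step above: author lists collected for the same resultId are merged, keeping a single entry per ORCID.

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import java.util.HashSet;
import java.util.Set;

// Hypothetical example: replays the per-resultId merge used in reduceByKey, deduplicating by ORCID.
public class AuthorListMergeSketch {

    public static void main(String[] args) {
        ResultOrcidList a = new ResultOrcidList();
        a.setResultId("50|someResultId"); // made-up id
        a.getAuthorList().add(author("0000-0001-0000-0001"));

        ResultOrcidList b = new ResultOrcidList();
        b.setResultId("50|someResultId");
        b.getAuthorList().add(author("0000-0001-0000-0001")); // duplicate ORCID, skipped
        b.getAuthorList().add(author("0000-0002-0000-0002"));

        Set<String> orcids = new HashSet<>();
        a.getAuthorList().forEach(aa -> orcids.add(aa.getOrcid()));
        b.getAuthorList().forEach(aa -> {
            if (!orcids.contains(aa.getOrcid())) {
                a.getAuthorList().add(aa);
                orcids.add(aa.getOrcid());
            }
        });

        System.out.println(a.getAuthorList().size()); // 2
    }

    private static AutoritativeAuthor author(String orcid) {
        AutoritativeAuthor aa = new AutoritativeAuthor();
        aa.setOrcid(orcid);
        return aa;
    }
}
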
@@ -0,0 +1,27 @@

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

public class ResultOrcidList implements Serializable {
    String resultId;
    List<AutoritativeAuthor> authorList = new ArrayList<>();

    public String getResultId() {
        return resultId;
    }

    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    public List<AutoritativeAuthor> getAuthorList() {
        return authorList;
    }

    public void setAuthorList(List<AutoritativeAuthor> authorList) {
        this.authorList = authorList;
    }
}

@@ -0,0 +1,199 @@

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;

public class SparkOrcidToResultFromSemRelJob {
    private static final Logger log = LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkOrcidToResultFromSemRelJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String possibleUpdates = parser.get("possibleUpdatesPath");
        log.info("possibleUpdatesPath: {}", possibleUpdates);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final Boolean saveGraph = Optional
            .ofNullable(parser.get("saveGraph"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("saveGraph: {}", saveGraph);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                if (saveGraph)
                    execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
            });
    }

    private static <R extends Result> void execPropagation(
        SparkSession spark,
        String possibleUpdatesPath,
        String inputPath,
        String outputPath,
        Class<R> resultClazz) {

        // read possible updates (resultId and list of possible orcid to add)
        Dataset<ResultOrcidList> possible_updates = readPath(spark, possibleUpdatesPath, ResultOrcidList.class);
        // read the result we have been considering
        Dataset<R> result = readPath(spark, inputPath, resultClazz);
        // make join result left_outer with possible updates

        result
            .joinWith(
                possible_updates,
                result.col("id").equalTo(possible_updates.col("resultId")),
                "left_outer")
            .map(authorEnrichFn(), Encoders.bean(resultClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    private static <R extends Result> MapFunction<Tuple2<R, ResultOrcidList>, R> authorEnrichFn() {
        return (MapFunction<Tuple2<R, ResultOrcidList>, R>) value -> {
            R ret = value._1();
            Optional<ResultOrcidList> rol = Optional.ofNullable(value._2());
            if (rol.isPresent()) {
                List<Author> toenrich_author = ret.getAuthor();
                List<AutoritativeAuthor> autoritativeAuthors = rol.get().getAuthorList();
                for (Author author : toenrich_author) {
                    if (!containsAllowedPid(author)) {
                        enrichAuthor(author, autoritativeAuthors);
                    }
                }
            }

            return ret;
        };
    }

    private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
        for (AutoritativeAuthor aa : au) {
            if (enrichAuthor(aa, a)) {
                return;
            }
        }
    }

    private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
        boolean toaddpid = false;

        if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) {
            if (StringUtils.isNoneEmpty(author.getSurname())) {
                if (autoritative_author
                    .getSurname()
                    .trim()
                    .equalsIgnoreCase(author.getSurname().trim())) {

                    // have the same surname. Check the name
                    if (StringUtils.isNoneEmpty(autoritative_author.getName())) {
                        if (StringUtils.isNoneEmpty(author.getName())) {
                            if (autoritative_author
                                .getName()
                                .trim()
                                .equalsIgnoreCase(author.getName().trim())) {
                                toaddpid = true;
                            }
                            // they could be written differently (i.e. only the initials of the name
                            // in one of the two): compare the first character of the names
                            if (autoritative_author
                                .getName()
                                .trim()
                                .substring(0, 1)
                                .equalsIgnoreCase(author.getName().trim().substring(0, 1))) {
                                toaddpid = true;
                            }
                        }
                    }
                }
            }
        }
        if (toaddpid) {
            StructuredProperty p = new StructuredProperty();
            p.setValue(autoritative_author.getOrcid());
            p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID));
            p
                .setDataInfo(
                    getDataInfo(
                        PROPAGATION_DATA_INFO_TYPE,
                        PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID,
                        PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME));

            Optional<List<StructuredProperty>> authorPid = Optional.ofNullable(author.getPid());
            if (authorPid.isPresent()) {
                authorPid.get().add(p);
            } else {
                author.setPid(Lists.newArrayList(p));
            }

        }
        return toaddpid;
    }

    private static boolean containsAllowedPid(Author a) {
        Optional<List<StructuredProperty>> pids = Optional.ofNullable(a.getPid());
        if (!pids.isPresent()) {
            return false;
        }
        for (StructuredProperty pid : pids.get()) {
            if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) {
                return true;
            }
        }
        return false;
    }
}

@@ -0,0 +1,126 @@

package eu.dnetlib.dhp.projecttoresult;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.PropagationConstant.getConstraintList;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class PrepareProjectResultsAssociation {
    private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);

    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                PrepareProjectResultsAssociation.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String potentialUpdatePath = parser.get("potentialUpdatePath");
        log.info("potentialUpdatePath: {}", potentialUpdatePath);

        String alreadyLinkedPath = parser.get("alreadyLinkedPath");
        log.info("alreadyLinkedPath: {}", alreadyLinkedPath);

        final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
        log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                prepareResultProjProjectResults(
                    spark,
                    inputPath,
                    potentialUpdatePath,
                    alreadyLinkedPath,
                    allowedsemrel);
            });
    }

    private static void prepareResultProjProjectResults(
        SparkSession spark,
        String inputPath,
        String potentialUpdatePath,
        String alreadyLinkedPath,
        List<String> allowedsemrel) {

        Dataset<Relation> relation = readPath(spark, inputPath, Relation.class);
        relation.createOrReplaceTempView("relation");

        String resproj_relation_query = "SELECT source, target "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND relClass = '"
            + RELATION_RESULT_PROJECT_REL_CLASS
            + "'";

        Dataset<Row> resproj_relation = spark.sql(resproj_relation_query);
        resproj_relation.createOrReplaceTempView("resproj_relation");

        String potential_update_query = "SELECT resultId, collect_set(projectId) projectSet "
            + "FROM ( "
            + "SELECT r1.target resultId, r2.target projectId "
            + " FROM (SELECT source, target "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + getConstraintList(" relClass = '", allowedsemrel)
            + " ) r1"
            + " JOIN resproj_relation r2 "
            + " ON r1.source = r2.source "
            + " ) tmp "
            + "GROUP BY resultId ";

        spark
            .sql(potential_update_query)
            .as(Encoders.bean(ResultProjectSet.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(potentialUpdatePath);

        String result_projectset_query = "SELECT source resultId, collect_set(target) projectSet "
            + "FROM resproj_relation "
            + "GROUP BY source";

        spark
            .sql(result_projectset_query)
            .as(Encoders.bean(ResultProjectSet.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(alreadyLinkedPath);
    }

}

@@ -0,0 +1,26 @@

package eu.dnetlib.dhp.projecttoresult;

import java.io.Serializable;
import java.util.ArrayList;

public class ResultProjectSet implements Serializable {
    private String resultId;
    private ArrayList<String> projectSet;

    public String getResultId() {
        return resultId;
    }

    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    public ArrayList<String> getProjectSet() {
        return projectSet;
    }

    public void setProjectSet(ArrayList<String> project) {
        this.projectSet = project;
    }
}

@@ -0,0 +1,147 @@

package eu.dnetlib.dhp.projecttoresult;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class SparkResultToProjectThroughSemRelJob {

    private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);

    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                SparkResultToProjectThroughSemRelJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String potentialUpdatePath = parser.get("potentialUpdatePath");
        log.info("potentialUpdatePath: {}", potentialUpdatePath);

        final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
        log.info("alreadyLinkedPath: {}", alreadyLinkedPath);

        final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
        log.info("saveGraph: {}", saveGraph);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                execPropagation(
                    spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
            });
    }

    private static void execPropagation(
        SparkSession spark,
        String outputPath,
        String alreadyLinkedPath,
        String potentialUpdatePath,
        Boolean saveGraph) {

        Dataset<ResultProjectSet> toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
        Dataset<ResultProjectSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);

        if (saveGraph) {
            toaddrelations
                .joinWith(
                    alreadyLinked,
                    toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")),
                    "left_outer")
                .flatMap(mapRelationRn(), Encoders.bean(Relation.class))
                .write()
                .mode(SaveMode.Append)
                .option("compression", "gzip")
                .json(outputPath);
        }
    }

    private static FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation> mapRelationRn() {
        return (FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation>) value -> {
            List<Relation> new_relations = new ArrayList<>();
            ResultProjectSet potential_update = value._1();
            Optional<ResultProjectSet> already_linked = Optional.ofNullable(value._2());
            if (already_linked.isPresent()) {
                already_linked
                    .get()
                    .getProjectSet()
                    .stream()
                    .forEach(
                        (p -> {
                            if (potential_update
                                .getProjectSet()
                                .contains(p)) {
                                potential_update.getProjectSet().remove(p);
                            }
                        }));
            }
            String resId = potential_update.getResultId();
            potential_update
                .getProjectSet()
                .stream()
                .forEach(
                    projectId -> {
                        new_relations
                            .add(
                                getRelation(
                                    resId,
                                    projectId,
                                    RELATION_RESULT_PROJECT_REL_CLASS,
                                    RELATION_RESULTPROJECT_REL_TYPE,
                                    RELATION_RESULTPROJECT_SUBREL_TYPE,
                                    PROPAGATION_DATA_INFO_TYPE,
                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
                        new_relations
                            .add(
                                getRelation(
                                    projectId,
                                    resId,
                                    RELATION_PROJECT_RESULT_REL_CLASS,
                                    RELATION_RESULTPROJECT_REL_TYPE,
                                    RELATION_RESULTPROJECT_SUBREL_TYPE,
                                    PROPAGATION_DATA_INFO_TYPE,
                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
                    });
            return new_relations.iterator();
        };
    }

}

@@ -0,0 +1,21 @@

package eu.dnetlib.dhp.resulttocommunityfromorganization;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class OrganizationMap extends HashMap<String, List<String>> {

    public OrganizationMap() {
        super();
    }

    public List<String> get(String key) {

        if (super.get(key) == null) {
            return new ArrayList<>();
        }
        return super.get(key);
    }
}

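A small, hypothetical usage sketch of the null-safe lookup OrganizationMap provides:

package eu.dnetlib.dhp.resulttocommunityfromorganization;

import java.util.Arrays;

// Hypothetical example: missing organization ids yield an empty list instead of null.
public class OrganizationMapSketch {

    public static void main(String[] args) {
        OrganizationMap map = new OrganizationMap();
        map.put("20|someOrgId", Arrays.asList("covid-19")); // made-up org id and community

        System.out.println(map.get("20|someOrgId"));  // [covid-19]
        System.out.println(map.get("20|unknownOrg")); // [] (empty list instead of null)
    }
}
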
@@ -0,0 +1,130 @@

package eu.dnetlib.dhp.resulttocommunityfromorganization;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class PrepareResultCommunitySet {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareResultCommunitySet.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final OrganizationMap organizationMap = new Gson()
            .fromJson(
                parser.get("organizationtoresultcommunitymap"),
                OrganizationMap.class);
        log.info("organizationMap: {}", new Gson().toJson(organizationMap));

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                prepareInfo(spark, inputPath, outputPath, organizationMap);
            });
    }

    private static void prepareInfo(
        SparkSession spark,
        String inputPath,
        String outputPath,
        OrganizationMap organizationMap) {

        Dataset<Relation> relation = readPath(spark, inputPath, Relation.class);
        relation.createOrReplaceTempView("relation");

        String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges "
            + "FROM (SELECT source, target "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND relClass = '"
            + RELATION_RESULT_ORGANIZATION_REL_CLASS
            + "') result_organization "
            + "LEFT JOIN (SELECT source, collect_set(target) org_set "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND relClass = '"
            + RELATION_REPRESENTATIVERESULT_RESULT_CLASS
            + "' "
            + " GROUP BY source) organization_organization "
            + "ON result_organization.target = organization_organization.source ";

        Dataset<ResultOrganizations> result_organizationset = spark
            .sql(query)
            .as(Encoders.bean(ResultOrganizations.class));

        result_organizationset
            .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
            .filter(Objects::nonNull)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
        OrganizationMap organizationMap) {
        return (MapFunction<ResultOrganizations, ResultCommunityList>) value -> {
            String rId = value.getResultId();
            Optional<List<String>> orgs = Optional.ofNullable(value.getMerges());
            String oTarget = value.getOrgId();
            Set<String> communitySet = new HashSet<>();
            if (organizationMap.containsKey(oTarget)) {
                communitySet.addAll(organizationMap.get(oTarget));
            }
            if (orgs.isPresent())
                for (String oId : orgs.get()) {
                    if (organizationMap.containsKey(oId)) {
                        communitySet.addAll(organizationMap.get(oId));
                    }
                }
            if (communitySet.size() > 0) {
                ResultCommunityList rcl = new ResultCommunityList();
                rcl.setResultId(rId);
                ArrayList<String> communityList = new ArrayList<>();
                communityList.addAll(communitySet);
                rcl.setCommunityList(communityList);
                return rcl;
            }
            return null;
        };
    }
}

|
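For intuition, a hedged sketch of the record shapes involved in the preparation step above; the identifiers and community names are invented for illustration only:

	// hypothetical row produced by the SQL join (result -> organization, plus merged organizations)
	ResultOrganizations ro = new ResultOrganizations();
	ro.setResultId("50|doi_________::1234");
	ro.setOrgId("20|dedup_wf_001::abcd");
	ro.setMerges(new ArrayList<>(Arrays.asList("20|openaire____::efgh")));

	// assuming an organizationMap containing
	//   "20|dedup_wf_001::abcd" -> [covid-19]   and   "20|openaire____::efgh" -> [dh-ch]
	// the map function above would emit the union of the two community sets:
	//   {"resultId": "50|doi_________::1234", "communityList": ["covid-19", "dh-ch"]}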
@ -0,0 +1,26 @@

package eu.dnetlib.dhp.resulttocommunityfromorganization;

import java.io.Serializable;
import java.util.ArrayList;

public class ResultCommunityList implements Serializable {
	private String resultId;
	private ArrayList<String> communityList;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String resultId) {
		this.resultId = resultId;
	}

	public ArrayList<String> getCommunityList() {
		return communityList;
	}

	public void setCommunityList(ArrayList<String> communityList) {
		this.communityList = communityList;
	}
}
@ -0,0 +1,35 @@

package eu.dnetlib.dhp.resulttocommunityfromorganization;

import java.io.Serializable;
import java.util.ArrayList;

public class ResultOrganizations implements Serializable {
	private String resultId;
	private String orgId;
	private ArrayList<String> merges;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String resultId) {
		this.resultId = resultId;
	}

	public String getOrgId() {
		return orgId;
	}

	public void setOrgId(String orgId) {
		this.orgId = orgId;
	}

	public ArrayList<String> getMerges() {
		return merges;
	}

	public void setMerges(ArrayList<String> merges) {
		this.merges = merges;
	}
}
@ -0,0 +1,137 @@

package eu.dnetlib.dhp.resulttocommunityfromorganization;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

public class SparkResultToCommunityFromOrganizationJob {

	private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkResultToCommunityFromOrganizationJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String possibleupdatespath = parser.get("preparedInfoPath");
		log.info("preparedInfoPath: {}", possibleupdatespath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final Boolean saveGraph = Optional
			.ofNullable(parser.get("saveGraph"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("saveGraph: {}", saveGraph);

		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				if (isTest(parser)) {
					removeOutputDir(spark, outputPath);
				}
				if (saveGraph)
					execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
			});
	}

	private static <R extends Result> void execPropagation(
		SparkSession spark,
		String inputPath,
		String outputPath,
		Class<R> resultClazz,
		String possibleUpdatesPath) {

		Dataset<ResultCommunityList> possibleUpdates = readPath(spark, possibleUpdatesPath, ResultCommunityList.class);
		Dataset<R> result = readPath(spark, inputPath, resultClazz);

		result
			.joinWith(
				possibleUpdates,
				result.col("id").equalTo(possibleUpdates.col("resultId")),
				"left_outer")
			.map(resultCommunityFn(), Encoders.bean(resultClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> resultCommunityFn() {
		return (MapFunction<Tuple2<R, ResultCommunityList>, R>) value -> {
			R ret = value._1();
			Optional<ResultCommunityList> rcl = Optional.ofNullable(value._2());
			if (rcl.isPresent()) {
				ArrayList<String> communitySet = rcl.get().getCommunityList();
				List<String> contextList = ret
					.getContext()
					.stream()
					.map(con -> con.getId())
					.collect(Collectors.toList());
				Result res = new Result();
				res.setId(ret.getId());
				List<Context> propagatedContexts = new ArrayList<>();
				for (String cId : communitySet) {
					if (!contextList.contains(cId)) {
						Context newContext = new Context();
						newContext.setId(cId);
						newContext
							.setDataInfo(
								Arrays
									.asList(
										getDataInfo(
											PROPAGATION_DATA_INFO_TYPE,
											PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID,
											PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME)));
						propagatedContexts.add(newContext);
					}
				}
				res.setContext(propagatedContexts);
				ret.mergeFrom(res);
			}
			return ret;
		};
	}

}
@ -0,0 +1,167 @@

package eu.dnetlib.dhp.resulttocommunityfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class PrepareResultCommunitySetStep1 {
	private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class);

	private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')"
		+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']"
		+ " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'"
		+ " return $x//CONFIGURATION/context/@id/string()";

	/**
	 * associates to each result the set of community contexts they are associated to; associates to each target of a
	 * relation with allowed semantics the set of community context it could possibly inherit from the source of the
	 * relation
	 */
	// TODO
	private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
		+ "from (select id, collect_set(co.id) community_context "
		+ "      from result "
		+ "      lateral view explode (context) c as co "
		+ "      where datainfo.deletedbyinference = false %s group by id) p "
		+ " JOIN "
		+ " (select source, target from relation "
		+ "  where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";

	/**
	 * a dataset for example could be linked to more than one publication. For each publication linked to that dataset
	 * the previous query will produce a row: targetId set of community context the target could possibly inherit with
	 * the following query there will be a single row for each result linked to more than one result of the result type
	 * currently being used
	 */
	// TODO
	private static final String RESULT_COMMUNITY_LIST_QUERY = "select resultId , collect_set(co) communityList "
		+ "from result_context "
		+ "lateral view explode (community_context) c as co "
		+ "where length(co) > 0 "
		+ "group by resultId";

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultCommunitySetStep1.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
		log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

		final String isLookupUrl = parser.get("isLookUpUrl");
		log.info("isLookupUrl: {}", isLookupUrl);

		final List<String> communityIdList = getCommunityList(isLookupUrl);
		log.info("communityIdList: {}", new Gson().toJson(communityIdList));

		final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
		log.info("resultType: {}", resultType);

		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				if (isTest(parser)) {
					removeOutputDir(spark, outputPath);
				}
				prepareInfo(
					spark,
					inputPath,
					outputPath,
					allowedsemrel,
					resultClazz,
					resultType,
					communityIdList);
			});
	}

	private static <R extends Result> void prepareInfo(
		SparkSession spark,
		String inputPath,
		String outputPath,
		List<String> allowedsemrel,
		Class<R> resultClazz,
		String resultType,
		List<String> communityIdList) {

		final String inputResultPath = inputPath + "/" + resultType;
		log.info("Reading Graph table from: {}", inputResultPath);

		final String inputRelationPath = inputPath + "/relation";
		log.info("Reading relation table from: {}", inputRelationPath);

		Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class);
		relation.createOrReplaceTempView("relation");

		Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
		result.createOrReplaceTempView("result");

		final String outputResultPath = outputPath + "/" + resultType;
		log.info("writing output results to: {}", outputResultPath);

		String resultContextQuery = String
			.format(
				RESULT_CONTEXT_QUERY_TEMPLATE,
				getConstraintList(" co.id = '", communityIdList),
				getConstraintList(" relClass = '", allowedsemrel));

		Dataset<Row> result_context = spark.sql(resultContextQuery);
		result_context.createOrReplaceTempView("result_context");

		spark
			.sql(RESULT_COMMUNITY_LIST_QUERY)
			.as(Encoders.bean(ResultCommunityList.class))
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(outputResultPath);
	}

	public static List<String> getCommunityList(final String isLookupUrl) throws ISLookUpException {
		ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
		return isLookUp.quickSearchProfile(COMMUNITY_LIST_XQUERY);
	}

}
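A worked example of the two queries above, with invented identifiers: assuming allowedsemrels contains isSupplementedBy and a publication 50|pub::1 carrying the covid-19 context is related to a dataset 50|data::2, then:

	// RESULT_CONTEXT_QUERY_TEMPLATE yields one row per (source result, target) pair:
	//   resultId = 50|data::2, community_context = [covid-19]
	// RESULT_COMMUNITY_LIST_QUERY then groups those rows by resultId, so even when 50|data::2
	// is reached from several publications it ends up with a single prepared record, e.g.
	//   {"resultId": "50|data::2", "communityList": ["covid-19"]}
	// written under outputPath/<resultType> and later merged across result types by PrepareResultCommunitySetStep2.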
@ -0,0 +1,101 @@

package eu.dnetlib.dhp.resulttocommunityfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import scala.Tuple2;

public class PrepareResultCommunitySetStep2 {
	private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep2.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultCommunitySetStep2.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				if (isTest(parser)) {
					removeOutputDir(spark, outputPath);
				}
				mergeInfo(spark, inputPath, outputPath);
			});
	}

	private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {

		Dataset<ResultCommunityList> resultOrcidAssocCommunityList = readPath(
			spark, inputPath + "/publication", ResultCommunityList.class)
				.union(readPath(spark, inputPath + "/dataset", ResultCommunityList.class))
				.union(readPath(spark, inputPath + "/otherresearchproduct", ResultCommunityList.class))
				.union(readPath(spark, inputPath + "/software", ResultCommunityList.class));

		resultOrcidAssocCommunityList
			.toJavaRDD()
			.mapToPair(r -> new Tuple2<>(r.getResultId(), r))
			.reduceByKey(
				(a, b) -> {
					if (a == null) {
						return b;
					}
					if (b == null) {
						return a;
					}
					Set<String> community_set = new HashSet<>();
					a.getCommunityList().stream().forEach(aa -> community_set.add(aa));
					b
						.getCommunityList()
						.stream()
						.forEach(
							aa -> {
								if (!community_set.contains(aa)) {
									a.getCommunityList().add(aa);
									community_set.add(aa);
								}
							});
					return a;
				})
			.map(c -> c._2())
			.map(r -> OBJECT_MAPPER.writeValueAsString(r))
			.saveAsTextFile(outputPath, GzipCodec.class);
	}

}
@ -0,0 +1,143 @@

package eu.dnetlib.dhp.resulttocommunityfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

public class SparkResultToCommunityThroughSemRelJob {

	private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityThroughSemRelJob.class);

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				SparkResultToCommunityThroughSemRelJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String preparedInfoPath = parser.get("preparedInfoPath");
		log.info("preparedInfoPath: {}", preparedInfoPath);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final Boolean saveGraph = Optional
			.ofNullable(parser.get("saveGraph"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("saveGraph: {}", saveGraph);

		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				if (isTest(parser)) {
					removeOutputDir(spark, outputPath);
				}
				if (saveGraph) {
					execPropagation(
						spark, inputPath, outputPath, preparedInfoPath, resultClazz);
				}
			});
	}

	private static <R extends Result> void execPropagation(
		SparkSession spark,
		String inputPath,
		String outputPath,
		String preparedInfoPath,
		Class<R> resultClazz) {

		Dataset<ResultCommunityList> possibleUpdates = readPath(spark, preparedInfoPath, ResultCommunityList.class);
		Dataset<R> result = readPath(spark, inputPath, resultClazz);

		result
			.joinWith(
				possibleUpdates,
				result.col("id").equalTo(possibleUpdates.col("resultId")),
				"left_outer")
			.map(contextUpdaterFn(), Encoders.bean(resultClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() {
		return (MapFunction<Tuple2<R, ResultCommunityList>, R>) value -> {
			R ret = value._1();
			Optional<ResultCommunityList> rcl = Optional.ofNullable(value._2());
			if (rcl.isPresent()) {
				Set<String> context_set = new HashSet<>();
				ret.getContext().stream().forEach(c -> context_set.add(c.getId()));
				List<Context> contextList = rcl
					.get()
					.getCommunityList()
					.stream()
					.map(
						c -> {
							if (!context_set.contains(c)) {
								Context newContext = new Context();
								newContext.setId(c);
								newContext
									.setDataInfo(
										Arrays
											.asList(
												getDataInfo(
													PROPAGATION_DATA_INFO_TYPE,
													PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID,
													PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME)));
								return newContext;
							}
							return null;
						})
					.filter(Objects::nonNull)
					.collect(Collectors.toList());
				Result r = new Result();
				r.setId(ret.getId());
				r.setContext(contextList);
				ret.mergeFrom(r);
			}

			return ret;
		};
	}

}
@ -0,0 +1,26 @@

package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;

import java.io.Serializable;

public class DatasourceOrganization implements Serializable {

	private String datasourceId;
	private String organizationId;

	public String getDatasourceId() {
		return datasourceId;
	}

	public void setDatasourceId(String datasourceId) {
		this.datasourceId = datasourceId;
	}

	public String getOrganizationId() {
		return organizationId;
	}

	public void setOrganizationId(String organizationId) {
		this.organizationId = organizationId;
	}
}
@ -0,0 +1,122 @@

package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class PrepareResultInstRepoAssociation {

	private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultInstRepoAssociation.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath");
		log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath);

		final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
		log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				readNeededResources(spark, inputPath);
				prepareDatasourceOrganization(spark, datasourceOrganizationPath);
				prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
			});
	}

	private static void prepareAlreadyLinkedAssociation(
		SparkSession spark, String alreadyLinkedPath) {
		String query = "Select source resultId, collect_set(target) organizationSet "
			+ "from relation "
			+ "where datainfo.deletedbyinference = false "
			+ "and relClass = '"
			+ RELATION_RESULT_ORGANIZATION_REL_CLASS
			+ "' "
			+ "group by source";

		spark
			.sql(query)
			.as(Encoders.bean(ResultOrganizationSet.class))
			// TODO retry to stick with datasets
			.toJavaRDD()
			.map(r -> OBJECT_MAPPER.writeValueAsString(r))
			.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
	}

	private static void readNeededResources(SparkSession spark, String inputPath) {
		Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
		datasource.createOrReplaceTempView("datasource");

		Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
		relation.createOrReplaceTempView("relation");

		Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
		organization.createOrReplaceTempView("organization");
	}

	private static void prepareDatasourceOrganization(
		SparkSession spark, String datasourceOrganizationPath) {

		String query = "SELECT source datasourceId, target organizationId "
			+ "FROM ( SELECT id "
			+ "FROM datasource "
			+ "WHERE datasourcetype.classid = '"
			+ INSTITUTIONAL_REPO_TYPE
			+ "' "
			+ "AND datainfo.deletedbyinference = false ) d "
			+ "JOIN ( SELECT source, target "
			+ "FROM relation "
			+ "WHERE relclass = '"
			+ RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
			+ "' "
			+ "AND datainfo.deletedbyinference = false ) rel "
			+ "ON d.id = rel.source ";

		spark
			.sql(query)
			.as(Encoders.bean(DatasourceOrganization.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(datasourceOrganizationPath);
	}
}
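A hedged sketch of the two prepared inputs consumed by the propagation job further below, with invented identifiers (datasourceOrganizationPath is written as compressed JSON, alreadyLinkedPath as gzipped text lines):

	// datasourceOrganizationPath: one record per institutional repository and its organization
	//   {"datasourceId": "10|opendoar____::1234", "organizationId": "20|dedup_wf_001::abcd"}
	// alreadyLinkedPath: one record per result with the organizations it is already linked to
	//   {"resultId": "50|doi_________::5678", "organizationSet": ["20|dedup_wf_001::abcd"]}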
@ -0,0 +1,26 @@

package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;

import java.io.Serializable;
import java.util.ArrayList;

public class ResultOrganizationSet implements Serializable {
	private String resultId;
	private ArrayList<String> organizationSet;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String resultId) {
		this.resultId = resultId;
	}

	public ArrayList<String> getOrganizationSet() {
		return organizationSet;
	}

	public void setOrganizationSet(ArrayList<String> organizationSet) {
		this.organizationSet = organizationSet;
	}
}
@ -0,0 +1,193 @@

package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

public class SparkResultToOrganizationFromIstRepoJob {

	private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromIstRepoJob.class);

	private static final String RESULT_ORGANIZATIONSET_QUERY = "SELECT id resultId, collect_set(organizationId) organizationSet "
		+ "FROM ( SELECT id, organizationId "
		+ "FROM rels "
		+ "JOIN cfhb "
		+ " ON cf = datasourceId "
		+ "UNION ALL "
		+ "SELECT id , organizationId "
		+ "FROM rels "
		+ "JOIN cfhb "
		+ " ON hb = datasourceId ) tmp "
		+ "GROUP BY id";

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				SparkResultToOrganizationFromIstRepoJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String datasourceorganization = parser.get("datasourceOrganizationPath");
		log.info("datasourceOrganizationPath: {}", datasourceorganization);

		final String alreadylinked = parser.get("alreadyLinkedPath");
		log.info("alreadyLinkedPath: {}", alreadylinked);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final Boolean saveGraph = Optional
			.ofNullable(parser.get("saveGraph"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("saveGraph: {}", saveGraph);

		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				if (isTest(parser)) {
					removeOutputDir(spark, outputPath);
				}
				if (saveGraph)
					execPropagation(
						spark,
						datasourceorganization,
						alreadylinked,
						inputPath,
						outputPath,
						resultClazz);
			});
	}

	private static void execPropagation(
		SparkSession spark,
		String datasourceorganization,
		String alreadyLinkedPath,
		String inputPath,
		String outputPath,
		Class<? extends Result> clazz) {

		Dataset<DatasourceOrganization> ds_org = readPath(spark, datasourceorganization, DatasourceOrganization.class);

		Dataset<ResultOrganizationSet> potentialUpdates = getPotentialRelations(spark, inputPath, clazz, ds_org);

		Dataset<ResultOrganizationSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultOrganizationSet.class);

		potentialUpdates
			.joinWith(
				alreadyLinked,
				potentialUpdates.col("resultId").equalTo(alreadyLinked.col("resultId")),
				"left_outer")
			.flatMap(createRelationFn(), Encoders.bean(Relation.class))
			.write()
			.mode(SaveMode.Append)
			.option("compression", "gzip")
			.json(outputPath);
	}

	private static FlatMapFunction<Tuple2<ResultOrganizationSet, ResultOrganizationSet>, Relation> createRelationFn() {
		return (FlatMapFunction<Tuple2<ResultOrganizationSet, ResultOrganizationSet>, Relation>) value -> {
			List<Relation> new_relations = new ArrayList<>();
			ResultOrganizationSet potential_update = value._1();
			Optional<ResultOrganizationSet> already_linked = Optional.ofNullable(value._2());
			List<String> organization_list = potential_update.getOrganizationSet();
			if (already_linked.isPresent()) {
				already_linked
					.get()
					.getOrganizationSet()
					.stream()
					.forEach(
						rId -> {
							if (organization_list.contains(rId)) {
								organization_list.remove(rId);
							}
						});
			}
			String resultId = potential_update.getResultId();
			organization_list
				.stream()
				.forEach(
					orgId -> {
						new_relations
							.add(
								getRelation(
									orgId,
									resultId,
									RELATION_ORGANIZATION_RESULT_REL_CLASS,
									RELATION_RESULTORGANIZATION_REL_TYPE,
									RELATION_RESULTORGANIZATION_SUBREL_TYPE,
									PROPAGATION_DATA_INFO_TYPE,
									PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
									PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
						new_relations
							.add(
								getRelation(
									resultId,
									orgId,
									RELATION_RESULT_ORGANIZATION_REL_CLASS,
									RELATION_RESULTORGANIZATION_REL_TYPE,
									RELATION_RESULTORGANIZATION_SUBREL_TYPE,
									PROPAGATION_DATA_INFO_TYPE,
									PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
									PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
					});
			return new_relations.iterator();
		};
	}

	private static <R extends Result> Dataset<ResultOrganizationSet> getPotentialRelations(
		SparkSession spark,
		String inputPath,
		Class<R> resultClazz,
		Dataset<DatasourceOrganization> ds_org) {

		Dataset<R> result = readPath(spark, inputPath, resultClazz);
		result.createOrReplaceTempView("result");
		createCfHbforResult(spark);

		ds_org.createOrReplaceTempView("rels");

		return spark
			.sql(RESULT_ORGANIZATIONSET_QUERY)
			.as(Encoders.bean(ResultOrganizationSet.class));
	}

}
@ -0,0 +1,51 @@
[
	{
		"paramName":"is",
		"paramLongName":"isLookUpUrl",
		"paramDescription": "URL of the isLookUp Service",
		"paramRequired": true
	},
	{
		"paramName":"s",
		"paramLongName":"sourcePath",
		"paramDescription": "the path of the sequential file to read",
		"paramRequired": true
	},
	{
		"paramName": "pm",
		"paramLongName":"pathMap",
		"paramDescription": "the json path associated to each selection field",
		"paramRequired": true
	},
	{
		"paramName":"tn",
		"paramLongName":"resultTableName",
		"paramDescription": "the name of the result table we are currently working on",
		"paramRequired": true
	},
	{
		"paramName": "out",
		"paramLongName": "outputPath",
		"paramDescription": "the path used to store temporary output files",
		"paramRequired": true
	},
	{
		"paramName": "ssm",
		"paramLongName": "isSparkSessionManaged",
		"paramDescription": "true if the spark session is managed, false otherwise",
		"paramRequired": false
	},
	{
		"paramName": "test",
		"paramLongName": "isTest",
		"paramDescription": "true if the job is executed in test mode, false otherwise",
		"paramRequired": false
	},
	{
		"paramName": "tg",
		"paramLongName": "taggingConf",
		"paramDescription": "the community tagging configuration, mainly used for testing",
		"paramRequired": false
	}
]
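The long parameter names declared above are the ones read back through ArgumentApplicationParser in the jobs; a hedged sketch of how a test might drive the bulk tagging job with them (all paths and the pathMap content are placeholders, and passing the long names with a single leading dash is assumed to be accepted by the parser):

	SparkBulkTagJob
		.main(
			new String[] {
				"-isSparkSessionManaged", Boolean.FALSE.toString(),
				"-sourcePath", "/tmp/graph/publication",
				"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
				"-outputPath", "/tmp/bulktag/publication",
				"-pathMap", "{\"author\": \"$['author'][*]['fullname']\"}",
				"-isLookUpUrl", "http://localhost:8280/is/services/isLookUp"
			});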
@ -0,0 +1,54 @@
<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarnRM</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://nameservice1</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
	<property>
		<name>hive_metastore_uris</name>
		<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
	</property>
	<property>
		<name>spark2YarnHistoryServerAddress</name>
		<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
	</property>
	<property>
		<name>spark2ExtraListeners</name>
		<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
	</property>
	<property>
		<name>spark2SqlQueryExecutionListeners</name>
		<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
	</property>
	<property>
		<name>sparkExecutorNumber</name>
		<value>4</value>
	</property>
	<property>
		<name>spark2EventLogDir</name>
		<value>/user/spark/spark2ApplicationHistory</value>
	</property>
	<property>
		<name>sparkDriverMemory</name>
		<value>15G</value>
	</property>
	<property>
		<name>sparkExecutorMemory</name>
		<value>6G</value>
	</property>
	<property>
		<name>sparkExecutorCores</name>
		<value>1</value>
	</property>
</configuration>
@ -0,0 +1,216 @@
<workflow-app name="bulk_tagging" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>sourcePath</name>
			<description>the source path</description>
		</property>
		<property>
			<name>isLookUpUrl</name>
			<description>the isLookup service endpoint</description>
		</property>
		<property>
			<name>pathMap</name>
			<description>the json path associated to each selection field</description>
		</property>
		<property>
			<name>outputPath</name>
			<description>the output path</description>
		</property>
	</parameters>

	<start to="reset_outputpath"/>

	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="reset_outputpath">
		<fs>
			<delete path="${outputPath}"/>
			<mkdir path="${outputPath}"/>
		</fs>
		<ok to="copy_entities"/>
		<error to="Kill"/>
	</action>

	<fork name="copy_entities">
		<path start="copy_relation"/>
		<path start="copy_organization"/>
		<path start="copy_projects"/>
		<path start="copy_datasources"/>
	</fork>

	<action name="copy_relation">
		<distcp xmlns="uri:oozie:distcp-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<arg>${nameNode}/${sourcePath}/relation</arg>
			<arg>${nameNode}/${outputPath}/relation</arg>
		</distcp>
		<ok to="copy_wait"/>
		<error to="Kill"/>
	</action>

	<action name="copy_organization">
		<distcp xmlns="uri:oozie:distcp-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<arg>${nameNode}/${sourcePath}/organization</arg>
			<arg>${nameNode}/${outputPath}/organization</arg>
		</distcp>
		<ok to="copy_wait"/>
		<error to="Kill"/>
	</action>

	<action name="copy_projects">
		<distcp xmlns="uri:oozie:distcp-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<arg>${nameNode}/${sourcePath}/project</arg>
			<arg>${nameNode}/${outputPath}/project</arg>
		</distcp>
		<ok to="copy_wait"/>
		<error to="Kill"/>
	</action>

	<action name="copy_datasources">
		<distcp xmlns="uri:oozie:distcp-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<arg>${nameNode}/${sourcePath}/datasource</arg>
			<arg>${nameNode}/${outputPath}/datasource</arg>
		</distcp>
		<ok to="copy_wait"/>
		<error to="Kill"/>
	</action>

	<join name="copy_wait" to="fork_exec_bulktag"/>

	<fork name="fork_exec_bulktag">
		<path start="join_bulktag_publication"/>
		<path start="join_bulktag_dataset"/>
		<path start="join_bulktag_otherresearchproduct"/>
		<path start="join_bulktag_software"/>
	</fork>

	<action name="join_bulktag_publication">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<master>yarn-cluster</master>
			<mode>cluster</mode>
			<name>bulkTagging-publication</name>
			<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
			<jar>dhp-enrichment-${projectVersion}.jar</jar>
			<spark-opts>
				--num-executors=${sparkExecutorNumber}
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
			</spark-opts>
			<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
			<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
			<arg>--pathMap</arg><arg>${pathMap}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
		</spark>
		<ok to="wait"/>
		<error to="Kill"/>
	</action>

	<action name="join_bulktag_dataset">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<master>yarn-cluster</master>
			<mode>cluster</mode>
			<name>bulkTagging-dataset</name>
			<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
			<jar>dhp-enrichment-${projectVersion}.jar</jar>
			<spark-opts>
				--num-executors=${sparkExecutorNumber}
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
			</spark-opts>
			<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
			<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
			<arg>--pathMap</arg><arg>${pathMap}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
		</spark>
		<ok to="wait"/>
		<error to="Kill"/>
	</action>

	<action name="join_bulktag_otherresearchproduct">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<master>yarn-cluster</master>
			<mode>cluster</mode>
			<name>bulkTagging-orp</name>
			<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
			<jar>dhp-enrichment-${projectVersion}.jar</jar>
			<spark-opts>
				--num-executors=${sparkExecutorNumber}
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
			</spark-opts>
			<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
			<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
			<arg>--pathMap</arg><arg>${pathMap}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
		</spark>
		<ok to="wait"/>
		<error to="Kill"/>
	</action>

	<action name="join_bulktag_software">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<master>yarn-cluster</master>
			<mode>cluster</mode>
			<name>bulkTagging-software</name>
			<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
			<jar>dhp-enrichment-${projectVersion}.jar</jar>
			<spark-opts>
				--num-executors=${sparkExecutorNumber}
				--executor-memory=${sparkExecutorMemory}
				--executor-cores=${sparkExecutorCores}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
			</spark-opts>
			<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
			<arg>--outputPath</arg><arg>${outputPath}/software</arg>
			<arg>--pathMap</arg><arg>${pathMap}</arg>
			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
		</spark>
		<ok to="wait"/>
		<error to="Kill"/>
	</action>

	<join name="wait" to="End"/>

	<end name="End"/>

</workflow-app>
@ -0,0 +1,44 @@
[
	{
		"paramName":"s",
		"paramLongName":"sourcePath",
		"paramDescription": "the path of the sequential file to read",
		"paramRequired": true
	},
	{
		"paramName":"h",
		"paramLongName":"hive_metastore_uris",
		"paramDescription": "the hive metastore uris",
		"paramRequired": false
	},
	{
		"paramName":"sg",
		"paramLongName":"saveGraph",
		"paramDescription": "true if the new version of the graph must be saved",
		"paramRequired": false
	},
	{
		"paramName":"tn",
		"paramLongName":"resultTableName",
		"paramDescription": "the name of the result table we are currently working on",
		"paramRequired": true
	},
	{
		"paramName": "out",
		"paramLongName": "outputPath",
		"paramDescription": "the path used to store temporary output files",
		"paramRequired": true
	},
	{
		"paramName": "p",
		"paramLongName": "preparedInfoPath",
		"paramDescription": "the path where prepared info have been stored",
		"paramRequired": false
	},
	{
		"paramName": "ssm",
		"paramLongName": "isSparkSessionManaged",
		"paramDescription": "true if the spark session is managed, false otherwise",
		"paramRequired": false
	}
]
@@ -0,0 +1,38 @@
[
    {
        "paramName":"s",
        "paramLongName":"sourcePath",
        "paramDescription": "the path of the sequencial file to read",
        "paramRequired": true
    },
    {
        "paramName":"h",
        "paramLongName":"hive_metastore_uris",
        "paramDescription": "the hive metastore uris",
        "paramRequired": true
    },
    {
        "paramName": "out",
        "paramLongName": "outputPath",
        "paramDescription": "the path used to store temporary output files",
        "paramRequired": true
    },
    {
        "paramName": "w",
        "paramLongName": "whitelist",
        "paramDescription": "the datasource having a type different from the allowed ones but that we want to add anyway",
        "paramRequired": true
    },
    {
        "paramName": "at",
        "paramLongName": "allowedtypes",
        "paramDescription": "the allowed datasource types for country propagation",
        "paramRequired": true
    },
    {
        "paramName": "ssm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "true if the spark session is managed, false otherwise",
        "paramRequired": false
    }
]
@@ -0,0 +1,38 @@
[
    {
        "paramName":"s",
        "paramLongName":"sourcePath",
        "paramDescription": "the path of the sequencial file to read",
        "paramRequired": true
    },
    {
        "paramName":"out",
        "paramLongName":"outputPath",
        "paramDescription": "the output path",
        "paramRequired": true
    },
    {
        "paramName":"h",
        "paramLongName":"hive_metastore_uris",
        "paramDescription": "the hive metastore uris",
        "paramRequired": true
    },
    {
        "paramName":"tn",
        "paramLongName":"resultTableName",
        "paramDescription": "the name of the result table we are currently working on",
        "paramRequired": true
    },
    {
        "paramName": "p",
        "paramLongName": "preparedInfoPath",
        "paramDescription": "the path where prepared info have been stored",
        "paramRequired": true
    },
    {
        "paramName": "ssm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "true if the spark session is managed, false otherwise",
        "paramRequired": false
    }
]
@@ -0,0 +1,58 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>15G</value>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>6G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>1</value>
    </property>
    <property>
        <name>spark2MaxExecutors</name>
        <value>50</value>
    </property>
</configuration>
@@ -0,0 +1,376 @@
<workflow-app name="country_propagation" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>whitelist</name>
            <description>the white list</description>
        </property>
        <property>
            <name>allowedtypes</name>
            <description>the allowed types</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the output path</description>
        </property>
    </parameters>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
        <ok to="copy_entities"/>
        <error to="Kill"/>
    </action>

    <fork name="copy_entities">
        <path start="copy_relation"/>
        <path start="copy_organization"/>
        <path start="copy_projects"/>
        <path start="copy_datasources"/>
    </fork>

    <action name="copy_relation">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/relation</arg>
            <arg>${nameNode}/${outputPath}/relation</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_organization">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/organization</arg>
            <arg>${nameNode}/${outputPath}/organization</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_projects">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/project</arg>
            <arg>${nameNode}/${outputPath}/project</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_datasources">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/datasource</arg>
            <arg>${nameNode}/${outputPath}/datasource</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <join name="copy_wait" to="prepare_datasource_country_association"/>

    <action name="prepare_datasource_country_association">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>PrepareDatasourceCountryAssociation</name>
            <class>eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--whitelist</arg><arg>${whitelist}</arg>
            <arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="fork_join_prepare_result_country"/>
        <error to="Kill"/>
    </action>

    <fork name="fork_join_prepare_result_country">
        <path start="join_prepareresult_publication"/>
        <path start="join_prepareresult_dataset"/>
        <path start="join_prepareresult_otherresearchproduct"/>
        <path start="join_prepareresult_software"/>
    </fork>

    <action name="join_prepareresult_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>prepareResultCountry-Publication</name>
            <class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <action name="join_prepareresult_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>prepareResultCountry-Dataset</name>
            <class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <action name="join_prepareresult_otherresearchproduct">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>prepareResultCountry-ORP</name>
            <class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <action name="join_prepareresult_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>prepareResultCountry-Software</name>
            <class>eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/software</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <join name="wait_prepare" to="fork_join_apply_country_propagation"/>

    <fork name="fork_join_apply_country_propagation">
        <path start="join_propagation_publication"/>
        <path start="join_propagation_dataset"/>
        <path start="join_propagation_otherresearchproduct"/>
        <path start="join_propagation_software"/>
    </fork>

    <action name="join_propagation_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>countryPropagationForPublications</name>
            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="join_propagation_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>countryPropagationForDataset</name>
            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="join_propagation_otherresearchproduct">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>countryPropagationForORP</name>
            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="join_propagation_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>countryPropagationForSoftware</name>
            <class>eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${outputPath}/software</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <join name="wait" to="End"/>

    <end name="End"/>

</workflow-app>
@@ -0,0 +1,50 @@
[
    {
        "paramName":"s",
        "paramLongName":"sourcePath",
        "paramDescription": "the path of the sequencial file to read",
        "paramRequired": true
    },
    {
        "paramName":"sg",
        "paramLongName":"saveGraph",
        "paramDescription": "true if the new version of the graph must be saved",
        "paramRequired": false
    },
    {
        "paramName":"h",
        "paramLongName":"hive_metastore_uris",
        "paramDescription": "the hive metastore uris",
        "paramRequired": true
    },
    {
        "paramName": "out",
        "paramLongName": "outputPath",
        "paramDescription": "the path used to store temporary output files",
        "paramRequired": true
    },
    {
        "paramName": "ssm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "true if the spark session is managed, false otherwise",
        "paramRequired": false
    },
    {
        "paramName":"tn",
        "paramLongName":"resultTableName",
        "paramDescription": "the name of the result table we are currently working on",
        "paramRequired": true
    },
    {
        "paramName":"pu",
        "paramLongName":"possibleUpdatesPath",
        "paramDescription": "the path the the association resultId orcid author list can be found",
        "paramRequired": true
    },
    {
        "paramName":"test",
        "paramLongName":"isTest",
        "paramDescription": "true if it is executing a test",
        "paramRequired": false
    }
]
@@ -0,0 +1,38 @@
[
    {
        "paramName":"s",
        "paramLongName":"sourcePath",
        "paramDescription": "the path of the sequencial file to read",
        "paramRequired": true
    },
    {
        "paramName":"as",
        "paramLongName":"allowedsemrels",
        "paramDescription": "the allowed sematinc relations for propagation",
        "paramRequired": true
    },
    {
        "paramName":"h",
        "paramLongName":"hive_metastore_uris",
        "paramDescription": "the hive metastore uris",
        "paramRequired": true
    },
    {
        "paramName": "out",
        "paramLongName": "outputPath",
        "paramDescription": "the path used to store temporary output files",
        "paramRequired": true
    },
    {
        "paramName": "ssm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "true if the spark session is managed, false otherwise",
        "paramRequired": false
    },
    {
        "paramName":"tn",
        "paramLongName":"resultTableName",
        "paramDescription": "the name of the result table we are currently working on",
        "paramRequired": true
    }
]
@@ -0,0 +1,20 @@
[
    {
        "paramName":"s",
        "paramLongName":"sourcePath",
        "paramDescription": "the path of the sequencial file to read",
        "paramRequired": true
    },
    {
        "paramName": "out",
        "paramLongName": "outputPath",
        "paramDescription": "the path used to store temporary output files",
        "paramRequired": true
    },
    {
        "paramName": "ssm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "true if the spark session is managed, false otherwise",
        "paramRequired": false
    }
]
@@ -0,0 +1,58 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>15G</value>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>6G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>1</value>
    </property>
    <property>
        <name>spark2MaxExecutors</name>
        <value>50</value>
    </property>
</configuration>
@@ -0,0 +1,372 @@
<workflow-app name="orcid_to_result_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>allowedsemrels</name>
            <description>the semantic relationships allowed for propagation</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the output path</description>
        </property>
    </parameters>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
        <ok to="copy_entities"/>
        <error to="Kill"/>
    </action>

    <fork name="copy_entities">
        <path start="copy_relation"/>
        <path start="copy_organization"/>
        <path start="copy_projects"/>
        <path start="copy_datasources"/>
    </fork>

    <action name="copy_relation">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/relation</arg>
            <arg>${nameNode}/${outputPath}/relation</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_organization">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/organization</arg>
            <arg>${nameNode}/${outputPath}/organization</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_projects">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/project</arg>
            <arg>${nameNode}/${outputPath}/project</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <action name="copy_datasources">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/datasource</arg>
            <arg>${nameNode}/${outputPath}/datasource</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>

    <join name="copy_wait" to="fork_prepare_assoc_step1"/>

    <fork name="fork_prepare_assoc_step1">
        <path start="join_prepare_publication"/>
        <path start="join_prepare_dataset"/>
        <path start="join_prepare_otherresearchproduct"/>
        <path start="join_prepare_software"/>
    </fork>

    <action name="join_prepare_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-PreparePhase1-Publications</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.sql.shuffle.partitions=3840
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="join_prepare_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-PreparePhase1-Dataset</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="join_prepare_otherresearchproduct">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-PreparePhase1-ORP</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <action name="join_prepare_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-PreparePhase1-Software</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
            <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
        </spark>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>

    <join name="wait" to="prepare_assoc_step2"/>

    <action name="prepare_assoc_step2">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-PreparePhase2</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
        </spark>
        <ok to="fork-join-exec-propagation"/>
        <error to="Kill"/>
    </action>

    <fork name="fork-join-exec-propagation">
        <path start="join_propagate_publication"/>
        <path start="join_propagate_dataset"/>
        <path start="join_propagate_otherresearchproduct"/>
        <path start="join_propagate_software"/>
    </fork>

    <action name="join_propagate_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-Publication</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
        </spark>
        <ok to="wait2"/>
        <error to="Kill"/>
    </action>

    <action name="join_propagate_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-Dataset</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
        </spark>
        <ok to="wait2"/>
        <error to="Kill"/>
    </action>

    <action name="join_propagate_otherresearchproduct">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-ORP</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
        </spark>
        <ok to="wait2"/>
        <error to="Kill"/>
    </action>

    <action name="join_propagate_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ORCIDPropagation-Software</name>
            <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${outputPath}/software</arg>
            <arg>--saveGraph</arg><arg>${saveGraph}</arg>
        </spark>
        <ok to="wait2"/>
        <error to="Kill"/>
    </action>

    <join name="wait2" to="End"/>

    <end name="End"/>

</workflow-app>
@@ -0,0 +1,33 @@
[
    {
        "paramName":"s",
        "paramLongName":"sourcePath",
        "paramDescription": "the path of the sequencial file to read",
        "paramRequired": true
    },
    {
        "paramName":"asr",
        "paramLongName":"allowedsemrels",
        "paramDescription": "the types of the allowed datasources. Split by ;",
        "paramRequired": true
    },
    {
        "paramName":"h",
        "paramLongName":"hive_metastore_uris",
        "paramDescription": "the hive metastore uris",
        "paramRequired": true
    },
    {
        "paramName":"pu",
        "paramLongName":"potentialUpdatePath",
        "paramDescription": "the path of the potential updates ",
        "paramRequired": true
    },
    {
        "paramName":"al",
        "paramLongName":"alreadyLinkedPath",
        "paramDescription": "the path of the already linked project result_set",
        "paramRequired": true
    }
]