1
0
Fork 0

Merge pull request 'master' (#11) from miriam.baglioni/dnet-hadoop:master into enrichment_wfs

This commit is contained in:
Claudio Atzori 2020-05-11 15:14:56 +02:00
commit c403971c2f
156 changed files with 13879 additions and 74 deletions

View File

@ -83,6 +83,10 @@
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.raw.common; package eu.dnetlib.dhp.common;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class); private static final Log log = LogFactory.getLog(DbClient.class);
private final Connection connection; private Connection connection;
public DbClient(final String address, final String login, final String password) { public DbClient(final String address, final String login, final String password) {

View File

@ -1,4 +1,3 @@
package eu.dnetlib.dhp.schema.common; package eu.dnetlib.dhp.schema.common;
import java.util.Map; import java.util.Map;
@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class ModelSupport { public class ModelSupport {
/** Defines the mapping between the actual entity type and the main entity type */ /** Defines the mapping between the actual entity type and the main entity type */
private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap(); private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
static { static {
entityMapping.put(EntityType.publication, MainEntityType.result); entityMapping.put(EntityType.publication, MainEntityType.result);
@ -53,6 +52,232 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class); oafTypes.put("relation", Relation.class);
} }
/**
 * Prefix string registered for each main entity type name
 * (judging by the field name, it is meant to be prepended to entity identifiers).
 */
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();

static {
    // { main entity type name, prefix }
    final String[][] prefixByType = {
        { "datasource", "10" },
        { "organization", "20" },
        { "project", "40" },
        { "result", "50" }
    };
    for (final String[] entry : prefixByType) {
        entityIdPrefix.put(entry[0], entry[1]);
    }
}
/**
 * Relation descriptors indexed by an encoded relation name. Every key visible here has the shape
 * {@code <relType>_<subRelType>_<relClass>}; the value carries the relation class, the class of the
 * inverse relation, and the relType / subRelType shared by the two directions.
 *
 * NOTE(review): the key {@code personPerson_coAuthorship_isCoauthorOf} is spelled with a lowercase
 * "a" while the relation class it maps to is {@code isCoAuthorOf}. The key is kept as-is because
 * lookups depend on its exact spelling; confirm which spelling the producers of the key use.
 */
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();

static {
    // person <-> result
    registerRelationInverse(
        "personResult_authorship_isAuthorOf", "personResult", "authorship", "isAuthorOf", "hasAuthor");
    registerRelationInverse(
        "personResult_authorship_hasAuthor", "personResult", "authorship", "hasAuthor", "isAuthorOf");

    // project <-> organization
    registerRelationInverse(
        "projectOrganization_participation_isParticipant", "projectOrganization", "participation",
        "isParticipant", "hasParticipant");
    registerRelationInverse(
        "projectOrganization_participation_hasParticipant", "projectOrganization", "participation",
        "hasParticipant", "isParticipant");

    // result <-> organization
    registerRelationInverse(
        "resultOrganization_affiliation_hasAuthorInstitution", "resultOrganization", "affiliation",
        "hasAuthorInstitution", "isAuthorInstitutionOf");
    registerRelationInverse(
        "resultOrganization_affiliation_isAuthorInstitutionOf", "resultOrganization", "affiliation",
        "isAuthorInstitutionOf", "hasAuthorInstitution");

    // organization <-> organization
    registerRelationInverse(
        "organizationOrganization_dedup_merges", "organizationOrganization", "dedup",
        "merges", "isMergedIn");
    registerRelationInverse(
        "organizationOrganization_dedup_isMergedIn", "organizationOrganization", "dedup",
        "isMergedIn", "merges");
    registerRelationInverse(
        "organizationOrganization_dedupSimilarity_isSimilarTo", "organizationOrganization", "dedupSimilarity",
        "isSimilarTo", "isSimilarTo");

    // result <-> project
    registerRelationInverse(
        "resultProject_outcome_isProducedBy", "resultProject", "outcome", "isProducedBy", "produces");
    registerRelationInverse(
        "resultProject_outcome_produces", "resultProject", "outcome", "produces", "isProducedBy");

    // project <-> person
    registerRelationInverse(
        "projectPerson_contactPerson_isContact", "projectPerson", "contactPerson",
        "isContact", "hasContact");
    // Fixed: this entry used to be registered with relType "personPerson" and subRelType
    // "coAuthorship" (copy-paste slip), contradicting both its key and its inverse entry above.
    registerRelationInverse(
        "projectPerson_contactPerson_hasContact", "projectPerson", "contactPerson",
        "hasContact", "isContact");

    // person <-> person
    registerRelationInverse(
        "personPerson_coAuthorship_isCoauthorOf", "personPerson", "coAuthorship",
        "isCoAuthorOf", "isCoAuthorOf");
    registerRelationInverse(
        "personPerson_dedup_merges", "personPerson", "dedup", "merges", "isMergedIn");
    registerRelationInverse(
        "personPerson_dedup_isMergedIn", "personPerson", "dedup", "isMergedIn", "merges");
    registerRelationInverse(
        "personPerson_dedupSimilarity_isSimilarTo", "personPerson", "dedupSimilarity",
        "isSimilarTo", "isSimilarTo");

    // datasource <-> organization
    registerRelationInverse(
        "datasourceOrganization_provision_isProvidedBy", "datasourceOrganization", "provision",
        "isProvidedBy", "provides");
    registerRelationInverse(
        "datasourceOrganization_provision_provides", "datasourceOrganization", "provision",
        "provides", "isProvidedBy");

    // result <-> result
    // (the isAmongTopNSimilarDocuments entry was previously put twice with identical values;
    // it is now registered once)
    registerRelationInverse(
        "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultResult", "similarity",
        "hasAmongTopNSimilarDocuments", "isAmongTopNSimilarDocuments");
    registerRelationInverse(
        "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult", "similarity",
        "isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
    registerRelationInverse(
        "resultResult_relationship_isRelatedTo", "resultResult", "relationship",
        "isRelatedTo", "isRelatedTo");
    registerRelationInverse(
        "resultResult_supplement_isSupplementTo", "resultResult", "supplement",
        "isSupplementTo", "isSupplementedBy");
    registerRelationInverse(
        "resultResult_supplement_isSupplementedBy", "resultResult", "supplement",
        "isSupplementedBy", "isSupplementTo");
    registerRelationInverse(
        "resultResult_part_isPartOf", "resultResult", "part", "isPartOf", "hasPart");
    registerRelationInverse(
        "resultResult_part_hasPart", "resultResult", "part", "hasPart", "isPartOf");
    registerRelationInverse(
        "resultResult_dedup_merges", "resultResult", "dedup", "merges", "isMergedIn");
    registerRelationInverse(
        "resultResult_dedup_isMergedIn", "resultResult", "dedup", "isMergedIn", "merges");
    registerRelationInverse(
        "resultResult_dedupSimilarity_isSimilarTo", "resultResult", "dedupSimilarity",
        "isSimilarTo", "isSimilarTo");
}

/** Stores under {@code key} a descriptor built from the given relType, subRelType, relation class and inverse class. */
private static void registerRelationInverse(
    final String key,
    final String relType,
    final String subRelType,
    final String relation,
    final String inverse) {
    relationInverseMap
        .put(
            key,
            new RelationInverse()
                .setRelation(relation)
                .setInverse(inverse)
                .setRelType(relType)
                .setSubReltype(subRelType));
}
private static final String schemeTemplate = "dnet:%s_%s_relations"; private static final String schemeTemplate = "dnet:%s_%s_relations";
private ModelSupport() { private ModelSupport() {
@ -68,7 +293,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y * @return True if X is a subclass of Y
*/ */
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
X subClazzObject, Y superClazzObject) { X subClazzObject, Y superClazzObject) {
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
} }
@ -82,7 +307,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y * @return True if X is a subclass of Y
*/ */
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
X subClazzObject, Class<Y> superClazz) { X subClazzObject, Class<Y> superClazz) {
return isSubClass(subClazzObject.getClass(), superClazz); return isSubClass(subClazzObject.getClass(), superClazz);
} }
@ -96,7 +321,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y * @return True if X is a subclass of Y
*/ */
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
Class<X> subClazz, Class<Y> superClazz) { Class<X> subClazz, Class<Y> superClazz) {
return superClazz.isAssignableFrom(subClazz); return superClazz.isAssignableFrom(subClazz);
} }
@ -108,32 +333,32 @@ public class ModelSupport {
*/ */
public static <T extends Oaf> Class<T>[] getOafModelClasses() { public static <T extends Oaf> Class<T>[] getOafModelClasses() {
return new Class[] { return new Class[] {
Author.class, Author.class,
Context.class, Context.class,
Country.class, Country.class,
DataInfo.class, DataInfo.class,
Dataset.class, Dataset.class,
Datasource.class, Datasource.class,
ExternalReference.class, ExternalReference.class,
ExtraInfo.class, ExtraInfo.class,
Field.class, Field.class,
GeoLocation.class, GeoLocation.class,
Instance.class, Instance.class,
Journal.class, Journal.class,
KeyValue.class, KeyValue.class,
Oaf.class, Oaf.class,
OafEntity.class, OafEntity.class,
OAIProvenance.class, OAIProvenance.class,
Organization.class, Organization.class,
OriginDescription.class, OriginDescription.class,
OtherResearchProduct.class, OtherResearchProduct.class,
Project.class, Project.class,
Publication.class, Publication.class,
Qualifier.class, Qualifier.class,
Relation.class, Relation.class,
Result.class, Result.class,
Software.class, Software.class,
StructuredProperty.class StructuredProperty.class
}; };
} }
@ -147,10 +372,10 @@ public class ModelSupport {
public static String getScheme(final String sourceType, final String targetType) { public static String getScheme(final String sourceType, final String targetType) {
return String return String
.format( .format(
schemeTemplate, schemeTemplate,
entityMapping.get(EntityType.valueOf(sourceType)).name(), entityMapping.get(EntityType.valueOf(sourceType)).name(),
entityMapping.get(EntityType.valueOf(targetType)).name()); entityMapping.get(EntityType.valueOf(targetType)).name());
} }
public static <T extends Oaf> Function<T, String> idFn() { public static <T extends Oaf> Function<T, String> idFn() {
@ -165,38 +390,38 @@ public class ModelSupport {
private static <T extends Oaf> String idFnForRelation(T t) { private static <T extends Oaf> String idFnForRelation(T t) {
Relation r = (Relation) t; Relation r = (Relation) t;
return Optional return Optional
.ofNullable(r.getSource()) .ofNullable(r.getSource())
.map( .map(
source -> Optional source -> Optional
.ofNullable(r.getTarget()) .ofNullable(r.getTarget())
.map( .map(
target -> Optional target -> Optional
.ofNullable(r.getRelType()) .ofNullable(r.getRelType())
.map( .map(
relType -> Optional relType -> Optional
.ofNullable(r.getSubRelType()) .ofNullable(r.getSubRelType())
.map( .map(
subRelType -> Optional subRelType -> Optional
.ofNullable(r.getRelClass()) .ofNullable(r.getRelClass())
.map( .map(
relClass -> String relClass -> String
.join( .join(
source, source,
target, target,
relType, relType,
subRelType, subRelType,
relClass)) relClass))
.orElse( .orElse(
String String
.join( .join(
source, source,
target, target,
relType, relType,
subRelType))) subRelType)))
.orElse(String.join(source, target, relType))) .orElse(String.join(source, target, relType)))
.orElse(String.join(source, target))) .orElse(String.join(source, target)))
.orElse(source)) .orElse(source))
.orElse(null); .orElse(null);
} }
private static <T extends Oaf> String idFnForOafEntity(T t) { private static <T extends Oaf> String idFnForOafEntity(T t) {

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.common;
/**
 * Mutable holder describing a relation class together with the class of its inverse and the
 * relType / subRelType the pair belongs to. Setters return {@code this} so calls can be chained.
 */
public class RelationInverse {

    private String relation;
    private String inverse;
    private String relType;
    private String subReltype;

    /** @return the relation class name, or {@code null} if never set */
    public String getRelation() {
        return this.relation;
    }

    /**
     * @param relation the relation class name
     * @return this instance, for chaining
     */
    public RelationInverse setRelation(String relation) {
        this.relation = relation;
        return this;
    }

    /** @return the inverse relation class name, or {@code null} if never set */
    public String getInverse() {
        return this.inverse;
    }

    /**
     * @param inverse the inverse relation class name
     * @return this instance, for chaining
     */
    public RelationInverse setInverse(String inverse) {
        this.inverse = inverse;
        return this;
    }

    /** @return the relation type, or {@code null} if never set */
    public String getRelType() {
        return this.relType;
    }

    /**
     * @param relType the relation type
     * @return this instance, for chaining
     */
    public RelationInverse setRelType(String relType) {
        this.relType = relType;
        return this;
    }

    /** @return the relation sub-type, or {@code null} if never set */
    public String getSubReltype() {
        return this.subReltype;
    }

    /**
     * @param subReltype the relation sub-type
     * @return this instance, for chaining
     */
    public RelationInverse setSubReltype(String subReltype) {
        this.subReltype = subReltype;
        return this;
    }
}

View File

@ -2,8 +2,7 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.*;
import java.util.Objects;
public class Author implements Serializable { public class Author implements Serializable {
@ -86,4 +85,5 @@ public class Author implements Serializable {
public int hashCode() { public int hashCode() {
return Objects.hash(fullname, name, surname, rank, pid, affiliation); return Objects.hash(fullname, name, surname, rank, pid, affiliation);
} }
} }

View File

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-blacklist</artifactId>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.blacklist;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Spark job that reads relations serialized as JSON lines, keeps those whose relclass is 'merges'
 * and that are not flagged deletedbyinference, and writes them as gzip-compressed JSON.
 */
public class PrepareMergedRelationJob {

    private static final Logger log = LoggerFactory.getLogger(PrepareMergedRelationJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Entry point: parses the arguments described by input_preparerelation_parameters.json and runs
     * the selection inside a Hive-enabled Spark session.
     *
     * @param args command line arguments
     * @throws Exception if argument parsing or the Spark job fails
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    PrepareMergedRelationJob.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/blacklist/input_preparerelation_parameters.json")));
        parser.parseArgument(args);

        // the session is considered managed unless the flag is explicitly provided
        final String managedFlag = parser.get("isSparkSessionManaged");
        final Boolean isSparkSessionManaged = managedFlag == null ? Boolean.TRUE : Boolean.valueOf(managedFlag);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {} ", outputPath);

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> selectMergesRelations(spark, inputPath, outputPath));
    }

    /** Filters the relations found at inputPath and overwrites outputPath with the selection. */
    private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
        readRelations(spark, inputPath)
            .filter("relclass = 'merges' and datainfo.deletedbyinference=false")
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Reads a text dataset and deserializes every line into a {@link Relation}.
     *
     * @param spark the active Spark session
     * @param inputPath location of the text files, one JSON document per line
     * @return the dataset of parsed relations
     */
    public static org.apache.spark.sql.Dataset<Relation> readRelations(
        SparkSession spark, String inputPath) {
        final MapFunction<String, Relation> parseLine = line -> OBJECT_MAPPER.readValue(line, Relation.class);
        return spark
            .read()
            .textFile(inputPath)
            .map(parseLine, Encoders.bean(Relation.class));
    }
}

View File

@ -0,0 +1,142 @@
package eu.dnetlib.dhp.blacklist;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.common.RelationInverse;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Reads the accepted blacklist entries from a database and writes them to a file on HDFS,
 * one JSON-serialized {@link Relation} per line. Each entry produces two relations: the
 * blacklisted one and its inverse.
 */
public class ReadBlacklistFromDB implements Closeable {

    private static final Log log = LogFactory.getLog(ReadBlacklistFromDB.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final static String query = "SELECT source_type, unnest(original_source_objects) as source, " +
        "target_type, unnest(original_target_objects) as target, " +
        "relationship FROM blacklist WHERE status = 'ACCEPTED'";

    private final DbClient dbClient;
    private final Configuration conf;
    private final BufferedWriter writer;

    /**
     * Entry point: parses the arguments described by blacklist_parameters.json and dumps the
     * blacklist to {@code <hdfsPath>/blacklist}.
     *
     * @param args command line arguments
     * @throws Exception if argument parsing, the database access or the HDFS write fails
     */
    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    ReadBlacklistFromDB.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/blacklist/blacklist_parameters.json")));

        parser.parseArgument(args);

        final String dbUrl = parser.get("postgresUrl");
        final String dbUser = parser.get("postgresUser");
        final String dbPassword = parser.get("postgresPassword");
        final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
        final String hdfsNameNode = parser.get("hdfsNameNode");

        try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
            dbPassword)) {

            log.info("Processing blacklist...");
            rbl.execute(query, rbl::processBlacklistEntry);
        }
    }

    /**
     * Runs the query and writes every relation the producer derives from each result row.
     *
     * @param sql the query to execute
     * @param producer maps the current result row to the relations to be written
     * @throws Exception if the query execution fails
     */
    public void execute(final String sql, final Function<ResultSet, List<Relation>> producer)
        throws Exception {

        final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(this::writeRelation);

        dbClient.processResults(sql, consumer);
    }

    /**
     * Builds the direct and the inverse relation described by the current result row.
     *
     * @param rs result set positioned on a row exposing the columns source_type, source,
     *           target_type, target and relationship
     * @return a two-element list: the direct relation followed by its inverse
     * @throws RuntimeException wrapping any failure, including an entity type or relationship
     *                          that is not registered in {@link ModelSupport}
     */
    public List<Relation> processBlacklistEntry(ResultSet rs) {
        try {
            final String sourceType = rs.getString("source_type");
            final String targetType = rs.getString("target_type");
            final String sourcePrefix = ModelSupport.entityIdPrefix.get(sourceType);
            final String targetPrefix = ModelSupport.entityIdPrefix.get(targetType);
            // an unregistered type used to silently yield identifiers starting with "null|"
            if (sourcePrefix == null || targetPrefix == null) {
                throw new IllegalArgumentException(
                    "unknown entity type in blacklist entry: source_type=" + sourceType
                        + ", target_type=" + targetType);
            }

            final String sourceId = sourcePrefix + "|" + rs.getString("source");
            final String targetId = targetPrefix + "|" + rs.getString("target");

            final String encoding = rs.getString("relationship");
            final RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
            // an unregistered relationship used to fail with a context-free NullPointerException
            if (ri == null) {
                throw new IllegalArgumentException("unknown relationship in blacklist entry: " + encoding);
            }

            final Relation direct = new Relation();
            direct.setSource(sourceId);
            direct.setTarget(targetId);
            direct.setRelClass(ri.getRelation());
            direct.setRelType(ri.getRelType());
            direct.setSubRelType(ri.getSubReltype());

            // the inverse relation swaps the endpoints and uses the inverse relation class
            final Relation inverse = new Relation();
            inverse.setSource(targetId);
            inverse.setTarget(sourceId);
            inverse.setRelClass(ri.getInverse());
            inverse.setRelType(ri.getRelType());
            inverse.setSubRelType(ri.getSubReltype());

            return Arrays.asList(direct, inverse);

        } catch (final Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Closes the HDFS writer and the database client.
     *
     * @throws IOException if either resource fails to close
     */
    @Override
    public void close() throws IOException {
        // The writer goes first so buffered records are flushed; the finally block guarantees
        // the database client is released even when closing the writer fails.
        try {
            writer.close();
        } finally {
            dbClient.close();
        }
    }

    /**
     * Opens the database client and the HDFS output file; the file is appended to when it
     * already exists and created otherwise.
     *
     * @param hdfsPath path of the output file
     * @param hdfsNameNode value set as fs.defaultFS
     * @param dbUrl database url
     * @param dbUser database user
     * @param dbPassword database password
     * @throws Exception if the database client or the HDFS stream cannot be opened
     */
    public ReadBlacklistFromDB(
        final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
        throws Exception {

        this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
        this.conf = new Configuration();
        this.conf.set("fs.defaultFS", hdfsNameNode);

        final BufferedWriter hdfsWriter;
        try {
            final FileSystem fileSystem = FileSystem.get(this.conf);
            final Path hdfsWritePath = new Path(hdfsPath);
            final FSDataOutputStream fsDataOutputStream = fileSystem.exists(hdfsWritePath)
                ? fileSystem.append(hdfsWritePath)
                : fileSystem.create(hdfsWritePath);
            hdfsWriter = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
        } catch (final Exception e) {
            // do not leak the already opened database client when the HDFS side fails
            try {
                dbClient.close();
            } catch (final Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.writer = hdfsWriter;
    }

    /**
     * Serializes the relation as JSON and appends it to the output as a single line.
     *
     * @param r the relation to write
     * @throws RuntimeException wrapping any serialization or I/O failure
     */
    protected void writeRelation(final Relation r) {
        try {
            writer.write(OBJECT_MAPPER.writeValueAsString(r));
            writer.newLine();
        } catch (final Exception e) {
            throw new RuntimeException(e);
        }
    }
}

View File

@ -0,0 +1,151 @@
package eu.dnetlib.dhp.blacklist;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
 * Spark job that removes from a set of relations those that equal a blacklisted relation.
 * Before the comparison, the source and target of every blacklist entry that match the target of
 * a merges relation are replaced by that relation's source.
 */
public class SparkRemoveBlacklistedRelationJob {

    private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Entry point: parses the arguments described by sparkblacklist_parameters.json and runs the
     * removal inside a Spark session.
     *
     * @param args command line arguments
     * @throws Exception if argument parsing or the Spark job fails
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkRemoveBlacklistedRelationJob.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/blacklist/sparkblacklist_parameters.json")));
        parser.parseArgument(args);

        // the session is considered managed unless the flag is explicitly provided
        final String managedFlag = parser.get("isSparkSessionManaged");
        final Boolean isSparkSessionManaged = managedFlag == null ? Boolean.TRUE : Boolean.valueOf(managedFlag);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath {}: ", outputPath);

        final String blacklistPath = parser.get("hdfsPath");
        log.info("blacklistPath {}: ", blacklistPath);

        final String mergesPath = parser.get("mergesPath");
        log.info("mergesPath {}: ", mergesPath);

        runWithSparkSession(
            new SparkConf(),
            isSparkSessionManaged,
            spark -> removeBlacklistedRelations(spark, blacklistPath, inputPath, outputPath, mergesPath));
    }

    /**
     * Rewrites the blacklist endpoints through the merges relations, stores the rewritten blacklist
     * under {@code <blacklistPath>/deduped}, then writes to outputPath every input relation that is
     * not equal to a blacklist entry having the same source and target.
     */
    private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
        String outputPath, String mergesPath) {

        final Dataset<Relation> blacklist = readRelations(spark, blacklistPath + "/blacklist");
        final Dataset<Relation> graphRelations = readRelations(spark, inputPath);
        final Dataset<Relation> merges = readRelations(spark, mergesPath);

        log.info("InputRelationCount: {}", graphRelations.count());

        // step 1: a blacklisted source that is the target of a merges relation becomes that relation's source
        final Dataset<Relation> sourceResolved = blacklist
            .joinWith(
                merges, blacklist.col("source").equalTo(merges.col("target")),
                "left_outer")
            .map((MapFunction<Tuple2<Relation, Relation>, Relation>) pair -> {
                final Relation entry = pair._1();
                final Relation merge = pair._2();
                if (merge != null) {
                    entry.setSource(merge.getSource());
                }
                return entry;
            }, Encoders.bean(Relation.class));

        // step 2: same substitution for the blacklisted target
        final Dataset<Relation> resolvedBlacklist = sourceResolved
            .joinWith(
                merges, sourceResolved.col("target").equalTo(merges.col("target")),
                "left_outer")
            .map((MapFunction<Tuple2<Relation, Relation>, Relation>) pair -> {
                final Relation entry = pair._1();
                final Relation merge = pair._2();
                if (merge != null) {
                    entry.setTarget(merge.getSource());
                }
                return entry;
            }, Encoders.bean(Relation.class));

        resolvedBlacklist
            .write()
            .mode(SaveMode.Overwrite)
            .json(blacklistPath + "/deduped");

        // step 3: pair each input relation with the blacklist entries sharing its endpoints and
        // drop it when it equals one of them (null marks the relation for removal)
        graphRelations
            .joinWith(
                resolvedBlacklist,
                graphRelations
                    .col("source")
                    .equalTo(resolvedBlacklist.col("source"))
                    .and(
                        graphRelations
                            .col("target")
                            .equalTo(resolvedBlacklist.col("target"))),
                "left_outer")
            .map((MapFunction<Tuple2<Relation, Relation>, Relation>) pair -> {
                final Relation candidate = pair._1();
                final Relation blacklisted = pair._2();
                final boolean toRemove = blacklisted != null && candidate.equals(blacklisted);
                return toRemove ? null : candidate;
            }, Encoders.bean(Relation.class))
            .filter(Objects::nonNull)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Reads a text dataset and deserializes every line into a {@link Relation}.
     *
     * @param spark the active Spark session
     * @param inputPath location of the text files, one JSON document per line
     * @return the dataset of parsed relations
     */
    public static org.apache.spark.sql.Dataset<Relation> readRelations(
        SparkSession spark, String inputPath) {
        final MapFunction<String, Relation> parseLine = line -> OBJECT_MAPPER.readValue(line, Relation.class);
        return spark
            .read()
            .textFile(inputPath)
            .map(parseLine, Encoders.bean(Relation.class));
    }
}

View File

@ -0,0 +1,32 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the name node on hdfs",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path to the graph used to remove the relations ",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path where to store the temporary result ",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed",
"paramRequired": false
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramRequired": true
}
]

View File

@ -0,0 +1,54 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,98 @@
<workflow-app name="blacklisting" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>postgresURL</name>
<description>the url of the postgres server to query</description>
</property>
<property>
<name>postgresUser</name>
<description>the username to access the postgres db</description>
</property>
<property>
<name>postgresPassword</name>
<description>the postgres password</description>
</property>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
</parameters>
<start to="reset-outputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset-outputpath">
<fs>
<delete path='${workingDir}/blacklist'/>
</fs>
<ok to="read_blacklist"/>
<error to="Kill"/>
</action>
<action name="read_blacklist">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
</java>
<ok to="prepare_merged_relation"/>
<error to="Kill"/>
</action>
<action name="prepare_merged_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareMergedRelation</name>
<class>eu.dnetlib.dhp.blacklist.PrepareMergedRelationJob</class>
<jar>dhp-blacklist-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
</spark>
<ok to="apply_blacklist"/>
<error to="Kill"/>
</action>
<action name="apply_blacklist">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ApplyBlacklist</name>
<class>eu.dnetlib.dhp.blacklist.SparkRemoveBlacklistedRelationJob</class>
<jar>dhp-blacklist-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
<arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,33 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where the sequential file is stored",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the path to the graph used to remove the relations ",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path where to store the temporary result ",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed",
"paramRequired": false
},
{
"paramName": "m",
"paramLongName": "mergesPath",
"paramDescription": "the path where the merges relations are stored",
"paramRequired": true
}
]

View File

@ -0,0 +1,166 @@
package eu.dnetlib.dhp.blacklist;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Tests for {@link SparkRemoveBlacklistedRelationJob}, executed against a local Spark session.
 *
 * <p>Each test runs the job on a set of classpath fixtures, reads back the JSON relations the job
 * wrote and checks how many (and which) relations survived. Every test writes to its own output
 * directory under the shared temporary working directory, so the tests do not depend on each other
 * or on the job overwriting a previous run's output.
 */
public class BlackListTest {

    private static final Logger log = LoggerFactory.getLogger(BlackListTest.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Classpath folder that holds the relation, blacklist and merge-relation fixtures. */
    private static final String FIXTURE_ROOT = "/eu/dnetlib/dhp/blacklist/";

    private static SparkSession spark;

    private static Path workingDir;

    /**
     * Creates the temporary working directory and a local Spark session whose warehouse
     * directories live inside it.
     *
     * @throws IOException if the temporary directory cannot be created
     */
    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files.createTempDirectory(BlackListTest.class.getSimpleName());
        log.info("using work dir {}", workingDir);

        SparkConf conf = new SparkConf();
        conf.setAppName(BlackListTest.class.getSimpleName());
        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

        spark = SparkSession
            .builder()
            .appName(BlackListTest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    /**
     * Stops Spark and then removes the working directory.
     *
     * <p>Spark is stopped first because its warehouse directory lives inside the working
     * directory; the {@code finally} block guarantees the directory is still cleaned up when
     * stopping the session fails.
     *
     * @throws IOException if the working directory cannot be deleted
     */
    @AfterAll
    public static void afterAll() throws IOException {
        try {
            spark.stop();
        } finally {
            FileUtils.deleteDirectory(workingDir.toFile());
        }
    }

    /** With the {@code relationsNoRemoval} fixture all 13 relations are expected in the output. */
    @Test
    public void noRemoveTest() throws Exception {
        JavaRDD<Relation> tmp = applyBlacklist("relationsNoRemoval", "mergesRel", "noRemove");

        Assertions.assertEquals(13, tmp.count());
    }

    /**
     * With the {@code relationsOneRemoval} fixture 12 relations are expected in the output, and
     * neither the blacklisted source/target pair nor any {@code hasParticipant} relation may be
     * among them.
     */
    @Test
    public void removeNoMergeMatchTest() throws Exception {
        JavaRDD<Relation> tmp = applyBlacklist("relationsOneRemoval", "mergesRel", "removeNoMergeMatch");

        Assertions.assertEquals(12, tmp.count());

        org.apache.spark.sql.Dataset<Relation> verificationDataset = spark
            .createDataset(tmp.rdd(), Encoders.bean(Relation.class));

        Assertions
            .assertEquals(
                0, verificationDataset
                    .filter(
                        "source = '40|corda__h2020::5161f53ab205d803c36b4c888fe7deef' and " +
                            "target = '20|dedup_wf_001::157af406bc653aa4d9749318b644de43'")
                    .count());

        Assertions.assertEquals(0, verificationDataset.filter("relClass = 'hasParticipant'").count());
    }

    /**
     * With the {@code relationOneRemovalWithMatch} relations and the {@code mergesRelOneMerge}
     * merge relations, 12 relations are expected in the output and all of them must have
     * relClass {@code isProvidedBy}.
     */
    @Test
    public void removeMergeMatchTest() throws Exception {
        JavaRDD<Relation> tmp = applyBlacklist("relationOneRemovalWithMatch", "mergesRelOneMerge", "removeMergeMatch");

        Assertions.assertEquals(12, tmp.count());

        org.apache.spark.sql.Dataset<Relation> verificationDataset = spark
            .createDataset(tmp.rdd(), Encoders.bean(Relation.class));

        Assertions.assertEquals(12, verificationDataset.filter("relClass = 'isProvidedBy'").count());
    }

    /**
     * Runs {@link SparkRemoveBlacklistedRelationJob} on the given fixtures and reads back its output.
     *
     * @param relationsFixture name of the fixture folder/file with the input relations
     * @param mergesFixture name of the fixture folder/file with the merge relations
     * @param outputName name of the per-test directory (below the working directory) the job writes to
     * @return the relations the job wrote, parsed from one JSON document per line
     * @throws Exception whatever the job itself throws
     */
    private static JavaRDD<Relation> applyBlacklist(
        final String relationsFixture, final String mergesFixture, final String outputName) throws Exception {

        final String outputPath = workingDir.resolve(outputName).resolve("relation").toString();

        SparkRemoveBlacklistedRelationJob
            .main(
                new String[] {
                    "-isSparkSessionManaged",
                    Boolean.FALSE.toString(),
                    "-sourcePath",
                    fixturePath(relationsFixture),
                    "-outputPath",
                    outputPath,
                    "-hdfsPath",
                    fixturePath("blacklist"),
                    "-mergesPath",
                    fixturePath(mergesFixture),
                });

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        return sc
            .textFile(outputPath)
            .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
    }

    /**
     * Resolves a fixture name to its path on the test classpath.
     *
     * @param name fixture name relative to {@link #FIXTURE_ROOT}
     * @return the path of the classpath resource
     * @throws IllegalStateException if the fixture is not on the classpath
     */
    private static String fixturePath(final String name) {
        final java.net.URL resource = BlackListTest.class.getResource(FIXTURE_ROOT + name);
        if (resource == null) {
            // fail with the fixture name instead of a bare NullPointerException
            throw new IllegalStateException("missing test fixture: " + FIXTURE_ROOT + name);
        }
        return resource.getPath();
    }
}

View File

@ -0,0 +1,20 @@
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"hasParticipant","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"projectOrganization","subRelType":"participation","relClass":"isParticipant","source":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43","target":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::a727cc288016db7132ef9a799aa83350","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::a727cc288016db7132ef9a799aa83350"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od________18::062cf091d5c7a7d730001c34177042e3","target":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::9826e8aba3e8f3a2a46545cf341838a8","target":"50|od________18::062cf091d5c7a7d730001c34177042e3"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|webcrawl____::68c191d9b972b47a235d311804c7f6f5"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::1b172ab34639e7935e2357119cf20830","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|od_______908::1b172ab34639e7935e2357119cf20830"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423","target":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::c3d0b21615b129cd7395e24f9cf6bb64","target":"50|doajarticles::cb234c66327d29ba5f13c0db7a4cf423"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od______1146::e2fafaba636a14e408f02c6ea26acb0e"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|od_______908::b8e86ed982ff331764456e1f0759ed9c"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021","target":"40|corda_______::35695c955c51f0bb39482ce5477047c7"}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"relType":"resultProject","subRelType":"outcome","relClass":"produces","source":"40|corda_______::35695c955c51f0bb39482ce5477047c7","target":"50|webcrawl____::c472bf5944ce0495844d505d43d1c021"}

View File

@ -0,0 +1,14 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______177::67c1385662f2fa0bde310bec15427646"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}

View File

@ -0,0 +1,14 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______908::a47e1c3ede9a21ee5278a2e5c338d69b"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|doiboost____::8ea1631fa01adcbafc3f384b6a2c5cc3"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"dedup","target":"50|od_______166::67c1385662f2fa0bde310bec15427646"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|od_______935::0bf7d9c5d2e1115a31cd558f83ae8ee3"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::d2a45f0f42d8dd66c364219924c37c3f","subRelType":"dedup","target":"50|doajarticles::d695fee344cb367a38ce6622f5fe9430"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|od_______267::14e952745e4b602ff72919aa881b8945"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|doiboost____::43941031067842fac90604d37b2a4149"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|core________::5c62b3ad05a23de613636607a424899d"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::325525b879b17d8059a4e58def2f7225","subRelType":"dedup","target":"50|scholexplore::1c467aabe5108ee840a4500d58f19328"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doiboost____::0ff61beeb12c49ed8a826b2b1883c8f8"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::cd4fc0411683ee762d50bfd30436f95b","subRelType":"dedup","target":"50|doajarticles::fca1220426b10ccb8b46e4967b353f37"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|doiboost____::dd96d41ee05d4022065c9d3096e1023a"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|erc_________::7d9a29ff323c2fe0ecf037189bf71b8e"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"relClass":"merges","source":"50|dedup_wf_001::a87be24a4fcac13c9298f0cc3acfc6ea","subRelType":"dedup","target":"50|webcrawl____::fdd999801fec35d4c6190bcabb850c52"}

View File

@ -0,0 +1,13 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProducedBy","relType":"resultProject","source":"50|dedup_wf_001::3668b9bd87532a085dc7a18ce2086715","subRelType":"outcome","target":"40|corda_______::189ff31d637eaaeaf4d3584dc490b1cf"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}

View File

@ -0,0 +1,13 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::018cb61ed43c01704decc66183ce5d60","subRelType":"provision","target":"20|dedup_wf_001::b9fff055ce5efacecbe4ef918c127f86"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}

View File

@ -0,0 +1,13 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda__h2020::5161f53ab205d803c36b4c888fe7deef","subRelType":"participation","target":"20|dedup_wf_001::157af406bc653aa4d9749318b644de43"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::05c5c5d2920c01e194d6760f24885a82","subRelType":"provision","target":"20|dedup_wf_001::cd07e6c09886e59266fdbae32a9e319b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::07022f119fc3d1cb66fe84494aa820c9","subRelType":"provision","target":"20|doajarticles::c48e93350cf5287e604ef631f2a67087"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::09ea05970871d7d923caaa8d2416d10e","subRelType":"provision","target":"20|doajarticles::cd84ef51b2de10ff01d679e4e662594e"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0af8c8ecf992b177304eb8f5d978100b","subRelType":"provision","target":"20|doajarticles::4eb6845b141d2b36ed94918d2bf382f0"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0b48a767b2b8d323ccdcaf2d40642746","subRelType":"provision","target":"20|doajarticles::46a4942a4707e842611278cfa26789f9"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0dd9573adad4e5cc322612f6e9ecc8ce","subRelType":"provision","target":"20|doajarticles::e34526e7b5efb700ddb4544700234a0b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0e870ab15f231d954306bb658fc747a2","subRelType":"provision","target":"20|doajarticles::ccac83f4f971e3cdc194ddb796850a37"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f19a2d702e31d451e9806f701584c97","subRelType":"provision","target":"20|doajarticles::7a02d64772c121c1f10c17f8e2bf2aec"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::0f4b6db6c02966acbfb60af527728c85","subRelType":"provision","target":"20|doajarticles::acd96b3bd87b176202b8ea494c318b21"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::11f65dc66da7ef1b1f3a3e59199e4d70","subRelType":"provision","target":"20|dedup_wf_001::6132363e7458cbd7c22aa284c7df1307"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::149fd06e8702d94aa648641fd1602284","subRelType":"provision","target":"20|dedup_wf_001::35ae35032078bc33bc92e2b0f2ecfa17"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::15581a45537ceb854bbddee49b2942b4","subRelType":"provision","target":"20|doajarticles::0b25b0ce56da469cc8ad74c7d83c16a3"}

View File

@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-bulktag</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.11</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.3-jre</version>
</dependency>
<dependency>
<groupId>io.github.classgraph</groupId>
<artifactId>classgraph</artifactId>
<version>4.8.71</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,7 @@
# sandboxName will be generated when not provided explicitly
sandboxName=${sandboxName}
sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName}
workingDir=${sandboxDir}/working_dir
oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
oozieTopWfApplicationPath = ${oozie.wf.application.path}

View File

@ -0,0 +1,120 @@
package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.community.*;
import eu.dnetlib.dhp.schema.oaf.*;
/**
 * Spark job that applies the community bulk tagging to a set of results.
 *
 * <p>The job reads results serialized as one JSON document per line, passes each of them through
 * {@code ResultTagger.enrichContextCriteria} together with the community configuration and the
 * path map, and writes the outcome as gzip-compressed JSON.
 */
public class SparkBulkTagJob {

    private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob.class);

    public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Classpath location of the JSON file describing the parameters accepted by {@link #main}. */
    private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json";

    /**
     * Job entry point.
     *
     * <p>Parameters read from the command line: {@code isSparkSessionManaged} (default
     * {@code true}), {@code isTest} (default {@code false}), {@code sourcePath},
     * {@code outputPath}, {@code pathMap}, {@code resultTableName}, {@code saveGraph} (default
     * {@code true}; only logged here), {@code taggingConf} and {@code isLookUpUrl}.
     *
     * <p>When {@code isTest} is set the community configuration is parsed from
     * {@code taggingConf}; otherwise it is obtained through
     * {@code QueryInformationSystem.getCommunityConfiguration} using {@code isLookUpUrl}.
     *
     * @param args the command line arguments
     * @throws Exception if the parameters cannot be loaded or parsed, if {@code resultTableName}
     *     cannot be loaded or is not a subclass of {@code Result}, or if the Spark job fails
     */
    public static void main(String[] args) throws Exception {
        // Read the parameter description with an explicit charset and release the stream afterwards.
        final String jsonConfiguration;
        try (java.io.InputStream in = SparkBulkTagJob.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
            jsonConfiguration = IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        Boolean isTest = Optional
            .ofNullable(parser.get("isTest"))
            .map(Boolean::valueOf)
            .orElse(Boolean.FALSE);
        log.info("isTest: {} ", isTest);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class);
        log.info("pathMap: {}", new Gson().toJson(protoMappingParams));

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final Boolean saveGraph = Optional
            .ofNullable(parser.get("saveGraph"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("saveGraph: {}", saveGraph);

        // asSubclass performs a checked conversion: a class that is not a Result is rejected here
        // with a ClassCastException instead of failing later inside the Spark pipeline.
        Class<? extends Result> resultClazz = Class.forName(resultClassName).asSubclass(Result.class);

        SparkConf conf = new SparkConf();

        String taggingConf = parser.get("taggingConf");
        final CommunityConfiguration cc = isTest
            ? CommunityConfigurationFactory.newInstance(taggingConf)
            : QueryInformationSystem.getCommunityConfiguration(parser.get("isLookUpUrl"));

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
            });
    }

    /**
     * Tags every result found under {@code inputPath} and stores the tagged results under
     * {@code outputPath}, overwriting any existing content, as gzip-compressed JSON.
     *
     * @param spark the Spark session to use
     * @param inputPath location of the results to tag, one JSON document per line
     * @param outputPath location where the tagged results are written
     * @param protoMappingParams path map handed over to the tagger
     * @param resultClazz concrete result type to read and write
     * @param communityConfiguration community configuration handed over to the tagger
     * @param <R> the concrete result type
     */
    private static <R extends Result> void execBulkTag(
        SparkSession spark,
        String inputPath,
        String outputPath,
        ProtoMap protoMappingParams,
        Class<R> resultClazz,
        CommunityConfiguration communityConfiguration) {

        ResultTagger resultTagger = new ResultTagger();
        readPath(spark, inputPath, resultClazz)
            .map(
                (MapFunction<R, R>) value -> resultTagger
                    .enrichContextCriteria(
                        value, communityConfiguration, protoMappingParams),
                Encoders.bean(resultClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Reads a text file and deserializes each line into an instance of {@code clazz} by means of
     * {@link #OBJECT_MAPPER}.
     *
     * @param spark the Spark session to use
     * @param inputPath location of the text file(s) to read
     * @param clazz type each line is deserialized to
     * @param <R> the type of the dataset elements
     * @return the dataset of deserialized objects, encoded as beans
     */
    public static <R> Dataset<R> readPath(
        SparkSession spark, String inputPath, Class<R> clazz) {
        return spark
            .read()
            .textFile(inputPath)
            .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
    }
}

View File

@ -0,0 +1,65 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.gson.Gson;
/**
 * Configuration of a single community: its identifier together with the subjects, the
 * datasources and the Zenodo communities associated with it.
 *
 * <p>Created by miriam on 01/08/2018.
 */
public class Community implements Serializable {

    private static final Log log = LogFactory.getLog(Community.class);

    private String id;
    private List<String> subjects = new ArrayList<>();
    private List<Datasource> datasources = new ArrayList<>();
    private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();

    /**
     * Serializes this community with a default {@link Gson} instance.
     *
     * @return the JSON representation of this community
     */
    public String toJson() {
        return new Gson().toJson(this);
    }

    /**
     * Tells whether the community defines at least one tagging criterion.
     *
     * @return {@code true} when at least one of subjects, datasources and Zenodo communities is
     *     not empty
     */
    public boolean isValid() {
        final boolean nothingConfigured = getSubjects().isEmpty()
            && getDatasources().isEmpty()
            && getZenodoCommunities().isEmpty();
        return !nothingConfigured;
    }

    /** @return the community identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the community identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the subjects associated with the community */
    public List<String> getSubjects() {
        return this.subjects;
    }

    /** @param subjects the subjects associated with the community */
    public void setSubjects(List<String> subjects) {
        this.subjects = subjects;
    }

    /** @return the datasources associated with the community */
    public List<Datasource> getDatasources() {
        return this.datasources;
    }

    /** @param datasources the datasources associated with the community */
    public void setDatasources(List<Datasource> datasources) {
        this.datasources = datasources;
    }

    /** @return the Zenodo communities associated with the community */
    public List<ZenodoCommunity> getZenodoCommunities() {
        return this.zenodoCommunities;
    }

    /** @param zenodoCommunities the Zenodo communities associated with the community */
    public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
        this.zenodoCommunities = zenodoCommunities;
    }
}

View File

@ -0,0 +1,196 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection;
/**
 * Community tagging configuration.
 *
 * <p>Holds the communities indexed by identifier plus three inverse lookup maps (subject,
 * datasource id and Zenodo community id, each pointing to the communities declaring it, paired
 * with the related selection constraints). The inverse maps are derived from the communities by
 * {@link #init()}.
 *
 * <p>Created by miriam on 02/08/2018.
 */
public class CommunityConfiguration implements Serializable {

    private static final Log log = LogFactory.getLog(CommunityConfiguration.class);

    private Map<String, Community> communities;

    // map subject -> communityid
    private Map<String, List<Pair<String, SelectionConstraints>>> subjectMap = new HashMap<>();
    // map datasourceid -> communityid
    private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>();
    // map zenodocommunityid -> communityid
    private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap = new HashMap<>();

    public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() {
        return subjectMap;
    }

    public void setSubjectMap(Map<String, List<Pair<String, SelectionConstraints>>> subjectMap) {
        this.subjectMap = subjectMap;
    }

    public Map<String, List<Pair<String, SelectionConstraints>>> getDatasourceMap() {
        return datasourceMap;
    }

    public void setDatasourceMap(
        Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap) {
        this.datasourceMap = datasourceMap;
    }

    public Map<String, List<Pair<String, SelectionConstraints>>> getZenodocommunityMap() {
        return zenodocommunityMap;
    }

    public void setZenodocommunityMap(
        Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap) {
        this.zenodocommunityMap = zenodocommunityMap;
    }

    /**
     * Creates the configuration for the given communities and builds the inverse maps.
     *
     * @param communities the communities, indexed by community identifier
     */
    CommunityConfiguration(final Map<String, Community> communities) {
        this.communities = communities;
        init();
    }

    /**
     * (Re)builds the three inverse maps from the current communities.
     *
     * <p>The maps are always rebuilt from scratch, which makes the method idempotent. This method
     * is invoked both by the constructor and after JSON deserialization; since {@link #toJson()}
     * serializes the inverse maps too, appending to already populated maps would register every
     * entry twice.
     */
    void init() {
        subjectMap = new HashMap<>();
        datasourceMap = new HashMap<>();
        zenodocommunityMap = new HashMap<>();

        for (Community c : getCommunities().values()) {
            final String id = c.getId();
            // subjects are indexed lower-cased and trimmed, with empty selection constraints
            for (String sbj : c.getSubjects()) {
                add(sbj.toLowerCase().trim(), new Pair<>(id, new SelectionConstraints()), subjectMap);
            }
            // datasources are indexed by their OpenAIRE identifier, as provided
            for (Datasource d : c.getDatasources()) {
                add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
            }
            // zenodo communities are indexed by their Zenodo identifier, as provided
            for (ZenodoCommunity zc : c.getZenodoCommunities()) {
                add(zc.getZenodoCommunityId(), new Pair<>(id, zc.getSelCriteria()), zenodocommunityMap);
            }
        }
    }

    /** Appends {@code value} to the list registered under {@code key}, creating the list on demand. */
    private void add(
        String key,
        Pair<String, SelectionConstraints> value,
        Map<String, List<Pair<String, SelectionConstraints>>> map) {
        map.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
    }

    /**
     * @param sbj the subject to look up
     * @return the communities registered for the subject, or {@code null} when there is none
     */
    public List<Pair<String, SelectionConstraints>> getCommunityForSubject(String sbj) {
        return subjectMap.get(sbj);
    }

    /**
     * @param dts the datasource identifier to look up
     * @return the communities registered for the datasource, or {@code null} when there is none
     */
    public List<Pair<String, SelectionConstraints>> getCommunityForDatasource(String dts) {
        return datasourceMap.get(dts);
    }

    /**
     * Returns the identifiers of the communities registered for the datasource whose selection
     * constraints are either missing or verified by {@code param}.
     *
     * @param dts the datasource identifier to look up
     * @param param the values the selection constraints are verified against
     * @return the matching community identifiers; empty when the datasource is unknown
     */
    public List<String> getCommunityForDatasource(
        final String dts, final Map<String, List<String>> param) {
        List<Pair<String, SelectionConstraints>> lp = datasourceMap.get(dts);
        if (lp == null) {
            return Lists.newArrayList();
        }
        return lp
            .stream()
            .filter(p -> p.getSnd() == null || ((SelectionConstraints) p.getSnd()).verifyCriteria(param))
            .map(p -> p.getFst())
            // a pair without community id never contributes to the result
            .filter(st -> st != null)
            .collect(Collectors.toList());
    }

    /**
     * @param zc the Zenodo community identifier to look up
     * @return the communities registered for the Zenodo community, or {@code null} when there is
     *     none
     */
    public List<Pair<String, SelectionConstraints>> getCommunityForZenodoCommunity(String zc) {
        return zenodocommunityMap.get(zc);
    }

    // NOTE(review): subject keys are stored lower-cased and trimmed but looked up as given, while
    // datasource and zenodo keys are stored as given but looked up lower-cased below - confirm
    // against the callers that this asymmetry is intended.

    /**
     * @param value the subject to look up
     * @return the identifiers of the communities registered for the subject, possibly empty
     */
    public List<String> getCommunityForSubjectValue(String value) {
        return getContextIds(subjectMap.get(value));
    }

    /**
     * @param value the datasource identifier to look up; it is lower-cased before the lookup
     * @return the identifiers of the communities registered for the datasource, possibly empty
     */
    public List<String> getCommunityForDatasourceValue(String value) {
        return getContextIds(datasourceMap.get(value.toLowerCase()));
    }

    /**
     * @param value the Zenodo community identifier to look up; it is lower-cased before the lookup
     * @return the identifiers of the communities registered for the Zenodo community, possibly
     *     empty
     */
    public List<String> getCommunityForZenodoCommunityValue(String value) {
        return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
    }

    /** Extracts the community identifiers from the pairs; a {@code null} list yields an empty result. */
    private List<String> getContextIds(List<Pair<String, SelectionConstraints>> list) {
        if (list != null) {
            return list.stream().map(p -> p.getFst()).collect(Collectors.toList());
        }
        return Lists.newArrayList();
    }

    public Map<String, Community> getCommunities() {
        return communities;
    }

    public void setCommunities(Map<String, Community> communities) {
        this.communities = communities;
    }

    /**
     * Serializes the configuration, inverse maps included, using a Gson instance on which
     * {@code InterfaceAdapter} is registered for the {@code Selection} type.
     *
     * @return the JSON representation of this configuration
     */
    public String toJson() {
        GsonBuilder builder = new GsonBuilder();
        builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
        Gson gson = builder.create();

        return gson.toJson(this);
    }

    /** @return the number of configured communities */
    public int size() {
        return communities.size();
    }

    /**
     * @param id the community identifier
     * @return the community with the given identifier, or {@code null} when unknown
     */
    public Community getCommunityById(String id) {
        return communities.get(id);
    }

    /** @return a new list holding all the configured communities */
    public List<Community> getCommunityList() {
        return Lists.newLinkedList(communities.values());
    }
}

View File

@ -0,0 +1,138 @@
package eu.dnetlib.dhp.community;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory;
/** Created by miriam on 03/08/2018. */
public class CommunityConfigurationFactory {
private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);
private static VerbResolver resolver = VerbResolverFactory.newInstance();
public static CommunityConfiguration newInstance(final String xml) throws DocumentException {
log.debug(String.format("parsing community configuration from:\n%s", xml));
final Document doc = new SAXReader().read(new StringReader(xml));
final Map<String, Community> communities = Maps.newHashMap();
for (final Object o : doc.selectNodes("//community")) {
final Node node = (Node) o;
final Community community = parseCommunity(node);
if (community.isValid()) {
communities.put(community.getId(), community);
}
}
log.info(String.format("loaded %s community configuration profiles", communities.size()));
log.debug(String.format("loaded community configuration:\n%s", communities.toString()));
return new CommunityConfiguration(communities);
}
public static CommunityConfiguration fromJson(final String json) {
GsonBuilder builder = new GsonBuilder();
builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
Gson gson = builder.create();
final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class);
log.info(String.format("loaded %s community configuration profiles", conf.size()));
conf.init();
log.info("created inverse maps");
return conf;
}
private static Community parseCommunity(final Node node) {
final Community c = new Community();
c.setId(node.valueOf("./@id"));
log.info(String.format("community id: %s", c.getId()));
c.setSubjects(parseSubjects(node));
c.setDatasources(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node));
return c;
}
private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList();
final List<Node> list = node.selectNodes("./subjects/subject");
for (Node n : list) {
log.debug("text of the node " + n.getText());
subjects.add(StringUtils.trim(n.getText()));
}
log.info("size of the subject list " + subjects.size());
return subjects;
}
private static List<Datasource> parseDatasources(final Node node) {
final List<Node> list = node.selectNodes("./datasources/datasource");
final List<Datasource> datasourceList = new ArrayList<>();
for (Node n : list) {
Datasource d = new Datasource();
d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
datasourceList.add(d);
}
log.info("size of the datasource list " + datasourceList.size());
return datasourceList;
}
/**
 * Builds the zenodo communities of a community node.
 *
 * <p>Every {@code ./zenodocommunities/zenodocommunity} child yields one entry (id and selection
 * criteria). When the node also has a non-blank {@code ./oacommunity} text, that value is
 * appended last as an additional entry without selection criteria.
 *
 * @param node the XML node describing the community
 * @return the list of zenodo communities
 */
private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
	final List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();

	final List<Node> zenodoNodes = node.selectNodes("./zenodocommunities/zenodocommunity");
	for (final Node zenodoNode : zenodoNodes) {
		final ZenodoCommunity zenodoCommunity = new ZenodoCommunity();
		zenodoCommunity.setZenodoCommunityId(zenodoNode.selectSingleNode("./zenodoid").getText());
		zenodoCommunity.setSelCriteria(zenodoNode.selectSingleNode("./selcriteria"));
		zenodoCommunities.add(zenodoCommunity);
	}

	final Node oaCommunityNode = node.selectSingleNode("./oacommunity");
	if (oaCommunityNode != null && StringUtils.isNotBlank(oaCommunityNode.getText())) {
		final ZenodoCommunity oaCommunity = new ZenodoCommunity();
		oaCommunity.setZenodoCommunityId(oaCommunityNode.getText());
		zenodoCommunities.add(oaCommunity);
	}

	log.info("size of the zenodo community list " + zenodoCommunities.size());
	return zenodoCommunities;
}
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
/**
 * A single selection constraint: a verb, the field it refers to and the reference value.
 *
 * <p>The {@link Selection} that actually evaluates the constraint is either set directly or
 * obtained from a {@link VerbResolver} using the verb and the value.
 */
public class Constraint implements Serializable {

	private String verb;
	private String field;
	private String value;
	private Selection selection;

	public Constraint() {
	}

	public String getVerb() {
		return this.verb;
	}

	public void setVerb(String verb) {
		this.verb = verb;
	}

	public String getField() {
		return this.field;
	}

	public void setField(String field) {
		this.field = field;
	}

	public String getValue() {
		return this.value;
	}

	public void setValue(String value) {
		this.value = value;
	}

	/** Sets the selection used by {@link #verifyCriteria(String)}. */
	public void setSelection(Selection sel) {
		this.selection = sel;
	}

	/** Asks the resolver for the selection matching this constraint's verb and value. */
	public void setSelection(VerbResolver resolver)
		throws InvocationTargetException, NoSuchMethodException, InstantiationException,
		IllegalAccessException {
		this.selection = resolver.getSelectionCriteria(this.verb, this.value);
	}

	/** Applies the configured selection to the given metadata value. */
	public boolean verifyCriteria(String metadata) {
		return this.selection.apply(metadata);
	}
}

View File

@ -0,0 +1,74 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Type;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
/**
 * A group of {@link Constraint}s evaluated in AND: the group is verified only when every
 * constraint is satisfied by at least one value of its field.
 *
 * <p>Created by miriam on 02/08/2018.
 */
public class Constraints implements Serializable {
	private static final Log log = LogFactory.getLog(Constraints.class);

	private List<Constraint> constraint;

	public Constraints() {
	}

	public List<Constraint> getConstraint() {
		return constraint;
	}

	public void setConstraint(List<Constraint> constraint) {
		this.constraint = constraint;
	}

	/**
	 * Replaces the constraint list with the one deserialized from the given JSON array.
	 *
	 * @param json a JSON array of constraints
	 */
	public void setSc(String json) {
		Type collectionType = new TypeToken<Collection<Constraint>>() {
		}.getType();
		constraint = new Gson().fromJson(json, collectionType);
	}

	/**
	 * Resolves the selection of every constraint through the given resolver. A constraint whose
	 * selection cannot be instantiated is reported in the log and left unresolved.
	 *
	 * @param resolver the resolver mapping verbs to selection implementations
	 */
	void setSelection(VerbResolver resolver) {
		for (Constraint st : constraint) {
			try {
				st.setSelection(resolver);
			} catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException
				| InstantiationException e) {
				// keep the cause in the log: the bare message is often null for reflective failures
				log
					.error(
						"unable to instantiate the selection for verb '" + st.getVerb()
							+ "' on field '" + st.getField() + "'",
						e);
			}
		}
	}

	/**
	 * Verifies the constraints in AND against the given values.
	 *
	 * @param param for each field name, the values to test
	 * @return true when every constraint is satisfied by at least one value of its field; false
	 *         otherwise, including when a constrained field has no entry in {@code param}
	 */
	public boolean verifyCriteria(final Map<String, List<String>> param) {
		for (Constraint sc : constraint) {
			final List<String> values = param.get(sc.getField());
			if (values == null) {
				// no values available for the constrained field: the constraint cannot hold
				return false;
			}
			boolean verified = false;
			for (String value : values) {
				if (sc.verifyCriteria(value.trim())) {
					verified = true;
					break;
				}
			}
			if (!verified) {
				return false;
			}
		}
		return true;
	}
}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Node;
import com.google.gson.Gson;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
/**
 * A datasource associated to a community, identified by its OpenAIRE id and optionally carrying
 * the selection constraints parsed from the community profile.
 *
 * <p>Created by miriam on 01/08/2018.
 */
public class Datasource implements Serializable {
	private static final Log log = LogFactory.getLog(Datasource.class);

	private String openaireId;

	private SelectionConstraints selectionConstraints;

	public SelectionConstraints getSelCriteria() {
		return selectionConstraints;
	}

	public SelectionConstraints getSelectionConstraints() {
		return selectionConstraints;
	}

	public void setSelectionConstraints(SelectionConstraints selectionConstraints) {
		this.selectionConstraints = selectionConstraints;
	}

	public void setSelCriteria(SelectionConstraints selCriteria) {
		this.selectionConstraints = selCriteria;
	}

	public String getOpenaireId() {
		return openaireId;
	}

	public void setOpenaireId(String openaireId) {
		this.openaireId = openaireId;
	}

	/** Parses the JSON selection constraints and resolves their verbs through the resolver. */
	private void setSelCriteria(String json, VerbResolver resolver) {
		log.info("Selection constraints for datasource = " + json);
		selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class);
		if (selectionConstraints != null) {
			selectionConstraints.setSelection(resolver);
		}
	}

	/**
	 * Sets the selection constraints from the text of the given node.
	 *
	 * <p>The constraints are left {@code null} when the node is missing, its text is blank, or the
	 * text cannot be turned into resolved constraints; only the last case is reported as a warning.
	 *
	 * @param n the node holding the JSON selection criteria, possibly {@code null}
	 * @param resolver the resolver mapping verbs to selection implementations
	 */
	public void setSelCriteria(Node n, VerbResolver resolver) {
		selectionConstraints = null;
		if (n == null) {
			return;
		}
		final String json = n.getText();
		if (json == null || json.trim().isEmpty()) {
			// no criteria configured: a regular situation, not an error
			return;
		}
		try {
			setSelCriteria(json, resolver);
		} catch (Exception e) {
			log.warn("not set selection criteria for datasource " + openaireId + ": " + json, e);
			selectionConstraints = null;
		}
	}
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import com.google.gson.Gson;
/**
 * A mutable, serializable pair of values.
 *
 * <p>Created by miriam on 03/08/2018.
 *
 * @param <A> type of the first element
 * @param <B> type of the second element
 */
public class Pair<A, B> implements Serializable {
	private A fst;
	private B snd;

	public Pair(A a, B b) {
		fst = a;
		snd = b;
	}

	public A getFst() {
		return fst;
	}

	/**
	 * Sets the first element.
	 *
	 * @return this pair, with its type parameters preserved so that calls can be chained safely
	 */
	public Pair<A, B> setFst(A fst) {
		this.fst = fst;
		return this;
	}

	public B getSnd() {
		return snd;
	}

	/**
	 * Sets the second element.
	 *
	 * @return this pair, with its type parameters preserved so that calls can be chained safely
	 */
	public Pair<A, B> setSnd(B snd) {
		this.snd = snd;
		return this;
	}

	/** Serializes this pair to JSON with Gson. */
	public String toJson() {
		return new Gson().toJson(this);
	}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.util.HashMap;
/** A string-to-string {@link HashMap} with its own named type. */
public class ProtoMap extends HashMap<String, String> implements Serializable {

	/** Creates an empty map. */
	public ProtoMap() {
	}
}

View File

@ -0,0 +1,65 @@
package eu.dnetlib.dhp.community;
import java.util.List;
import org.dom4j.DocumentException;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * Reads the community configuration from the information system: runs {@link #XQUERY} through
 * the lookup service and hands the wrapped result to {@link CommunityConfigurationFactory}.
 */
public class QueryInformationSystem {
	// XQuery returning one <community> element per context of type 'community' or 'ri'
	private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
		+ " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
		+ " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
		+ " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
		+ " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
		+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] "
		+ " return "
		+ " <community> "
		+ " { $x//CONFIGURATION/context/@id} "
		+ " <subjects> "
		+ " {for $y in tokenize($subj,',') "
		+ " return "
		+ " <subject>{$y}</subject>} "
		+ " </subjects> "
		+ " <datasources> "
		+ " {for $d in $datasources "
		+ " where $d/param[./@name='enabled']/text()='true' "
		+ " return "
		+ " <datasource> "
		+ " <openaireId> "
		+ " {$d//param[./@name='openaireId']/text()} "
		+ " </openaireId> "
		+ " <selcriteria> "
		+ " {$d/param[./@name='selcriteria']/text()} "
		+ " </selcriteria> "
		+ " </datasource> } "
		+ " </datasources> "
		+ " <zenodocommunities> "
		+ " {for $zc in $communities "
		+ " return "
		+ " <zenodocommunity> "
		+ " <zenodoid> "
		+ " {$zc/param[./@name='zenodoid']/text()} "
		+ " </zenodoid> "
		+ " <selcriteria> "
		+ " {$zc/param[./@name='selcriteria']/text()} "
		+ " </selcriteria> "
		+ " </zenodocommunity>} "
		+ " </zenodocommunities> "
		+ " </community>";

	/**
	 * Queries the lookup service and builds the community configuration from the answer.
	 *
	 * @param isLookupUrl address of the lookup service
	 * @return the configuration built from the profiles returned by the query
	 * @throws ISLookUpException if the lookup fails
	 * @throws DocumentException if the resulting XML cannot be parsed
	 */
	public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
		throws ISLookUpException, DocumentException {
		final ISLookUpService lookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
		final List<String> profiles = lookUpService.quickSearchProfile(XQUERY);
		// the single results are wrapped in one root element so that they form one document
		final String joinedProfiles = Joiner.on(" ").join(profiles);
		return CommunityConfigurationFactory.newInstance("<communities>" + joinedProfiles + "</communities>");
	}
}

View File

@ -0,0 +1,246 @@
package eu.dnetlib.dhp.community;
import static eu.dnetlib.dhp.community.TagginConstants.*;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*;
/** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable {
private String trust = "0.8";
private boolean clearContext(Result result) {
int tmp = result.getContext().size();
List<Context> clist = result
.getContext()
.stream()
.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
.collect(Collectors.toList());
result.setContext(clist);
return (tmp != clist.size());
}
private Map<String, List<String>> getParamMap(final Result result, Map<String, String> params) {
Map<String, List<String>> param = new HashMap<>();
String json = new Gson().toJson(result, Result.class);
DocumentContext jsonContext = JsonPath.parse(json);
if (params == null) {
params = new HashMap<>();
}
for (String key : params.keySet()) {
try {
param.put(key, jsonContext.read(params.get(key)));
} catch (com.jayway.jsonpath.PathNotFoundException e) {
param.put(key, new ArrayList<>());
// throw e;
}
}
return param;
}
public <R extends Result> R enrichContextCriteria(
final R result, final CommunityConfiguration conf, final Map<String, String> criteria) {
// }
// public Result enrichContextCriteria(final Result result, final CommunityConfiguration
// conf, final Map<String,String> criteria) {
final Map<String, List<String>> param = getParamMap(result, criteria);
// Verify if the entity is deletedbyinference. In case verify if to clean the context list
// from all the zenodo communities
if (result.getDataInfo().getDeletedbyinference()) {
clearContext(result);
return result;
}
// communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>();
// tagging for Subject
final Set<String> subjects = new HashSet<>();
Optional<List<StructuredProperty>> oresultsubj = Optional.ofNullable(result.getSubject());
if (oresultsubj.isPresent()) {
oresultsubj
.get()
.stream()
.map(subject -> subject.getValue())
.filter(StringUtils::isNotBlank)
.map(String::toLowerCase)
.map(String::trim)
.collect(Collectors.toCollection(HashSet::new))
.forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
}
communities.addAll(subjects);
// Tagging for datasource
final Set<String> datasources = new HashSet<>();
final Set<String> tmp = new HashSet<>();
Optional<List<Instance>> oresultinstance = Optional.ofNullable(result.getInstance());
if (oresultinstance.isPresent()) {
for (Instance i : oresultinstance.get()) {
tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
}
oresultinstance
.get()
.stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new))
.forEach(
dsId -> datasources
.addAll(
conf.getCommunityForDatasource(dsId, param)));
}
communities.addAll(datasources);
/* Tagging for Zenodo Communities */
final Set<String> czenodo = new HashSet<>();
Optional<List<Context>> oresultcontext = Optional.ofNullable(result.getContext());
if (oresultcontext.isPresent()) {
oresultcontext
.get()
.stream()
.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
.collect(Collectors.toList())
.forEach(
c -> czenodo
.addAll(
conf
.getCommunityForZenodoCommunityValue(
c
.getId()
.substring(
c.getId().lastIndexOf("/") + 1)
.trim())));
}
communities.addAll(czenodo);
clearContext(result);
/* Verify if there is something to bulktag */
if (communities.isEmpty()) {
return result;
}
result
.getContext()
.stream()
.map(
c -> {
if (communities.contains(c.getId())) {
Optional<List<DataInfo>> opt_dataInfoList = Optional.ofNullable(c.getDataInfo());
List<DataInfo> dataInfoList;
if (opt_dataInfoList.isPresent())
dataInfoList = opt_dataInfoList.get();
else {
dataInfoList = new ArrayList<>();
c.setDataInfo(dataInfoList);
}
if (subjects.contains(c.getId()))
dataInfoList
.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c.getId()))
dataInfoList
.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c.getId()))
dataInfoList
.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
}
return c;
})
.collect(Collectors.toList());
communities
.removeAll(
result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));
if (communities.isEmpty())
return result;
List<Context> toaddcontext = communities
.stream()
.map(
c -> {
Context context = new Context();
context.setId(c);
List<DataInfo> dataInfoList = new ArrayList<>();
if (subjects.contains(c))
dataInfoList
.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c))
dataInfoList
.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c))
dataInfoList
.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
context.setDataInfo(dataInfoList);
return context;
})
.collect(Collectors.toList());
result.getContext().addAll(toaddcontext);
return result;
}
public static DataInfo getDataInfo(
String inference_provenance, String inference_class_id, String inference_class_name) {
DataInfo di = new DataInfo();
di.setInferred(true);
di.setInferenceprovenance(inference_provenance);
di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
return di;
}
public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
Qualifier pa = new Qualifier();
pa.setClassid(inference_class_id);
pa.setClassname(inference_class_name);
pa.setSchemeid(DNET_SCHEMA_ID);
pa.setSchemename(DNET_SCHEMA_NAME);
return pa;
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.lang.reflect.Type;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
/**
 * A list of {@link Constraints} groups evaluated in OR: the selection is verified as soon as one
 * group is verified.
 */
public class SelectionConstraints implements Serializable {

	private List<Constraints> criteria;

	public SelectionConstraints() {
	}

	public List<Constraints> getCriteria() {
		return this.criteria;
	}

	public void setCriteria(List<Constraints> criteria) {
		this.criteria = criteria;
	}

	/** Replaces the criteria with the list deserialized from the given JSON array. */
	public void setSc(String json) {
		final Type listOfConstraints = new TypeToken<Collection<Constraints>>() {
		}.getType();
		this.criteria = new Gson().fromJson(json, listOfConstraints);
	}

	/**
	 * Verifies the criteria in OR.
	 *
	 * @param param for each field name, the values to test
	 * @return true if at least one group of constraints is verified
	 */
	public boolean verifyCriteria(final Map<String, List<String>> param) {
		return this.criteria.stream().anyMatch(group -> group.verifyCriteria(param));
	}

	/** Resolves, through the given resolver, the selections of every group of constraints. */
	public void setSelection(VerbResolver resolver) {
		this.criteria.forEach(group -> group.setSelection(resolver));
	}
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.community;
/** Constants shared by the bulk tagging classes. Not meant to be instantiated or extended. */
public final class TagginConstants {

	public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";

	public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
	public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";

	public static final String CLASS_ID_SUBJECT = "community:subject";
	public static final String CLASS_ID_DATASOURCE = "community:datasource";
	public static final String CLASS_ID_CZENODO = "community:zenodocommunity";

	public static final String SCHEMA_ID = "dnet:provenanceActions";
	public static final String COUNTER_GROUP = "Bulk Tagging";

	// substring identifying, in a context id, a reference to a zenodo community
	public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";

	public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
	public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
	public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";

	private TagginConstants() {
		// constants holder
	}
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import org.dom4j.Node;
import com.google.gson.Gson;
/**
 * A zenodo community associated to a community, with optional selection constraints.
 *
 * <p>Created by miriam on 01/08/2018.
 */
public class ZenodoCommunity implements Serializable {

	private String zenodoCommunityId;

	private SelectionConstraints selCriteria;

	public String getZenodoCommunityId() {
		return this.zenodoCommunityId;
	}

	public void setZenodoCommunityId(String zenodoCommunityId) {
		this.zenodoCommunityId = zenodoCommunityId;
	}

	public SelectionConstraints getSelCriteria() {
		return this.selCriteria;
	}

	public void setSelCriteria(SelectionConstraints selCriteria) {
		this.selCriteria = selCriteria;
	}

	/**
	 * Sets the selection constraints by deserializing the text of the given node as JSON.
	 *
	 * @param n the node holding the JSON selection criteria; when null the constraints are cleared
	 */
	public void setSelCriteria(Node n) {
		this.selCriteria = (n == null)
			? null
			: new Gson().fromJson(n.getText(), SelectionConstraints.class);
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values that contain the configured parameter (case sensitive). */
@VerbClass("contains")
public class ContainsVerb implements Selection, Serializable {

	private String param;

	public ContainsVerb() {
	}

	public ContainsVerb(final String param) {
		this.param = param;
	}

	public String getParam() {
		return this.param;
	}

	public void setParam(String param) {
		this.param = param;
	}

	/** @return true if {@code value} contains the parameter */
	@Override
	public boolean apply(String value) {
		final boolean found = value.contains(this.param);
		return found;
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values that contain the configured parameter, ignoring the case. */
@VerbClass("contains_ignorecase")
public class ContainsVerbIgnoreCase implements Selection, Serializable {

	private String param;

	public ContainsVerbIgnoreCase() {
	}

	public ContainsVerbIgnoreCase(final String param) {
		this.param = param;
	}

	/**
	 * @return true if {@code value} contains the parameter, comparing the lower-cased forms
	 */
	@Override
	public boolean apply(String value) {
		// Locale.ROOT keeps the comparison independent from the JVM default locale
		// (e.g. with a Turkish default locale "I" would not lower-case to "i")
		return value
			.toLowerCase(java.util.Locale.ROOT)
			.contains(param.toLowerCase(java.util.Locale.ROOT));
	}

	public String getParam() {
		return param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values equal to the configured parameter (case sensitive). */
@VerbClass("equals")
public class EqualVerb implements Selection, Serializable {

	private String param;

	public EqualVerb() {
	}

	public EqualVerb(final String param) {
		this.param = param;
	}

	public String getParam() {
		return this.param;
	}

	public void setParam(String param) {
		this.param = param;
	}

	/** @return true if {@code value} equals the parameter */
	@Override
	public boolean apply(String value) {
		final boolean same = value.equals(this.param);
		return same;
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values equal to the configured parameter, ignoring the case. */
@VerbClass("equals_ignorecase")
public class EqualVerbIgnoreCase implements Selection, Serializable {

	private String param;

	public EqualVerbIgnoreCase() {
	}

	public EqualVerbIgnoreCase(final String param) {
		this.param = param;
	}

	public String getParam() {
		return this.param;
	}

	public void setParam(String param) {
		this.param = param;
	}

	/** @return true if {@code value} equals the parameter, ignoring case differences */
	@Override
	public boolean apply(String value) {
		final boolean same = value.equalsIgnoreCase(this.param);
		return same;
	}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.lang.reflect.Type;
import com.google.gson.*;
public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {
private static final String CLASSNAME = "CLASSNAME";
private static final String DATA = "DATA";
public Object deserialize(
JsonElement jsonElement,
Type type,
JsonDeserializationContext jsonDeserializationContext)
throws JsonParseException {
JsonObject jsonObject = jsonElement.getAsJsonObject();
JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME);
String className = prim.getAsString();
Class klass = getObjectClass(className);
return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass);
}
public JsonElement serialize(
Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) {
JsonObject jsonObject = new JsonObject();
jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName());
jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement));
return jsonObject;
}
/** **** Helper method to get the className of the object to be deserialized **** */
public Class getObjectClass(String className) {
try {
return Class.forName(className);
} catch (ClassNotFoundException e) {
// e.printStackTrace();
throw new JsonParseException(e.getMessage());
}
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values that do not contain the configured parameter. */
@VerbClass("not_contains")
public class NotContainsVerb implements Selection, Serializable {

	private String param;

	public NotContainsVerb() {
	}

	public NotContainsVerb(final String param) {
		this.param = param;
	}

	public String getParam() {
		return this.param;
	}

	public void setParam(String param) {
		this.param = param;
	}

	/** @return true if {@code value} does not contain the parameter */
	@Override
	public boolean apply(String value) {
		final boolean found = value.contains(this.param);
		return !found;
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/**
 * Selection verified by the values that do not contain the configured parameter, ignoring the
 * case.
 */
@VerbClass("not_contains_ignorecase")
public class NotContainsVerbIgnoreCase implements Selection, Serializable {

	private String param;

	public NotContainsVerbIgnoreCase() {
	}

	public NotContainsVerbIgnoreCase(final String param) {
		this.param = param;
	}

	/**
	 * @return true if {@code value} does not contain the parameter, comparing the lower-cased
	 *         forms
	 */
	@Override
	public boolean apply(String value) {
		// Locale.ROOT keeps the comparison independent from the JVM default locale
		// (e.g. with a Turkish default locale "I" would not lower-case to "i")
		return !value
			.toLowerCase(java.util.Locale.ROOT)
			.contains(param.toLowerCase(java.util.Locale.ROOT));
	}

	public String getParam() {
		return param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values different from the configured parameter (case sensitive). */
@VerbClass("not_equals")
public class NotEqualVerb implements Selection, Serializable {

	private String param;

	public NotEqualVerb() {
	}

	public NotEqualVerb(final String param) {
		this.param = param;
	}

	/** @return true if {@code value} is not equal to the parameter */
	@Override
	public boolean apply(String value) {
		final boolean same = value.equals(this.param);
		return !same;
	}

	public String getParam() {
		return this.param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
/** Selection verified by the values different from the configured parameter, ignoring the case. */
@VerbClass("not_equals_ignorecase")
public class NotEqualVerbIgnoreCase implements Selection, Serializable {

	private String param;

	public NotEqualVerbIgnoreCase() {
	}

	public NotEqualVerbIgnoreCase(final String param) {
		this.param = param;
	}

	/** @return true if {@code value} differs from the parameter even when the case is ignored */
	@Override
	public boolean apply(String value) {
		final boolean same = value.equalsIgnoreCase(this.param);
		return !same;
	}

	public String getParam() {
		return this.param;
	}

	public void setParam(String param) {
		this.param = param;
	}
}

View File

@ -0,0 +1,7 @@
package eu.dnetlib.dhp.selectioncriteria;
/** A predicate over a single string value, implemented by the selection verbs. */
public interface Selection {

	/**
	 * Tests the given value.
	 *
	 * @param value the value to test
	 * @return true if the value satisfies this selection
	 */
	boolean apply(String value);
}

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Marks a class as the implementation of a selection verb; the annotation value is the name of
 * the verb. Retained at runtime and applicable to types only.
 */
@Target({
	ElementType.TYPE
})
@Retention(RetentionPolicy.RUNTIME)
@interface VerbClass {

	/** The name of the verb implemented by the annotated class. */
	String value();
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import io.github.classgraph.ClassGraph;
import io.github.classgraph.ClassInfo;
import io.github.classgraph.ClassInfoList;
import io.github.classgraph.ScanResult;
/**
 * Maps verb names to the {@link Selection} classes implementing them.
 *
 * <p>The mapping is built once, at construction time, by scanning the
 * {@code eu.dnetlib.dhp.selectioncriteria} package for classes annotated with {@code VerbClass}.
 */
public class VerbResolver implements Serializable {

	private Map<String, Class<Selection>> map = null;

	/**
	 * Scans the selection criteria package and registers every annotated class under the value
	 * of its annotation. A failure of the scan is reported on stderr and leaves the resolver
	 * without mappings.
	 */
	@SuppressWarnings("unchecked")
	public VerbResolver() {
		// the scanner is only needed here: keeping it in a field would put a reference to a
		// scanning utility into the serialized state of this Serializable class
		try (ScanResult scanResult = new ClassGraph()
			.verbose() // logging to stderr
			.enableAllInfo() // scan classes, methods, fields, annotations
			.whitelistPackages("eu.dnetlib.dhp.selectioncriteria")
			.scan()) {

			ClassInfoList routeClassInfoList = scanResult
				.getClassesWithAnnotation("eu.dnetlib.dhp.selectioncriteria.VerbClass");

			this.map = routeClassInfoList
				.stream()
				.collect(
					Collectors
						.toMap(
							// NOTE(review): reads the first parameter of the first annotation,
							// i.e. assumes VerbClass is the first annotation of the class
							value -> (String) ((ClassInfo) value)
								.getAnnotationInfo()
								.get(0)
								.getParameterValues()
								.get(0)
								.getValue(),
							value -> (Class<Selection>) ((ClassInfo) value).loadClass()));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Instantiates the selection registered for the given verb, passing {@code param} to its
	 * single-String constructor.
	 *
	 * @param name the verb name
	 * @param param the reference value handed to the selection
	 * @return a new selection instance
	 * @throws IllegalStateException if the verb mappings could not be loaded at construction time
	 * @throws IllegalArgumentException if no selection is registered for the given verb
	 */
	public Selection getSelectionCriteria(String name, String param)
		throws NoSuchMethodException, IllegalAccessException, InvocationTargetException,
		InstantiationException {

		if (map == null) {
			throw new IllegalStateException("selection verbs not loaded: the classpath scan failed");
		}
		final Class<Selection> selectionClass = map.get(name);
		if (selectionClass == null) {
			throw new IllegalArgumentException("unknown selection verb: " + name);
		}
		return selectionClass.getDeclaredConstructor((String.class)).newInstance(param);
	}
}

View File

@ -0,0 +1,10 @@
package eu.dnetlib.dhp.selectioncriteria;
/** Factory of {@link VerbResolver} instances. */
public class VerbResolverFactory {

	/**
	 * Creates a new resolver.
	 *
	 * @return a freshly constructed {@link VerbResolver}
	 */
	public static VerbResolver newInstance() {
		final VerbResolver resolver = new VerbResolver();
		return resolver;
	}
}

View File

@ -0,0 +1,51 @@
[
{
"paramName":"is",
"paramLongName":"isLookUpUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "pm",
"paramLongName":"pathMap",
"paramDescription": "the json path associated to each selection field",
"paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "test",
"paramLongName": "isTest",
"paramDescription": "true if the job is run in test mode, false otherwise",
"paramRequired": false
},
{
"paramName": "tg",
"paramLongName": "taggingConf",
"paramDescription": "the community tagging configuration to be used, e.g. when the job is run in test mode",
"paramRequired": false
}
]

View File

@ -0,0 +1,54 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,216 @@
<workflow-app name="bulk_tagging" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>isLookUpUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>pathMap</name>
<description>the json path associated to each selection field</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters>
<start to="reset_outputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_exec_bulktag"/>
<fork name="fork_exec_bulktag">
<path start="join_bulktag_publication"/>
<path start="join_bulktag_dataset"/>
<path start="join_bulktag_otherresearchproduct"/>
<path start="join_bulktag_software"/>
</fork>
<action name="join_bulktag_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>bulkTagging-publication</name>
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="join_bulktag_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>bulkTagging-dataset</name>
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="join_bulktag_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>bulkTagging-orp</name>
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="join_bulktag_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>bulkTagging-software</name>
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="End"/>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,853 @@
package eu.dnetlib.dhp;
import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.mortbay.util.IO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;
/**
 * Tests for {@link SparkBulkTagJob}.
 *
 * <p>Every test runs the job against a sample directory taken from the test resources, reads the
 * produced JSON lines back from a sub-directory of a temporary working directory, registers them as
 * a Spark temporary view and verifies the community contexts found in it through Spark SQL.
 *
 * <p>All tests share a single local {@link SparkSession}, created in {@link #beforeAll()} and
 * stopped in {@link #afterAll()}.
 */
public class BulkTagJobTest {

    private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Classpath location of the tagging configuration passed to the job as {@code -taggingConf}. */
    private static final String TAGGING_CONF_RESOURCE = "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml";

    /** Value passed to the job as {@code -isLookUpUrl}. */
    private static final String IS_LOOKUP_URL = "http://beta.services.openaire.eu:8280/is/services/isLookUp";

    /** Value passed to the job as {@code -pathMap}: field name to JSONPath expression. */
    private static final String PATH_MAP = "{ \"author\" : \"$['author'][*]['fullname']\","
        + " \"title\" : \"$['title'][*]['value']\","
        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
        + " \"contributor\" : \"$['contributor'][*]['value']\","
        + " \"description\" : \"$['description'][*]['value']\"}";

    /** Number of records every test expects to read back from the job output. */
    private static final long EXPECTED_OUTPUT_RECORDS = 10;

    /** Content of {@link #TAGGING_CONF_RESOURCE}, loaded once when the class is initialised. */
    private static final String taggingConf = loadTaggingConf();

    private static SparkSession spark;

    private static Path workingDir;

    /**
     * Reads the tagging configuration from the classpath.
     *
     * <p>Fails the class initialisation instead of silently continuing with an empty
     * configuration, so that a missing or unreadable resource is reported as such rather than as
     * unrelated assertion failures in the tests.
     *
     * @return the content of the tagging configuration resource
     * @throws IllegalStateException if the resource is missing or cannot be read
     */
    private static String loadTaggingConf() {
        try (java.io.InputStream in = BulkTagJobTest.class.getResourceAsStream(TAGGING_CONF_RESOURCE)) {
            if (in == null) {
                throw new IllegalStateException("missing test resource " + TAGGING_CONF_RESOURCE);
            }
            return IO.toString(in);
        } catch (IOException e) {
            throw new IllegalStateException("unable to read test resource " + TAGGING_CONF_RESOURCE, e);
        }
    }

    /**
     * Creates the temporary working directory and the local Spark session shared by all tests.
     *
     * @throws IOException if the temporary directory cannot be created
     */
    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files.createTempDirectory(BulkTagJobTest.class.getSimpleName());
        log.info("using work dir {}", workingDir);

        SparkConf conf = new SparkConf();
        conf.setAppName(BulkTagJobTest.class.getSimpleName());
        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

        spark = SparkSession
            .builder()
            .appName(BulkTagJobTest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    /**
     * Stops the Spark session and removes the working directory.
     *
     * <p>The session is stopped first and the directory is removed in a {@code finally} block, so
     * that neither a failing stop leaves the directory behind nor a failing deletion leaves the
     * session running.
     *
     * @throws IOException if the working directory cannot be deleted
     */
    @AfterAll
    public static void afterAll() throws IOException {
        try {
            if (spark != null) {
                spark.stop();
            }
        } finally {
            if (workingDir != null) {
                FileUtils.deleteDirectory(workingDir.toFile());
            }
        }
    }

    /**
     * Runs {@link SparkBulkTagJob} in test mode on a sample directory of the test resources.
     *
     * @param sourceResource classpath location of the directory holding the input records
     * @param resultClass class whose fully qualified name is passed as {@code -resultTableName}
     * @param outputName name of the output directory, created under the working directory
     * @throws Exception whatever the job throws
     */
    private void runBulkTag(String sourceResource, Class<?> resultClass, String outputName)
        throws Exception {
        SparkBulkTagJob
            .main(
                new String[] {
                    "-isTest",
                    Boolean.TRUE.toString(),
                    "-isSparkSessionManaged",
                    Boolean.FALSE.toString(),
                    "-sourcePath",
                    getClass().getResource(sourceResource).getPath(),
                    "-taggingConf",
                    taggingConf,
                    "-resultTableName",
                    resultClass.getName(),
                    "-outputPath",
                    workingDir.toString() + "/" + outputName,
                    "-isLookUpUrl",
                    IS_LOOKUP_URL,
                    "-pathMap",
                    PATH_MAP
                });
    }

    /**
     * Reads the job output back, checks it holds {@link #EXPECTED_OUTPUT_RECORDS} records and
     * registers it as a temporary view.
     *
     * @param name name of both the output directory under the working directory and the view
     * @param resultClass bean class each JSON line is deserialised into
     * @param <T> type of the records
     */
    private static <T> void registerOutputAsView(String name, Class<T> resultClass) {
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<T> records = sc
            .textFile(workingDir.toString() + "/" + name)
            .map(item -> OBJECT_MAPPER.readValue(item, resultClass));
        Assertions.assertEquals(EXPECTED_OUTPUT_RECORDS, records.count());
        spark
            .createDataset(records.rdd(), Encoders.bean(resultClass))
            .createOrReplaceTempView(name);
    }

    /**
     * Selects one row per (record, context, datainfo) whose inference provenance is
     * {@code bulktagging}, exposing the columns {@code id}, {@code community}, {@code provenance}
     * and {@code name}.
     *
     * @param view name of the temporary view to query
     * @return the query result
     */
    private static org.apache.spark.sql.Dataset<Row> bulkTaggedContexts(String view) {
        return spark
            .sql(
                "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
                    + "from " + view + " "
                    + "lateral view explode(context) c as MyT "
                    + "lateral view explode(MyT.datainfo) d as MyD "
                    + "where MyD.inferenceprovenance = 'bulktagging'");
    }

    /**
     * Selects one row per (record, context) having at least one datainfo, exposing the columns
     * {@code id}, {@code community} and {@code datainfosize}.
     *
     * @param view name of the temporary view to query
     * @return the query result
     */
    private static org.apache.spark.sql.Dataset<Row> contextDatainfoSizes(String view) {
        return spark
            .sql(
                "select id, MyT.id community, size(MyT.datainfo) datainfosize "
                    + "from " + view + " "
                    + "lateral view explode (context) as MyT "
                    + "where size(MyT.datainfo) > 0");
    }

    /**
     * Returns the {@code datainfosize} of the first row matching the given record and community.
     *
     * <p>The rows are filtered before the projection, so the filter does not depend on Spark
     * resolving columns that are no longer part of the projected output.
     *
     * @param sizes result of {@link #contextDatainfoSizes(String)}
     * @param id identifier of the record
     * @param community identifier of the community context
     * @return the number of datainfo attached to that context
     */
    private static int datainfoSize(
        org.apache.spark.sql.Dataset<Row> sizes, String id, String community) {
        return sizes
            .where("id = '" + id + "' and community = '" + community + "'")
            .select("datainfosize")
            .collectAsList()
            .get(0)
            .getInt(0);
    }

    /** Asserts the number of rows satisfying the given SQL condition. */
    private static void assertCount(
        long expected, org.apache.spark.sql.Dataset<Row> rows, String condition) {
        Assertions.assertEquals(expected, rows.filter(condition).count());
    }

    @Test
    public void noUpdatesTest() throws Exception {
        runBulkTag("/eu/dnetlib/dhp/sample/dataset/no_updates", Dataset.class, "dataset");
        registerOutputAsView("dataset", Dataset.class);

        Assertions.assertEquals(0, bulkTaggedContexts("dataset").count());
    }

    @Test
    public void bulktagBySubjectNoPreviousContextTest() throws Exception {
        runBulkTag("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext", Dataset.class, "dataset");
        registerOutputAsView("dataset", Dataset.class);

        org.apache.spark.sql.Dataset<Row> tagged = bulkTaggedContexts("dataset");
        Assertions.assertEquals(5, tagged.count());

        assertCount(5, tagged, "provenance = 'community:subject'");
        assertCount(5, tagged, "name = 'Bulktagging for Community - Subject'");

        assertCount(2, tagged, "community = 'covid-19'");
        assertCount(1, tagged, "community = 'mes'");
        assertCount(1, tagged, "community = 'fam'");
        assertCount(1, tagged, "community = 'aginfra'");

        assertCount(1, tagged, "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'");
        assertCount(
            1,
            tagged,
            "community = 'covid-19' and id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'");

        assertCount(2, tagged, "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'");
        assertCount(
            2,
            tagged,
            "(community = 'covid-19' or community = 'aginfra') and id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'");

        assertCount(2, tagged, "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'");
        assertCount(
            2,
            tagged,
            "(community = 'mes' or community = 'fam') and id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'");
    }

    @Test
    public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception {
        runBulkTag(
            "/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance",
            Dataset.class,
            "dataset");
        registerOutputAsView("dataset", Dataset.class);

        // here every datainfo of the covid-19 context is considered, whatever its provenance
        org.apache.spark.sql.Dataset<Row> communityContext = spark
            .sql(
                "select id, MyT.id community, MyD.provenanceaction.classid provenance "
                    + "from dataset "
                    + "lateral view explode(context) c as MyT "
                    + "lateral view explode(MyT.datainfo) d as MyD "
                    + "where MyT.id = 'covid-19' ");
        Assertions.assertEquals(3, communityContext.count());

        assertCount(
            2, communityContext, "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'");
        assertCount(
            1,
            communityContext,
            "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'community:subject'");
        assertCount(
            1,
            communityContext,
            "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'propagation:community:productsthroughsemrel'");

        Assertions
            .assertEquals(
                2,
                datainfoSize(
                    contextDatainfoSizes("dataset"),
                    "50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529",
                    "covid-19"));
    }

    @Test
    public void bulktagByDatasourceTest() throws Exception {
        runBulkTag(
            "/eu/dnetlib/dhp/sample/publication/update_datasource", Publication.class, "publication");
        registerOutputAsView("publication", Publication.class);

        org.apache.spark.sql.Dataset<Row> tagged = bulkTaggedContexts("publication");
        Assertions.assertEquals(5, tagged.count());

        assertCount(5, tagged, "provenance = 'community:datasource'");
        assertCount(5, tagged, "name = 'Bulktagging for Community - Datasource'");

        assertCount(3, tagged, "community = 'fam'");
        assertCount(2, tagged, "community = 'aginfra'");

        assertCount(
            3,
            tagged,
            "community = 'fam' and (id = '50|ec_fp7health::000085c89f4b96dc2269bd37edb35306' "
                + "or id = '50|ec_fp7health::000b9e61f83f5a4b0c35777b7bccdf38' "
                + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')");
        assertCount(
            2,
            tagged,
            "community = 'aginfra' and (id = '50|ec_fp7health::000c8195edd542e4e64ebb32172cbf89' "
                + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')");
    }

    @Test
    public void bulktagByZenodoCommunityTest() throws Exception {
        runBulkTag(
            "/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity",
            OtherResearchProduct.class,
            "orp");
        registerOutputAsView("orp", OtherResearchProduct.class);

        org.apache.spark.sql.Dataset<Row> tagged = bulkTaggedContexts("orp");
        Assertions.assertEquals(8, tagged.count());

        assertCount(8, tagged, "provenance = 'community:zenodocommunity'");
        assertCount(8, tagged, "name = 'Bulktagging for Community - Zenodo'");

        assertCount(1, tagged, "community = 'covid-19'");
        assertCount(1, tagged, "community = 'aginfra'");
        assertCount(2, tagged, "community = 'beopen'");
        assertCount(2, tagged, "community = 'fam'");
        assertCount(2, tagged, "community = 'mes'");

        assertCount(
            1,
            tagged,
            "id = '50|od______2017::0750a4d0782265873d669520f5e33c07' "
                + "and community = 'covid-19'");
        assertCount(
            3,
            tagged,
            "id = '50|od______2017::1bd97baef19dbd2db3203b112bb83bc5' and "
                + "(community = 'aginfra' or community = 'mes' or community = 'fam')");
        assertCount(
            1,
            tagged,
            "id = '50|od______2017::1e400f1747487fd15998735c41a55c72' "
                + "and community = 'beopen'");
        assertCount(
            3,
            tagged,
            "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' and "
                + "(community = 'beopen' or community = 'fam' or community = 'mes')");

        Assertions
            .assertEquals(
                2,
                datainfoSize(
                    contextDatainfoSizes("orp"),
                    "50|od______2017::210281c5bc1c739a11ccceeeca806396",
                    "beopen"));

        // verify the zenodo community context is not present anymore in the records
        org.apache.spark.sql.Dataset<Row> allContexts = spark
            .sql(
                "select id, MyT.id community "
                    + "from orp "
                    + "lateral view explode(context) c as MyT "
                    + "lateral view explode(MyT.datainfo) d as MyD ");
        Assertions
            .assertEquals(
                0,
                allContexts
                    .select("community")
                    .where(allContexts.col("community").contains(ZENODO_COMMUNITY_INDICATOR))
                    .count());
    }

    @Test
    public void bulktagBySubjectDatasourceTest() throws Exception {
        runBulkTag(
            "/eu/dnetlib/dhp/sample/dataset/update_subject_datasource", Dataset.class, "dataset");
        registerOutputAsView("dataset", Dataset.class);

        org.apache.spark.sql.Dataset<Row> tagged = bulkTaggedContexts("dataset");
        Assertions.assertEquals(7, tagged.count());

        assertCount(5, tagged, "provenance = 'community:subject'");
        assertCount(2, tagged, "provenance = 'community:datasource'");

        assertCount(2, tagged, "community = 'covid-19'");
        assertCount(2, tagged, "community = 'fam'");
        assertCount(2, tagged, "community = 'aginfra'");
        assertCount(1, tagged, "community = 'mes'");

        org.apache.spark.sql.Dataset<Row> sizes = contextDatainfoSizes("dataset");

        Assertions
            .assertEquals(
                2,
                datainfoSize(sizes, "50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b", "aginfra"));
        Assertions
            .assertEquals(
                1,
                datainfoSize(sizes, "50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b", "covid-19"));
        Assertions
            .assertEquals(
                2,
                datainfoSize(sizes, "50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529", "fam"));
        Assertions
            .assertEquals(
                2,
                datainfoSize(sizes, "50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529", "covid-19"));
        Assertions
            .assertEquals(
                1,
                datainfoSize(sizes, "50|od______3989::0f89464c4ac4c398fe0c71433b175a62", "fam"));
        Assertions
            .assertEquals(
                1,
                datainfoSize(sizes, "50|od______3989::0f89464c4ac4c398fe0c71433b175a62", "mes"));
    }

    @Test
    public void bulktagBySubjectDatasourceZenodoCommunityTest() throws Exception {
        runBulkTag("/eu/dnetlib/dhp/sample/software/", Software.class, "software");
        registerOutputAsView("software", Software.class);

        org.apache.spark.sql.Dataset<Row> tagged = bulkTaggedContexts("software");
        Assertions.assertEquals(10, tagged.count());
        tagged.show(false);

        assertCount(3, tagged, "provenance = 'community:subject'");
        assertCount(3, tagged, "provenance = 'community:datasource'");
        assertCount(4, tagged, "provenance = 'community:zenodocommunity'");

        assertCount(3, tagged, "community = 'covid-19'");
        assertCount(1, tagged, "community = 'dh-ch'");
        assertCount(4, tagged, "community = 'aginfra'");
        assertCount(1, tagged, "community = 'dariah'");
        assertCount(1, tagged, "community = 'fam'");

        assertCount(
            2,
            tagged,
            "provenance = 'community:zenodocommunity' and "
                + "id = '50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4' and ("
                + "community = 'dh-ch' or community = 'dariah')");

        org.apache.spark.sql.Dataset<Row> sizes = contextDatainfoSizes("software");

        Assertions
            .assertEquals(
                2,
                datainfoSize(sizes, "50|od______1582::501b25d420f808c8eddcd9b16e917f11", "covid-19"));
        Assertions
            .assertEquals(
                3,
                datainfoSize(sizes, "50|od______1582::581621232a561b7e8b4952b18b8b0e56", "aginfra"));
    }

    @Test
    public void bulktagDatasourcewithConstraintsTest() throws Exception {
        runBulkTag(
            "/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints",
            Dataset.class,
            "dataset");
        registerOutputAsView("dataset", Dataset.class);

        org.apache.spark.sql.Dataset<Row> tagged = bulkTaggedContexts("dataset");
        tagged.show(false);
        Assertions.assertEquals(3, tagged.count());

        assertCount(3, tagged, "provenance = 'community:datasource'");
    }
}

View File

@ -0,0 +1,166 @@
package eu.dnetlib.dhp;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.DocumentException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import com.google.gson.Gson;
import eu.dnetlib.dhp.community.CommunityConfiguration;
import eu.dnetlib.dhp.community.CommunityConfigurationFactory;
import eu.dnetlib.dhp.community.Constraint;
import eu.dnetlib.dhp.community.SelectionConstraints;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
/**
 * Tests for the parsing of community configurations through {@link CommunityConfigurationFactory}
 * and for the evaluation of selection constraints.
 *
 * <p>Created by miriam on 03/08/2018.
 */
public class CommunityConfigurationFactoryTest {

    /** Classpath directory holding the configuration samples used by the tests. */
    private static final String CONF_DIR = "/eu/dnetlib/dhp/communityconfiguration/";

    private final VerbResolver resolver = new VerbResolver();

    /**
     * Reads a sample from {@link #CONF_DIR} as UTF-8 text, closing the underlying stream.
     *
     * @param name file name of the sample
     * @return the content of the sample
     * @throws IOException if the sample is missing or cannot be read
     */
    private String readConf(String name) throws IOException {
        try (java.io.InputStream in = getClass().getResourceAsStream(CONF_DIR + name)) {
            if (in == null) {
                throw new IOException("missing test resource " + CONF_DIR + name);
            }
            return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }

    /** The XML sample defines five communities, each with a non-blank identifier. */
    @Test
    public void parseTest() throws DocumentException, IOException {
        final CommunityConfiguration cc = CommunityConfigurationFactory
            .newInstance(readConf("community_configuration.xml"));

        Assertions.assertEquals(5, cc.size());
        cc
            .getCommunityList()
            .forEach(c -> Assertions.assertTrue(StringUtils.isNotBlank(c.getId())));
    }

    /** A {@code not_contains} constraint is not satisfied by a text containing its value. */
    @Test
    public void applyVerb()
        throws InvocationTargetException, IllegalAccessException, NoSuchMethodException,
        InstantiationException {
        Constraint sc = new Constraint();
        sc.setVerb("not_contains");
        sc.setField("contributor");
        sc.setValue("DARIAH");
        sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue()));

        String metadata = "This work has been partially supported by DARIAH-EU infrastructure";
        Assertions.assertFalse(sc.verifyCriteria(metadata));
    }

    /** With the selection criteria sample, the given datasource and metadata resolve to "dariah". */
    @Test
    public void loadSelCriteriaTest() throws DocumentException, IOException {
        final CommunityConfiguration cc = CommunityConfigurationFactory
            .newInstance(readConf("community_configuration_selcrit.xml"));

        Map<String, List<String>> param = new HashMap<>();
        param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi")));
        param
            .put(
                "description",
                new ArrayList<>(
                    Collections
                        .singletonList(
                            "This work has been partially supported by DARIAH-EU infrastructure")));
        param
            .put(
                "contributor",
                new ArrayList<>(
                    Collections
                        .singletonList(
                            "Pallino ha aiutato a scrivere il paper. Pallino lavora per DARIAH")));

        List<String> comm = cc
            .getCommunityForDatasource(
                "openaire____::1cfdb2e14977f31a98e0118283401f32", param);

        Assertions.assertEquals(1, comm.size());
        Assertions.assertEquals("dariah", comm.get(0));
    }

    /** The JSON sample with selection criteria can be parsed. */
    @Test
    public void test4() throws DocumentException, IOException {
        final CommunityConfiguration cc = CommunityConfigurationFactory
            .fromJson(readConf("community_configuration_selcrit.json"));

        Assertions.assertNotNull(cc);
        // invoked only to check that it does not throw on a parsed configuration
        cc.toString();
    }

    /** The plain JSON sample can be parsed and serialised back to JSON. */
    @Test
    public void test5() throws IOException, DocumentException {
        final CommunityConfiguration cc = CommunityConfigurationFactory
            .fromJson(readConf("community_configuration.json"));

        Assertions.assertNotNull(cc);
        System.out.println(cc.toJson());
    }

    /** Gson maps a single constraint and a list of criteria onto the corresponding classes. */
    @Test
    public void test6() {
        String criteria = "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}";
        String constraint = "{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}";

        // Gson returns null only for null or empty input, so a null here means a broken mapping
        Assertions.assertNotNull(new Gson().fromJson(constraint, Constraint.class));
        Assertions.assertNotNull(new Gson().fromJson(criteria, SelectionConstraints.class));
    }

    /** The JSON tagging configuration can be parsed and serialised back to JSON. */
    @Test
    public void test7() throws IOException {
        final CommunityConfiguration cc = CommunityConfigurationFactory
            .fromJson(readConf("tagging_conf.json"));

        Assertions.assertNotNull(cc);
        System.out.println(cc.toJson());
    }

    /** The XML tagging configuration can be parsed and serialised to JSON. */
    @Test
    public void temporaneo() throws Exception {
        final CommunityConfiguration cc = CommunityConfigurationFactory
            .newInstance(readConf("tagging_conf.xml"));

        Assertions.assertNotNull(cc);
        System.out.println(cc.toJson());
    }
}

View File

@ -0,0 +1,694 @@
{"communities": {
"clarin": {
"id": "clarin",
"subjects": [],
"datasources": [
{
"openaireId": "re3data_____::a507cdacc5bbcc08761c92185dee5cab"
}
],
"zenodoCommunities": [
]
},
"ee": {
"id": "ee",
"subjects": [
"SDG13 - Climate action",
"SDG8 - Decent work and economic\n\t\t\t\t\tgrowth",
"SDG15 - Life on land",
"SDG2 - Zero hunger",
"SDG17 - Partnerships for the\n\t\t\t\t\tgoals",
"SDG10 - Reduced inequalities",
"SDG5 - Gender equality",
"SDG12 - Responsible\n\t\t\t\t\tconsumption and production",
"SDG14 - Life below water",
"SDG6 - Clean water and\n\t\t\t\t\tsanitation",
"SDG11 - Sustainable cities and communities",
"SDG1 - No poverty",
"SDG3 -\n\t\t\t\t\tGood health and well being",
"SDG7 - Affordable and clean energy",
"SDG4 - Quality\n\t\t\t\t\teducation",
"SDG9 - Industry innovation and infrastructure",
"SDG16 - Peace justice\n\t\t\t\t\tand strong institutions"
],
"datasources": [
],
"zenodoCommunities": [
]
},
"aginfra": {
"id": "aginfra",
"subjects": [
"animal production and health",
"fisheries and aquaculture",
"food safety and human nutrition",
"information management",
"food technology",
"agri-food education and extension",
"natural resources and environment",
"food system",
"engineering technology and Research",
"agriculture",
"food safety risk assessment",
"food security",
"farming practices and systems",
"plant production and protection",
"agri-food economics and policy",
"food distribution",
"forestry"
],
"datasources": [
{
"openaireId": "opendoar____::1a551829d50f1400b0dab21fdd969c04"
},
{
"openaireId": "opendoar____::49af6c4e558a7569d80eee2e035e2bd7"
},
{
"openaireId": "opendoar____::0266e33d3f546cb5436a10798e657d97"
},
{
"openaireId": "opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06"
},
{
"openaireId": "opendoar____::41bfd20a38bb1b0bec75acf0845530a7"
},
{
"openaireId": "opendoar____::87ae6fb631f7c8a627e8e28785d9992d"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "edenis"
},
{
"zenodoCommunityId": "efsa-pilot"
},
{
"zenodoCommunityId": "egene3"
},
{
"zenodoCommunityId": "efsa-kj"
},
{
"zenodoCommunityId": "euromixproject"
},
{
"zenodoCommunityId": "discardless"
},
{
"zenodoCommunityId": "sedinstcjfst"
},
{
"zenodoCommunityId": "afinet-kc"
},
{
"zenodoCommunityId": "2231-4784"
},
{
"zenodoCommunityId": "2231-0606"
},
{
"zenodoCommunityId": "solace"
},
{
"zenodoCommunityId": "pa17"
},
{
"zenodoCommunityId": "smartakis"
},
{
"zenodoCommunityId": "sedinstcjae"
},
{
"zenodoCommunityId": "phenology_camera"
},
{
"zenodoCommunityId": "aginfra"
},
{
"zenodoCommunityId": "erosa"
},
{
"zenodoCommunityId": "bigdatagrapes"
}
]
},
"fam": {
"id": "fam",
"subjects": [
"Stock Assessment",
"pelagic",
"Fish farming",
"EMFF",
"Fisheries",
"Fishermen",
"maximum sustainable yield",
"trawler",
"Fishing vessel",
"Fisherman",
"Fishing gear",
"RFMO",
"Fish Aggregating Device",
"Bycatch",
"Fishery",
"common fisheries policy",
"Fishing fleet",
"Aquaculture"
],
"datasources": [
{
"openaireId": "doajarticles::8cec81178926caaca531afbd8eb5d64c"
},
{
"openaireId": "doajarticles::0f7a7f30b5400615cae1829f3e743982"
},
{
"openaireId": "doajarticles::9740f7f5af3e506d2ad2c215cdccd51a"
},
{
"openaireId": "doajarticles::9f3fbaae044fa33cb7069b72935a3254"
},
{
"openaireId": "doajarticles::cb67f33eb9819f5c624ce0313957f6b3"
},
{
"openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906"
},
{
"openaireId": "doajarticles::554cde3be9e5c4588b4c4f9f503120cb"
},
{
"openaireId": "tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b"
},
{
"openaireId": "doajarticles::57c5d3837da943e93b28ec4db82ec7a5"
},
{
"openaireId": "doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1"
},
{
"openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906"
},
{
"openaireId": "doajarticles::dca64612dfe0963fffc119098a319957"
},
{
"openaireId": "doajarticles::dd70e44479f0ade25aa106aef3e87a0a"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "discardless"
},
{
"zenodoCommunityId": "farfish2020"
},
{
"zenodoCommunityId": "facts"
},
{
"zenodoCommunityId": "climefish"
},
{
"zenodoCommunityId": "proeel"
},
{
"zenodoCommunityId": "primefish"
},
{
"zenodoCommunityId": "h2020_vicinaqua"
},
{
"zenodoCommunityId": "meece"
},
{
"zenodoCommunityId": "rlsadb"
}
]
},
"instruct": {
"id": "instruct",
"subjects": [
],
"datasources": [
],
"zenodoCommunities": [
{
"zenodoCommunityId": "instruct"
},
{
"zenodoCommunityId": "west-life"
}
]
},
"mes": {
"id": "mes",
"subjects": [
"marine",
"ocean",
"fish",
"aqua",
"sea"
],
"datasources": [
],
"zenodoCommunities": [
{
"zenodoCommunityId": "adriplan"
},
{
"zenodoCommunityId": "devotes-project"
},
{
"zenodoCommunityId": "euro-basin"
},
{
"zenodoCommunityId": "naclim"
},
{
"zenodoCommunityId": "discardless"
},
{
"zenodoCommunityId": "assisibf"
},
{
"zenodoCommunityId": "meece"
},
{
"zenodoCommunityId": "facts"
},
{
"zenodoCommunityId": "proeel"
},
{
"zenodoCommunityId": "aquatrace"
},
{
"zenodoCommunityId": "myfish"
},
{
"zenodoCommunityId": "atlas"
},
{
"zenodoCommunityId": "blue-actionh2020"
},
{
"zenodoCommunityId": "sponges"
},
{
"zenodoCommunityId": "merces_project"
},
{
"zenodoCommunityId": "bigdataocean"
},
{
"zenodoCommunityId": "columbus"
},
{
"zenodoCommunityId": "h2020-aquainvad-ed"
},
{
"zenodoCommunityId": "aquarius"
},
{
"zenodoCommunityId": "southern-ocean-observing-system"
},
{
"zenodoCommunityId": "eawag"
},
{
"zenodoCommunityId": "mossco"
},
{
"zenodoCommunityId": "onc"
},
{
"zenodoCommunityId": "oceanbiogeochemistry"
},
{
"zenodoCommunityId": "oceanliteracy"
},
{
"zenodoCommunityId": "openearth"
},
{
"zenodoCommunityId": "ocean"
},
{
"zenodoCommunityId": "calcifierraman"
},
{
"zenodoCommunityId": "bermudabream"
},
{
"zenodoCommunityId": "brcorp1"
},
{
"zenodoCommunityId": "mce"
},
{
"zenodoCommunityId": "biogeochem"
},
{
"zenodoCommunityId": "ecc2014"
},
{
"zenodoCommunityId": "fisheries"
},
{
"zenodoCommunityId": "sedinstcjfas"
},
{
"zenodoCommunityId": "narmada"
},
{
"zenodoCommunityId": "umr-entropie"
},
{
"zenodoCommunityId": "farfish2020"
},
{
"zenodoCommunityId": "primefish"
},
{
"zenodoCommunityId": "zf-ilcs"
},
{
"zenodoCommunityId": "climefish"
},
{
"zenodoCommunityId": "afrimed_eu"
},
{
"zenodoCommunityId": "spi-ace"
},
{
"zenodoCommunityId": "cice-consortium"
},
{
"zenodoCommunityId": "nemo-ocean"
},
{
"zenodoCommunityId": "mesopp-h2020"
},
{
"zenodoCommunityId": "marxiv"
}
]
},
"ni": {
"id": "ni",
"subjects": [
"brain mapping",
"brain imaging",
"electroencephalography",
"arterial spin labelling",
"brain fingerprinting",
"brain",
"neuroimaging",
"Multimodal Brain Image Analysis",
"fMRI",
"neuroinformatics",
"fetal brain",
"brain ultrasonic imaging",
"topographic brain mapping",
"diffusion tensor imaging",
"computerized knowledge assessment",
"connectome mapping",
"brain magnetic resonance imaging",
"brain abnormalities"
],
"datasources": [
{
"openaireId": "re3data_____::5b9bf9171d92df854cf3c520692e9122"
},
{
"openaireId": "doajarticles::c7d3de67dc77af72f6747157441252ec"
},
{
"openaireId": "re3data_____::8515794670370f49c1d176c399c714f5"
},
{
"openaireId": "doajarticles::d640648c84b10d425f96f11c3de468f3"
},
{
"openaireId": "doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"
},
{
"openaireId": "rest________::fb1a3d4523c95e63496e3bc7ba36244b"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "neuroinformatics"
},
{
"zenodoCommunityId": "hbp"
},
{
"zenodoCommunityId": "from_neuroscience_to_machine_learning"
},
{
"zenodoCommunityId": "ci2c"
},
{
"zenodoCommunityId": "opensourcebrain"
},
{
"zenodoCommunityId": "brainspeak"
},
{
"zenodoCommunityId": "braincom"
},
{
"zenodoCommunityId": "nextgenvis"
},
{
"zenodoCommunityId": "meso-brain"
},
{
"zenodoCommunityId": "neuroplasticity-workshop"
},
{
"zenodoCommunityId": "bionics"
},
{
"zenodoCommunityId": "brainmattrain-676408"
},
{
"zenodoCommunityId": "repronim"
},
{
"zenodoCommunityId": "affectiveneuro"
},
{
"zenodoCommunityId": "con"
},
{
"zenodoCommunityId": "lab_neurol_sperim_irfmn_irccs_milano_it"
}
]
},
"dariah": {
"id": "dariah",
"subjects": [
],
"datasources": [
{
"openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a",
"sc": {
"cl": {
"criteria": [
{
"ce": {
"constraint": [
{
"verb": "contains",
"field": "contributor",
"value": "DARIAH"
}
]
}
}
]
}
}
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "dimpo"
}
]
},
"rda": {
"id": "rda",
"subjects": [
],
"datasources": [
],
"zenodoCommunities": [
{
"zenodoCommunityId": "rda"
}
]
},
"dh-ch": {
"id": "dh-ch",
"subjects": [
"modern art",
"metadata",
"monuments",
"sites",
"field walking",
"frescoes",
"excavation",
"ontologies",
"mapping",
"cities",
"temples",
"lithics",
"roads",
"digital cultural heritage",
"interoperability",
"archaeological reports",
"churches",
"standards",
"archaeological stratigraphy",
"buidings",
"digital humanities",
"survey",
"archaeological sites",
"CIDOC CRM",
"decorations",
"classic art",
"stratigraphy",
"digital archaeology",
"walls",
"data science",
"chapels",
"paintings",
"archaeology",
"fair data",
"mosaics",
"data visualization",
"burials",
"medieval art",
"castles",
"statues",
"natural language processing",
"inscriptions",
"vaults",
"open data",
"contemporary art",
"3D",
"pottery",
"site",
"metadata schema",
"architectural",
"vessels"
],
"datasources": [
{
"openaireId": "re3data_____::9ebe127e5f3a0bf401875690f3bb6b81"
},
{
"openaireId": "doajarticles::c6cd4b532e12868c1d760a8d7cda6815"
},
{
"openaireId": "doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b"
},
{
"openaireId": "doajarticles::6eb31d13b12bc06bbac06aef63cf33c9"
},
{
"openaireId": "doajarticles::0da84e9dfdc8419576169e027baa8028"
},
{
"openaireId": "re3data_____::84e123776089ce3c7a33db98d9cd15a8"
},
{
"openaireId": "openaire____::c5502a43e76feab55dd00cf50f519125"
},
{
"openaireId": "re3data_____::a48f09c562b247a9919acfe195549b47"
},
{
"openaireId": "opendoar____::97275a23ca44226c9964043c8462be96"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "storm"
},
{
"zenodoCommunityId": "crosscult"
},
{
"zenodoCommunityId": "wholodance_eu"
},
{
"zenodoCommunityId": "digcur2013"
},
{
"zenodoCommunityId": "gravitate"
},
{
"zenodoCommunityId": "dipp2014"
},
{
"zenodoCommunityId": "digitalhumanities"
},
{
"zenodoCommunityId": "dimpo"
},
{
"zenodoCommunityId": "adho"
},
{
"zenodoCommunityId": "chc"
},
{
"zenodoCommunityId": "wahr"
},
{
"zenodoCommunityId": "ibe"
},
{
"zenodoCommunityId": "ariadne"
},
{
"zenodoCommunityId": "parthenos-hub"
},
{
"zenodoCommunityId": "parthenos-training"
},
{
"zenodoCommunityId": "gandhara"
},
{
"zenodoCommunityId": "cmsouthasia"
},
{
"zenodoCommunityId": "nilgirihills"
},
{
"zenodoCommunityId": "shamsa_mustecio"
},
{
"zenodoCommunityId": "bodhgaya"
}
]
}
}
}

View File

@ -0,0 +1,176 @@
<communities>
<community id="fet-fp7">
<oacommunity/>
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fet-h2020">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="oa-pg">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ee">
<subjects>
<subject>SDG13 - Climate action</subject>
<subject>SDG8 - Decent work and economic growth</subject>
<subject>SDG15 - Life on land</subject>
<subject>SDG2 - Zero hunger</subject>
<subject>SDG17 - Partnerships for the goals</subject>
<subject>SDG10 - Reduced inequalities</subject>
<subject>SDG5 - Gender equality</subject>
<subject>SDG12 - Responsible consumption and production</subject>
<subject>SDG14 - Life below water</subject>
<subject>SDG6 - Clean water and sanitation</subject>
<subject>SDG11 - Sustainable cities and communities</subject>
<subject>SDG1 - No poverty</subject>
<subject>SDG3 - Good health and well being</subject>
<subject>SDG7 - Affordable and clean energy</subject>
<subject>SDG4 - Quality education</subject>
<subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject>
</subjects>
<datasources/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>123</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
</community>
<community id="dh-ch">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fam">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ni">
<subjects>
<subject>brain mapping</subject>
<subject>brain imaging</subject>
<subject>electroencephalography</subject>
<subject>arterial spin labelling</subject>
<subject>brain fingerprinting</subject>
<subject>brain</subject>
<subject>neuroimaging</subject>
<subject>Multimodal Brain Image Analysis</subject>
<subject>fMRI</subject>
<subject>neuroinformatics</subject>
<subject>fetal brain</subject>
<subject>brain ultrasonic imaging</subject>
<subject>topographic brain mapping</subject>
<subject>diffusion tensor imaging</subject>
<subject>computerized knowledge assessment</subject>
<subject>connectome mapping</subject>
<subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::c7d3de67dc77af72f6747157441252ec</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>re3data_____::8515794670370f49c1d176c399c714f5</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::d640648c84b10d425f96f11c3de468f3</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="mes">
<subjects>
<subject>marine</subject>
<subject>ocean</subject>
<subject>fish</subject>
<subject>aqua</subject>
<subject>sea</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="aginfra">
<subjects>
<subject>animal production and health</subject>
<subject>fisheries and aquaculture</subject>
<subject>food safety and human nutrition</subject>
<subject>information management</subject>
<subject>food technology</subject>
<subject>agri-food education and extension</subject>
<subject>natural resources and environment</subject>
<subject>food system</subject>
<subject>engineering technology and Research</subject>
<subject>agriculture</subject>
<subject>food safety risk assessment</subject>
<subject>food security</subject>
<subject>farming practices and systems</subject>
<subject>plant production and protection</subject>
<subject>agri-food economics and policy</subject>
<subject>food distribution</subject>
<subject>forestry</subject>
</subjects>
<datasources>
<datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::49af6c4e558a7569d80eee2e035e2bd7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::0266e33d3f546cb5436a10798e657d97</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::41bfd20a38bb1b0bec75acf0845530a7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="clarin">
<oacommunity>oac_clarin</oacommunity>
<subjects/>
<datasources>
<datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
</communities>

View File

@ -0,0 +1,37 @@
{
"communities": {
"dariah": {
"id": "dariah",
"subjects": [
],
"datasources": [
{
"openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a",
"sc": {
"cl": {
"criteria": [
{
"ce": {
"constraint": [
{
"verb": "contains",
"field": "contributor",
"value": "DARIAH"
}
]
}
}
]
}
}
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "dimpo"
}
]
}
}
}

View File

@ -0,0 +1,193 @@
<communities>
<community id="fet-fp7">
<oacommunity/>
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fet-h2020">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="oa-pg">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ee">
<subjects>
<subject>SDG13 - Climate action</subject>
<subject>SDG8 - Decent work and economic growth</subject>
<subject>SDG15 - Life on land</subject>
<subject>SDG2 - Zero hunger</subject>
<subject>SDG17 - Partnerships for the goals</subject>
<subject>SDG10 - Reduced inequalities</subject>
<subject>SDG5 - Gender equality</subject>
<subject>SDG12 - Responsible consumption and production</subject>
<subject>SDG14 - Life below water</subject>
<subject>SDG6 - Clean water and sanitation</subject>
<subject>SDG11 - Sustainable cities and communities</subject>
<subject>SDG1 - No poverty</subject>
<subject>SDG3 - Good health and well being</subject>
<subject>SDG7 - Affordable and clean energy</subject>
<subject>SDG4 - Quality education</subject>
<subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject>
</subjects>
<datasources/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>123</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
</community>
<community id="dh-ch">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fam">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ni">
<subjects>
<subject>brain mapping</subject>
<subject>brain imaging</subject>
<subject>electroencephalography</subject>
<subject>arterial spin labelling</subject>
<subject>brain fingerprinting</subject>
<subject>brain</subject>
<subject>neuroimaging</subject>
<subject>Multimodal Brain Image Analysis</subject>
<subject>fMRI</subject>
<subject>neuroinformatics</subject>
<subject>fetal brain</subject>
<subject>brain ultrasonic imaging</subject>
<subject>topographic brain mapping</subject>
<subject>diffusion tensor imaging</subject>
<subject>computerized knowledge assessment</subject>
<subject>connectome mapping</subject>
<subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::c7d3de67dc77af72f6747157441252ec</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>re3data_____::8515794670370f49c1d176c399c714f5</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::d640648c84b10d425f96f11c3de468f3</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="mes">
<subjects>
<subject>marine</subject>
<subject>ocean</subject>
<subject>fish</subject>
<subject>aqua</subject>
<subject>sea</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="aginfra">
<subjects>
<subject>animal production and health</subject>
<subject>fisheries and aquaculture</subject>
<subject>food safety and human nutrition</subject>
<subject>information management</subject>
<subject>food technology</subject>
<subject>agri-food education and extension</subject>
<subject>natural resources and environment</subject>
<subject>food system</subject>
<subject>engineering technology and Research</subject>
<subject>agriculture</subject>
<subject>food safety risk assessment</subject>
<subject>food security</subject>
<subject>farming practices and systems</subject>
<subject>plant production and protection</subject>
<subject>agri-food economics and policy</subject>
<subject>food distribution</subject>
<subject>forestry</subject>
</subjects>
<datasources>
<datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::49af6c4e558a7569d80eee2e035e2bd7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::0266e33d3f546cb5436a10798e657d97</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::41bfd20a38bb1b0bec75acf0845530a7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="clarin">
<oacommunity>oac_clarin</oacommunity>
<subjects/>
<datasources>
<datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="dariah">
<oacommunity>oaa_dariah</oacommunity>
<subjects/>
<datasources>
<datasource>
<openaireId>openaire____::1cfdb2e14977f31a98e0118283401f32</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
</selcriteria>
</datasource>
</datasources>
<zenodocommunities>
<zenodocommunity>
<zenodoid>dimpo</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
</community>
</communities>

File diff suppressed because one or more lines are too long

View File

@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2; import scala.Tuple2;

View File

@ -30,8 +30,8 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
@ -94,7 +94,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
log.info("Processing orgs..."); log.info("Processing orgs...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
log.info("Processing relations ds <-> orgs ..."); log.info("Processing relationsNoRemoval ds <-> orgs ...");
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
log.info("Processing projects <-> orgs ..."); log.info("Processing projects <-> orgs ...");
@ -370,6 +370,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final DataInfo info = dataInfo( final DataInfo info = dataInfo(
false, null, false, false, false, null, false, false,
qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9"); qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9");
final List<KeyValue> collectedFrom = listKeyValues( final List<KeyValue> collectedFrom = listKeyValues(
@ -460,7 +461,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final Boolean inferred = rs.getBoolean("inferred"); final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust"); final String trust = rs.getString("trust");
return dataInfo( return dataInfo(
deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
} }
private Qualifier prepareQualifierSplitting(final String s) { private Qualifier prepareQualifierSplitting(final String s) {
@ -516,6 +519,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
if (arr.length == 3) { if (arr.length == 3) {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null; final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0].trim() : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null; final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1].trim() : null;
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2].trim() : null;
if (issn != null || eissn != null || lissn != null) { if (issn != null || eissn != null || lissn != null) {

View File

@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the dhp-propagation module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- This module is a child of dhp-workflows. -->
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-propagation</artifactId>
<dependencies>
<!-- Spark artifacts (Scala 2.11 builds). No version or scope is declared here;
     presumably both are managed by a parent POM (TODO confirm). -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<!-- Sibling project modules, pinned to the version of this module. -->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Hive support is only on the test classpath. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,169 @@
package eu.dnetlib.dhp;
import java.util.List;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.*;
public class PropagationConstant {
public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional";
public static final String PROPAGATION_DATA_INFO_TYPE = "propagation";
public static final String TRUE = "true";
public static final String DNET_COUNTRY_SCHEMA = "dnet:countries";
public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo";
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository";
public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel";
public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation";
public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel";
public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation";
public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization";
public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization";
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy";
public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization";
public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation";
public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf";
public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution";
public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult";
public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject";
public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome";
public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy";
public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces";
public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges";
public static final String PROPAGATION_AUTHOR_PID = "ORCID";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+
"from result r " +
"lateral view explode(instance) i as inst " +
"where r.datainfo.deletedbyinference=false";
public static Country getCountry(String classid, String classname) {
Country nc = new Country();
nc.setClassid(classid);
nc.setClassname(classname);
nc.setSchemename(DNET_COUNTRY_SCHEMA);
nc.setSchemeid(DNET_COUNTRY_SCHEMA);
nc
.setDataInfo(
getDataInfo(
PROPAGATION_DATA_INFO_TYPE,
PROPAGATION_COUNTRY_INSTREPO_CLASS_ID,
PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME));
return nc;
}
public static DataInfo getDataInfo(
String inference_provenance, String inference_class_id, String inference_class_name) {
DataInfo di = new DataInfo();
di.setInferred(true);
di.setDeletedbyinference(false);
di.setTrust("0.85");
di.setInferenceprovenance(inference_provenance);
di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
return di;
}
public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
Qualifier pa = new Qualifier();
pa.setClassid(inference_class_id);
pa.setClassname(inference_class_name);
pa.setSchemeid(DNET_SCHEMA_ID);
pa.setSchemename(DNET_SCHEMA_NAME);
return pa;
}
public static Relation getRelation(
String source,
String target,
String rel_class,
String rel_type,
String subrel_type,
String inference_provenance,
String inference_class_id,
String inference_class_name) {
Relation r = new Relation();
r.setSource(source);
r.setTarget(target);
r.setRelClass(rel_class);
r.setRelType(rel_type);
r.setSubRelType(subrel_type);
r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name));
return r;
}
/**
 * Builds an SQL fragment of the form {@code  and (<text>v1' OR <text>v2' ...)} to be appended
 * to a WHERE clause.
 *
 * @param text        prefix of every alternative, expected to end with the opening quote
 *                    (e.g. {@code "relclass = '"}); the closing quote is appended here
 * @param constraints allowed values; must contain at least one element
 * @return the fragment, starting with {@code " and ("} and ending with {@code ")"}
 * @throws IllegalArgumentException if {@code constraints} is null or empty, since an empty
 *                                  alternative list cannot be expressed as a valid condition
 */
public static String getConstraintList(String text, List<String> constraints) {
    if (constraints == null || constraints.isEmpty()) {
        throw new IllegalArgumentException(
            "at least one constraint value is required to build the condition on: " + text);
    }
    final StringBuilder ret = new StringBuilder(" and (");
    for (int i = 0; i < constraints.size(); i++) {
        if (i > 0) {
            ret.append(" OR ");
        }
        ret.append(text).append(constraints.get(i)).append("'");
    }
    return ret.append(")").toString();
}
/**
 * Removes the given path through {@code HdfsSupport.remove}, using the Hadoop configuration
 * of the session's Spark context.
 *
 * @param spark the active session
 * @param path  the path to remove
 */
public static void removeOutputDir(SparkSession spark, String path) {
    final org.apache.hadoop.conf.Configuration hadoopConf = spark.sparkContext().hadoopConfiguration();
    HdfsSupport.remove(path, hadoopConf);
}
/**
 * Reads the "isSparkSessionManaged" argument.
 *
 * @param parser the parsed job arguments
 * @return the parsed boolean value of the argument, or {@code TRUE} when the argument is absent
 */
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
    final String flag = parser.get("isSparkSessionManaged");
    if (flag == null) {
        return Boolean.TRUE;
    }
    return Boolean.valueOf(flag);
}
/**
 * Reads the "isTest" argument.
 *
 * @param parser the parsed job arguments
 * @return the parsed boolean value of the argument, or {@code FALSE} when the argument is absent
 */
public static Boolean isTest(ArgumentApplicationParser parser) {
    final String flag = parser.get("isTest");
    if (flag == null) {
        return Boolean.FALSE;
    }
    return Boolean.valueOf(flag);
}
/**
 * Runs {@code cfHbforResultQuery} and registers its outcome as the temporary view "cfhb".
 * The query reads from a table/view named "result", which must already be visible in the session.
 *
 * @param spark the active session
 */
public static void createCfHbforResult(SparkSession spark) {
    spark.sql(cfHbforResultQuery).createOrReplaceTempView("cfhb");
}
/**
 * Reads a text file where every line is a JSON document and maps each line to a bean
 * of the requested class through {@code OBJECT_MAPPER}.
 *
 * @param spark     the active session
 * @param inputPath location of the text file(s)
 * @param clazz     bean class each line is deserialized into
 * @param <R>       type of the returned records
 * @return a dataset of {@code R} encoded with the bean encoder of {@code clazz}
 */
public static <R> Dataset<R> readPath(
    SparkSession spark, String inputPath, Class<R> clazz) {
    final MapFunction<String, R> parseLine = line -> OBJECT_MAPPER.readValue(line, clazz);
    final Dataset<String> lines = spark.read().textFile(inputPath);
    return lines.map(parseLine, Encoders.bean(clazz));
}
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.countrypropagation;
import java.io.Serializable;
/**
 * Serializable bean holding the class id and class name of a country.
 */
public class CountrySbs implements Serializable {

    private String classid;
    private String classname;

    /** @return the country class id */
    public String getClassid() {
        return this.classid;
    }

    /** @return the country class name */
    public String getClassname() {
        return this.classname;
    }

    /** @param classid the country class id */
    public void setClassid(String classid) {
        this.classid = classid;
    }

    /** @param classname the country class name */
    public void setClassname(String classname) {
        this.classname = classname;
    }
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.countrypropagation;
import java.io.Serializable;
/**
 * Serializable bean pairing a datasource identifier with a {@link CountrySbs}.
 */
public class DatasourceCountry implements Serializable {

    private String dataSourceId;
    private CountrySbs country;

    /** @return the datasource identifier */
    public String getDataSourceId() {
        return this.dataSourceId;
    }

    /** @return the country associated to the datasource */
    public CountrySbs getCountry() {
        return this.country;
    }

    /** @param dataSourceId the datasource identifier */
    public void setDataSourceId(String dataSourceId) {
        this.dataSourceId = dataSourceId;
    }

    /** @param country the country associated to the datasource */
    public void setCountry(CountrySbs country) {
        this.country = country;
    }
}

View File

@ -0,0 +1,121 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
/**
 * Prepares the association between datasources and countries.
 * <p>
 * The association is computed only for datasources that are not deleted by inference and that
 * either have one of the allowed types or have a whitelisted id. The country is registered in the
 * Organization associated to the Datasource, so the relation linking Datasource and Organization
 * is exploited to get the country for the datasource.
 */
public class PrepareDatasourceCountryAssociation {

    private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class);

    /**
     * Job entry point: parses the arguments described in input_prepareassoc_parameters.json,
     * removes the output directory and writes the datasource/country associations.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing or job execution failures
     */
    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareDatasourceCountryAssociation.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                removeOutputDir(spark, outputPath);
                prepareDatasourceCountryAssociation(
                    spark,
                    Arrays.asList(parser.get("whitelist").split(";")),
                    Arrays.asList(parser.get("allowedtypes").split(";")),
                    inputPath,
                    outputPath);
            });
    }

    /**
     * Selects the datasources to consider, joins them with their organizations through the
     * datasource/organization relation and stores one {@link DatasourceCountry} per match.
     *
     * @param spark        the active session
     * @param whitelist    ids of the datasources to consider regardless of their type
     * @param allowedtypes datasource type class ids to consider
     * @param inputPath    graph root; the datasource, relation and organization folders are read below it
     * @param outputPath   destination of the gzip-compressed JSON output
     */
    private static void prepareDatasourceCountryAssociation(
        SparkSession spark,
        List<String> whitelist,
        List<String> allowedtypes,
        String inputPath,
        String outputPath) {

        // a datasource is selected when it is whitelisted OR has an allowed type;
        // the deletedbyinference check is applied to both cases
        StringBuilder selection = new StringBuilder();
        appendAnyOf(selection, "id", whitelist);
        appendAnyOf(selection, "datasourcetype.classid", allowedtypes);
        if (selection.length() == 0) {
            throw new IllegalArgumentException(
                "at least one whitelisted datasource id or one allowed datasource type is required");
        }

        Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
        Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
        Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);

        datasource.createOrReplaceTempView("datasource");
        relation.createOrReplaceTempView("relation");
        organization.createOrReplaceTempView("organization");

        String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
            + "FROM ( SELECT id "
            + " FROM datasource "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND (" + selection + ") "
            + ") d "
            + "JOIN ( SELECT source, target "
            + " FROM relation "
            + " WHERE relclass = '"
            + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
            + "' "
            + " AND datainfo.deletedbyinference = false ) rel "
            + "ON d.id = rel.source "
            + "JOIN (SELECT id, country "
            + " FROM organization "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND length(country.classid) > 0) o "
            + "ON o.id = rel.target";

        spark
            .sql(query)
            .as(Encoders.bean(DatasourceCountry.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputPath);
    }

    /**
     * Appends {@code column = 'v1' OR column = 'v2' ...} to the given buffer, preceded by
     * {@code " OR "} when the buffer already holds other alternatives.
     */
    private static void appendAnyOf(StringBuilder target, String column, List<String> values) {
        for (String value : values) {
            if (target.length() > 0) {
                target.append(" OR ");
            }
            target.append(column).append(" = '").append(value).append("'");
        }
    }
}

View File

@ -0,0 +1,98 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
/**
 * Computes, for the results of a given type, the set of countries reachable through the
 * datasources the result is collected from or hosted by, and appends it to the output path.
 */
public class PrepareResultCountrySet {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);

    // joins the (result id, collectedfrom, hostedby) view "cfhb" with the datasource/country
    // associations on both the collectedfrom and the hostedby side, grouping by result id
    private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
        + "FROM ( SELECT id, country "
        + "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
        + "UNION ALL "
        + "SELECT id, country FROM datasource_country "
        + "JOIN cfhb ON hb = dataSourceId ) tmp "
        + "GROUP BY id";

    /**
     * Job entry point: parses the arguments described in input_prepareresultcountry_parameters.json
     * and runs the preparation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing, class loading or job execution failures
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    PrepareResultCountrySet.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json")));
        parser.parseArgument(args);

        final Boolean managed = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", managed);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String datasourcecountrypath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", datasourcecountrypath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            managed,
            spark -> getPotentialResultToUpdate(
                spark, inputPath, outputPath, datasourcecountrypath, resultClazz));
    }

    /**
     * Registers the temporary views needed by {@code RESULT_COUNTRYSET_QUERY}, executes it and
     * appends the outcome, as gzip-compressed JSON, to the output path.
     *
     * @param spark                 the active session
     * @param inputPath             location of the results of the given type
     * @param outputPath            destination the result/country sets are appended to
     * @param datasourcecountrypath location of the datasource/country associations
     * @param resultClazz           bean class of the results to read
     * @param <R>                   concrete result type
     */
    private static <R extends Result> void getPotentialResultToUpdate(
        SparkSession spark,
        String inputPath,
        String outputPath,
        String datasourcecountrypath,
        Class<R> resultClazz) {

        readPath(spark, inputPath, resultClazz).createOrReplaceTempView("result");

        // builds the "cfhb" view out of the "result" view registered above
        createCfHbforResult(spark);

        readPath(spark, datasourcecountrypath, DatasourceCountry.class)
            .createOrReplaceTempView("datasource_country");

        final Dataset<ResultCountrySet> countrySets = spark
            .sql(RESULT_COUNTRYSET_QUERY)
            .as(Encoders.bean(ResultCountrySet.class));

        countrySets
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Append)
            .json(outputPath);
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.countrypropagation;
import java.io.Serializable;
import java.util.ArrayList;
/**
 * Serializable bean pairing a result identifier with the countries collected for it.
 */
public class ResultCountrySet implements Serializable {

    private String resultId;
    private ArrayList<CountrySbs> countrySet;

    /** @return the result identifier */
    public String getResultId() {
        return this.resultId;
    }

    /** @return the countries collected for the result */
    public ArrayList<CountrySbs> getCountrySet() {
        return this.countrySet;
    }

    /** @param resultId the result identifier */
    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    /** @param countrySet the countries collected for the result */
    public void setCountrySet(ArrayList<CountrySbs> countrySet) {
        this.countrySet = countrySet;
    }
}

View File

@ -0,0 +1,132 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
/**
 * Adds to each result the countries prepared for it (see {@link ResultCountrySet}) that the
 * result does not already have, and writes the enriched results as gzip-compressed JSON.
 */
public class SparkCountryPropagationJob {

    private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class);

    /**
     * Job entry point: parses the arguments described in input_countrypropagation_parameters.json
     * and runs the propagation.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing, class loading or job execution failures
     */
    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkCountryPropagationJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String sourcePath = parser.get("sourcePath");
        log.info("sourcePath: {}", sourcePath);

        String preparedInfoPath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", preparedInfoPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        // saveGraph defaults to true when the argument is not provided
        final Boolean saveGraph = Optional
            .ofNullable(parser.get("saveGraph"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("saveGraph: {}", saveGraph);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> execPropagation(
                spark,
                sourcePath,
                preparedInfoPath,
                outputPath,
                resultClazz,
                saveGraph));
    }

    /**
     * Left-joins the results with the prepared country sets and overwrites the output path with
     * the enriched results. Nothing is read or written when {@code saveGraph} is false.
     *
     * @param spark            the active session
     * @param sourcePath       location of the results of the given type
     * @param preparedInfoPath location of the prepared {@link ResultCountrySet} records
     * @param outputPath       destination of the enriched results
     * @param resultClazz      bean class of the results to read
     * @param saveGraph        whether the enriched results have to be produced
     * @param <R>              concrete result type
     */
    private static <R extends Result> void execPropagation(
        SparkSession spark,
        String sourcePath,
        String preparedInfoPath,
        String outputPath,
        Class<R> resultClazz,
        boolean saveGraph) {

        if (saveGraph) {
            log.info("Reading Graph table from: {}", sourcePath);
            Dataset<R> res = readPath(spark, sourcePath, resultClazz);

            log.info("Reading prepared info: {}", preparedInfoPath);
            Dataset<ResultCountrySet> prepared = spark
                .read()
                .json(preparedInfoPath)
                .as(Encoders.bean(ResultCountrySet.class));

            res
                .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
                .map(getCountryMergeFn(), Encoders.bean(resultClazz))
                .write()
                .option("compression", "gzip")
                .mode(SaveMode.Overwrite)
                .json(outputPath);
        }
    }

    /**
     * Returns the function applied to each joined pair: when a country set is available for the
     * result, the missing countries are added to it; the result is returned in any case.
     * A result without a country list gets a new one instead of failing.
     */
    private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
        return (MapFunction<Tuple2<R, ResultCountrySet>, R>) t -> {
            R result = t._1();
            ResultCountrySet candidates = t._2();
            // the left outer join yields null on the right side for results without prepared countries
            if (candidates != null) {
                List<Country> toAdd = merge(result.getCountry(), candidates.getCountrySet());
                if (result.getCountry() == null) {
                    result.setCountry(toAdd);
                } else {
                    result.getCountry().addAll(toAdd);
                }
            }
            return result;
        };
    }

    /**
     * Computes the countries of {@code c2} whose class id is not already present in {@code c1}.
     *
     * @param c1 countries already associated to the result, possibly null
     * @param c2 candidate countries, possibly null
     * @return the new countries to add, one per distinct class id, in the order of {@code c2}
     */
    private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
        Set<String> known = new HashSet<>();
        if (c1 != null) {
            for (Country existing : c1) {
                known.add(existing.getClassid());
            }
        }

        List<Country> added = new ArrayList<>();
        if (c2 != null) {
            for (CountrySbs candidate : c2) {
                // Set.add returns false for class ids already seen, which also filters out
                // repeated class ids within the candidates
                if (known.add(candidate.getClassid())) {
                    added.add(getCountry(candidate.getClassid(), candidate.getClassname()));
                }
            }
        }
        return added;
    }
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
/**
 * Bean describing an author carrying an ORCID: name, surname, full name and the ORCID itself.
 * <p>
 * It is serializable, like the other beans of the propagation jobs, because instances are held
 * by the serializable {@code ResultOrcidList} and travel with it.
 */
public class AutoritativeAuthor implements java.io.Serializable {

    private String name;
    private String surname;
    private String fullname;
    private String orcid;

    /** @return the author name */
    public String getName() {
        return name;
    }

    /** @param name the author name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the author surname */
    public String getSurname() {
        return surname;
    }

    /** @param surname the author surname */
    public void setSurname(String surname) {
        this.surname = surname;
    }

    /** @return the author full name */
    public String getFullname() {
        return fullname;
    }

    /** @param fullname the author full name */
    public void setFullname(String fullname) {
        this.fullname = fullname;
    }

    /** @return the ORCID of the author */
    public String getOrcid() {
        return orcid;
    }

    /** @param orcid the ORCID of the author */
    public void setOrcid(String orcid) {
        this.orcid = orcid;
    }
}

View File

@ -0,0 +1,125 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
 * First preparation step of the ORCID propagation: for the results of a given type it collects
 * the authors having an ORCID and associates them to the results reachable through the allowed
 * semantic relations.
 */
public class PrepareResultOrcidAssociationStep1 {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);

    /**
     * Job entry point: parses the arguments described in input_prepareorcidtoresult_parameters.json
     * and runs the preparation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing, class loading or job execution failures
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    PrepareResultOrcidAssociationStep1.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json")));
        parser.parseArgument(args);

        final Boolean managed = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", managed);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
        log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

        // the lower-cased simple class name doubles as the folder name of the result type
        final int lastDot = resultClassName.lastIndexOf(".");
        final String resultType = resultClassName.substring(lastDot + 1).toLowerCase();
        log.info("resultType: {}", resultType);

        final Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        final String inputRelationPath = inputPath + "/relation";
        log.info("inputRelationPath: {}", inputRelationPath);

        final String inputResultPath = inputPath + "/" + resultType;
        log.info("inputResultPath: {}", inputResultPath);

        final String outputResultPath = outputPath + "/" + resultType;
        log.info("outputResultPath: {}", outputResultPath);

        runWithSparkHiveSession(
            conf,
            managed,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                prepareInfo(
                    spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
            });
    }

    /**
     * Registers the relation and result views, runs the association query and overwrites the
     * output path with one {@link ResultOrcidList} per row, as gzip-compressed JSON.
     *
     * @param spark             the active session
     * @param inputRelationPath location of the relations
     * @param inputResultPath   location of the results of the given type
     * @param outputResultPath  destination of the associations
     * @param resultClazz       bean class of the results to read
     * @param allowedsemrel     relation classes the ORCIDs are propagated through
     * @param <R>               concrete result type
     */
    private static <R extends Result> void prepareInfo(
        SparkSession spark,
        String inputRelationPath,
        String inputResultPath,
        String outputResultPath,
        Class<R> resultClazz,
        List<String> allowedsemrel) {

        readPath(spark, inputRelationPath, Relation.class).createOrReplaceTempView("relation");

        log.info("Reading Graph table from: {}", inputResultPath);
        readPath(spark, inputResultPath, resultClazz).createOrReplaceTempView("result");

        // r_t: per result, the set of its authors having an ORCID pid
        // rel_rel: the non deleted relations of the allowed classes
        final String associationQuery = " select target resultId, author authorList"
            + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
            + " from ( "
            + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
            + " from result "
            + " lateral view explode (author) a as MyT "
            + " lateral view explode (MyT.pid) p as MyP "
            + " where MyP.qualifier.classid = 'ORCID') tmp "
            + " group by id) r_t "
            + " join ("
            + " select source, target "
            + " from relation "
            + " where datainfo.deletedbyinference = false "
            + getConstraintList(" relclass = '", allowedsemrel)
            + ") rel_rel "
            + " on source = id";

        final Dataset<ResultOrcidList> associations = spark
            .sql(associationQuery)
            .as(Encoders.bean(ResultOrcidList.class));

        associations
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputResultPath);
    }
}

View File

@ -0,0 +1,97 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import scala.Tuple2;
/**
 * Second preparation step of the ORCID propagation: merges the associations produced for the
 * four result types so that each result id appears once, with the union of its authors.
 */
public class PrepareResultOrcidAssociationStep2 {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class);
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Job entry point: parses the arguments described in input_prepareorcidtoresult_parameters2.json
     * and runs the merge.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing or job execution failures
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    PrepareResultOrcidAssociationStep2.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json")));
        parser.parseArgument(args);

        final Boolean managed = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", managed);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            managed,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                mergeInfo(spark, inputPath, outputPath);
            });
    }

    /**
     * Reads the associations of the publication, dataset, otherresearchproduct and software
     * folders, merges those sharing the same result id and saves them as gzipped text,
     * one JSON document per line.
     *
     * @param spark      the active session
     * @param inputPath  folder containing the per-type associations
     * @param outputPath destination of the merged associations
     */
    private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {
        Dataset<ResultOrcidList> resultOrcidAssoc = readPath(spark, inputPath + "/publication", ResultOrcidList.class);
        for (String resultType : new String[] {
            "/dataset", "/otherresearchproduct", "/software"
        }) {
            resultOrcidAssoc = resultOrcidAssoc.union(readPath(spark, inputPath + resultType, ResultOrcidList.class));
        }

        resultOrcidAssoc
            .toJavaRDD()
            .mapToPair(assoc -> new Tuple2<>(assoc.getResultId(), assoc))
            .reduceByKey((left, right) -> mergeAuthors(left, right))
            .map(entry -> entry._2())
            .map(assoc -> OBJECT_MAPPER.writeValueAsString(assoc))
            .saveAsTextFile(outputPath, GzipCodec.class);
    }

    /**
     * Merges two associations of the same result: the authors of {@code right} whose ORCID is
     * not yet known are appended to the author list of {@code left}, which is returned.
     * A null argument yields the other one.
     */
    private static ResultOrcidList mergeAuthors(ResultOrcidList left, ResultOrcidList right) {
        if (left == null) {
            return right;
        }
        if (right == null) {
            return left;
        }

        final Set<String> knownOrcids = new HashSet<>();
        for (AutoritativeAuthor known : left.getAuthorList()) {
            knownOrcids.add(known.getOrcid());
        }
        for (AutoritativeAuthor candidate : right.getAuthorList()) {
            if (!knownOrcids.contains(candidate.getOrcid())) {
                left.getAuthorList().add(candidate);
                knownOrcids.add(candidate.getOrcid());
            }
        }
        return left;
    }
}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * Serializable bean pairing a result identifier with the authors (carrying an ORCID)
 * that can be used to enrich it. The author list starts empty.
 */
public class ResultOrcidList implements Serializable {

    String resultId;
    List<AutoritativeAuthor> authorList = new ArrayList<>();

    /** @return the result identifier */
    public String getResultId() {
        return this.resultId;
    }

    /** @return the authors associated to the result */
    public List<AutoritativeAuthor> getAuthorList() {
        return this.authorList;
    }

    /** @param resultId the result identifier */
    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    /** @param authorList the authors associated to the result */
    public void setAuthorList(List<AutoritativeAuthor> authorList) {
        this.authorList = authorList;
    }
}

View File

@ -0,0 +1,199 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;
/**
 * Enriches the authors of the results with the ORCIDs found in the prepared
 * {@link ResultOrcidList} records: an author without an ORCID pid receives the ORCID of the
 * first listed author matching on surname and name.
 */
public class SparkOrcidToResultFromSemRelJob {

    private static final Logger log = LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob.class);

    /**
     * Job entry point: parses the arguments described in input_orcidtoresult_parameters.json
     * and runs the propagation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing, class loading or job execution failures
     */
    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkOrcidToResultFromSemRelJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String possibleUpdates = parser.get("possibleUpdatesPath");
        log.info("possibleUpdatesPath: {}", possibleUpdates);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        // saveGraph defaults to true when the argument is not provided
        final Boolean saveGraph = Optional
            .ofNullable(parser.get("saveGraph"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("saveGraph: {}", saveGraph);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                if (saveGraph) {
                    execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
                }
            });
    }

    /**
     * Left-joins the results with the possible updates and overwrites the output path with the
     * enriched results, as gzip-compressed JSON.
     *
     * @param spark               the active session
     * @param possibleUpdatesPath location of the prepared {@link ResultOrcidList} records
     * @param inputPath           location of the results of the given type
     * @param outputPath          destination of the enriched results
     * @param resultClazz         bean class of the results to read
     * @param <R>                 concrete result type
     */
    private static <R extends Result> void execPropagation(
        SparkSession spark,
        String possibleUpdatesPath,
        String inputPath,
        String outputPath,
        Class<R> resultClazz) {

        // read possible updates (resultId and list of possible orcid to add)
        Dataset<ResultOrcidList> possible_updates = readPath(spark, possibleUpdatesPath, ResultOrcidList.class);
        // read the result we have been considering
        Dataset<R> result = readPath(spark, inputPath, resultClazz);

        // make join result left_outer with possible updates
        result
            .joinWith(
                possible_updates,
                result.col("id").equalTo(possible_updates.col("resultId")),
                "left_outer")
            .map(authorEnrichFn(), Encoders.bean(resultClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Returns the function applied to each joined pair: when updates are available and the result
     * has authors, every author without an ORCID pid is enriched; the result is returned in any case.
     */
    private static <R extends Result> MapFunction<Tuple2<R, ResultOrcidList>, R> authorEnrichFn() {
        return (MapFunction<Tuple2<R, ResultOrcidList>, R>) value -> {
            R ret = value._1();
            // the left outer join yields null on the right side for results without updates
            ResultOrcidList updates = value._2();
            List<Author> toenrich_author = ret.getAuthor();
            if (updates != null && toenrich_author != null) {
                List<AutoritativeAuthor> autoritativeAuthors = updates.getAuthorList();
                for (Author author : toenrich_author) {
                    if (!containsAllowedPid(author)) {
                        enrichAuthor(author, autoritativeAuthors);
                    }
                }
            }
            return ret;
        };
    }

    /**
     * Enriches the author with the ORCID of the first matching authoritative author, if any.
     */
    private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
        for (AutoritativeAuthor aa : au) {
            if (enrichAuthor(aa, a)) {
                return;
            }
        }
    }

    /**
     * Adds the ORCID of the authoritative author to the pids of the author when the two match
     * (see {@link #isSameAuthor(AutoritativeAuthor, Author)}).
     *
     * @return true if the pid has been added
     */
    private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
        if (!isSameAuthor(autoritative_author, author)) {
            return false;
        }

        StructuredProperty p = new StructuredProperty();
        p.setValue(autoritative_author.getOrcid());
        p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID));
        p
            .setDataInfo(
                getDataInfo(
                    PROPAGATION_DATA_INFO_TYPE,
                    PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID,
                    PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME));

        if (author.getPid() == null) {
            author.setPid(Lists.newArrayList(p));
        } else {
            author.getPid().add(p);
        }
        return true;
    }

    /**
     * Tells whether the two authors can be considered the same person: the trimmed surnames must
     * be equal ignoring case and the trimmed names must either be equal ignoring case or share
     * the first character ignoring case (the name could be written with only the initial in one
     * of the two). Missing or blank surnames and names never match.
     */
    private static boolean isSameAuthor(AutoritativeAuthor autoritative_author, Author author) {
        String authoritativeSurname = StringUtils.trimToEmpty(autoritative_author.getSurname());
        String surname = StringUtils.trimToEmpty(author.getSurname());
        if (authoritativeSurname.isEmpty() || !authoritativeSurname.equalsIgnoreCase(surname)) {
            return false;
        }

        String authoritativeName = StringUtils.trimToEmpty(autoritative_author.getName());
        String name = StringUtils.trimToEmpty(author.getName());
        if (authoritativeName.isEmpty() || name.isEmpty()) {
            return false;
        }

        // substring(0, 1) is the initial; both strings are known to be non empty here
        return authoritativeName.equalsIgnoreCase(name)
            || authoritativeName.substring(0, 1).equalsIgnoreCase(name.substring(0, 1));
    }

    /**
     * Tells whether the author already has a pid whose qualifier class id is
     * {@code PROPAGATION_AUTHOR_PID}.
     */
    private static boolean containsAllowedPid(Author a) {
        List<StructuredProperty> pids = a.getPid();
        if (pids == null) {
            return false;
        }
        for (StructuredProperty pid : pids) {
            if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) {
                return true;
            }
        }
        return false;
    }
}

View File

@ -0,0 +1,126 @@
package eu.dnetlib.dhp.projecttoresult;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.PropagationConstant.getConstraintList;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Prepares the information needed to propagate project links to results through semantic
 * relations: the potential new (result, projects) associations and the associations already
 * present in the graph.
 */
public class PrepareProjectResultsAssociation {

    private static final Logger log = LoggerFactory.getLogger(PrepareProjectResultsAssociation.class);

    /**
     * Job entry point: parses the arguments described in input_prepareprojecttoresult_parameters.json
     * and runs the preparation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments of the job
     * @throws Exception on argument parsing or job execution failures
     */
    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareProjectResultsAssociation.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String potentialUpdatePath = parser.get("potentialUpdatePath");
        log.info("potentialUpdatePath: {}", potentialUpdatePath);

        String alreadyLinkedPath = parser.get("alreadyLinkedPath");
        log.info("alreadyLinkedPath: {}", alreadyLinkedPath);

        final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
        log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                prepareResultProjProjectResults(
                    spark,
                    inputPath,
                    potentialUpdatePath,
                    alreadyLinkedPath,
                    allowedsemrel);
            });
    }

    /**
     * Writes two sets of {@link ResultProjectSet} records, both as gzip-compressed JSON:
     * the projects each result could be linked to by following the allowed semantic relations,
     * and the projects each result is already linked to.
     *
     * @param spark               the active session
     * @param inputPath           location of the relations
     * @param potentialUpdatePath destination of the potential new associations
     * @param alreadyLinkedPath   destination of the existing associations
     * @param allowedsemrel       relation classes the project links are propagated through
     */
    private static void prepareResultProjProjectResults(
        SparkSession spark,
        String inputPath,
        String potentialUpdatePath,
        String alreadyLinkedPath,
        List<String> allowedsemrel) {

        Dataset<Relation> relation = readPath(spark, inputPath, Relation.class);
        relation.createOrReplaceTempView("relation");

        // non deleted relations of class RELATION_RESULT_PROJECT_REL_CLASS
        String resproj_relation_query = "SELECT source, target "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND relClass = '"
            + RELATION_RESULT_PROJECT_REL_CLASS
            + "'";

        Dataset<Row> resproj_relation = spark.sql(resproj_relation_query);
        resproj_relation.createOrReplaceTempView("resproj_relation");

        // the target of an allowed semantic relation inherits the projects of the relation source
        String potential_update_query = "SELECT resultId, collect_set(projectId) projectSet "
            + "FROM ( "
            + "SELECT r1.target resultId, r2.target projectId "
            + " FROM (SELECT source, target "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + getConstraintList(" relClass = '", allowedsemrel)
            + " ) r1"
            + " JOIN resproj_relation r2 "
            + " ON r1.source = r2.source "
            + " ) tmp "
            + "GROUP BY resultId ";

        spark
            .sql(potential_update_query)
            .as(Encoders.bean(ResultProjectSet.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(potentialUpdatePath);

        String result_projectset_query = "SELECT source resultId, collect_set(target) projectSet "
            + "FROM resproj_relation "
            + "GROUP BY source";

        spark
            .sql(result_projectset_query)
            .as(Encoders.bean(ResultProjectSet.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(alreadyLinkedPath);
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.projecttoresult;
import java.io.Serializable;
import java.util.ArrayList;
/**
 * Serializable bean pairing a result identifier with a set of project identifiers.
 */
public class ResultProjectSet implements Serializable {

    private String resultId;
    private ArrayList<String> projectSet;

    /** @return the result identifier */
    public String getResultId() {
        return this.resultId;
    }

    /** @return the project identifiers associated to the result */
    public ArrayList<String> getProjectSet() {
        return this.projectSet;
    }

    /** @param resultId the result identifier */
    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    /** @param project the project identifiers associated to the result */
    public void setProjectSet(ArrayList<String> project) {
        this.projectSet = project;
    }
}

View File

@ -0,0 +1,147 @@
package eu.dnetlib.dhp.projecttoresult;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
 * Spark job that turns the prepared "potential update" result-project associations into new
 * relations, skipping the projects a result is already linked to.
 *
 * <p>It reads two datasets of {@link ResultProjectSet} (the potential updates and the already
 * existing links), left-outer joins them on {@code resultId} and, for every project that is not
 * already linked, emits one relation in each direction.
 */
public class SparkResultToProjectThroughSemRelJob {

    // Bound to this class so that log lines are attributed to the job that actually emits them.
    private static final Logger log = LoggerFactory.getLogger(SparkResultToProjectThroughSemRelJob.class);

    /**
     * Parses the job arguments (declared in {@code input_projecttoresult_parameters.json}) and
     * runs the propagation inside a Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameter specification cannot be read or parsed, or the job fails
     */
    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                SparkResultToProjectThroughSemRelJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String potentialUpdatePath = parser.get("potentialUpdatePath");
        log.info("potentialUpdatePath: {}", potentialUpdatePath);

        final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
        log.info("alreadyLinkedPath: {}", alreadyLinkedPath);

        // Boolean.valueOf(null) is false: when the parameter is absent nothing is written.
        final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
        log.info("saveGraph: {}", saveGraph);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                execPropagation(
                    spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
            });
    }

    /**
     * Joins the potential updates with the already linked associations and appends the resulting
     * relations, as gzip-compressed JSON, to {@code outputPath}. Does nothing unless
     * {@code saveGraph} is true.
     *
     * @param spark the active Spark session
     * @param outputPath where the new relations are appended
     * @param alreadyLinkedPath location of the existing result-project associations
     * @param potentialUpdatePath location of the candidate result-project associations
     * @param saveGraph whether the relations have to be written
     */
    private static void execPropagation(
        SparkSession spark,
        String outputPath,
        String alreadyLinkedPath,
        String potentialUpdatePath,
        Boolean saveGraph) {

        Dataset<ResultProjectSet> toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
        Dataset<ResultProjectSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);

        if (saveGraph) {
            toaddrelations
                .joinWith(
                    alreadyLinked,
                    toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")),
                    "left_outer")
                .flatMap(mapRelationRn(), Encoders.bean(Relation.class))
                .write()
                .mode(SaveMode.Append)
                .option("compression", "gzip")
                .json(outputPath);
        }
    }

    /**
     * Builds the function mapping a joined pair (potential update, already linked) to the new
     * relations to create.
     *
     * <p>The right element of the pair is {@code null} when the result has no existing project
     * link (left outer join). Projects already linked are removed from the candidates; for each
     * remaining project two relations are produced, one from the result to the project and one
     * from the project to the result.
     *
     * @return the flat-map function producing the new relations
     */
    private static FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation> mapRelationRn() {
        return (FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation>) value -> {
            final ResultProjectSet potentialUpdate = value._1();
            final ResultProjectSet alreadyLinked = value._2();

            if (alreadyLinked != null) {
                // Drop every candidate project the result is already associated with.
                potentialUpdate.getProjectSet().removeAll(alreadyLinked.getProjectSet());
            }

            final String resId = potentialUpdate.getResultId();
            final List<Relation> newRelations = new ArrayList<>();

            for (String projectId : potentialUpdate.getProjectSet()) {
                newRelations
                    .add(
                        getRelation(
                            resId,
                            projectId,
                            RELATION_RESULT_PROJECT_REL_CLASS,
                            RELATION_RESULTPROJECT_REL_TYPE,
                            RELATION_RESULTPROJECT_SUBREL_TYPE,
                            PROPAGATION_DATA_INFO_TYPE,
                            PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
                            PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
                newRelations
                    .add(
                        getRelation(
                            projectId,
                            resId,
                            RELATION_PROJECT_RESULT_REL_CLASS,
                            RELATION_RESULTPROJECT_REL_TYPE,
                            RELATION_RESULTPROJECT_SUBREL_TYPE,
                            PROPAGATION_DATA_INFO_TYPE,
                            PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
                            PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
            }
            return newRelations.iterator();
        };
    }
}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
 * Map from a string key to a list of strings whose {@link #get(String)} never returns
 * {@code null}.
 */
public class OrganizationMap extends HashMap<String, List<String>> {

    /** Creates an empty map. */
    public OrganizationMap() {
        super();
    }

    /**
     * Looks up the list stored under {@code key}.
     *
     * @param key the key to look up
     * @return the stored list, or a new empty list (not stored in the map) when the key is
     *         absent or mapped to {@code null}
     */
    public List<String> get(String key) {
        final List<String> stored = super.get(key);
        return stored != null ? stored : new ArrayList<>();
    }
}

View File

@ -0,0 +1,130 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Preparation step computing, for each result, the list of communities it can be associated
 * with through the organizations it is related to.
 *
 * <p>The organization-to-communities association is received as a JSON parameter and
 * deserialized into an {@link OrganizationMap}.
 */
public class PrepareResultCommunitySet {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class);

    /**
     * Reads the job parameters and runs the preparation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameters cannot be read or parsed, or the job fails
     */
    public static void main(String[] args) throws Exception {
        final String parameterSpec = IOUtils
            .toString(
                PrepareResultCommunitySet.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final Gson gson = new Gson();
        final OrganizationMap organizationMap = gson
            .fromJson(parser.get("organizationtoresultcommunitymap"), OrganizationMap.class);
        log.info("organizationMap: {}", gson.toJson(organizationMap));

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                prepareInfo(spark, inputPath, outputPath, organizationMap);
            });
    }

    /**
     * Selects, from the relations under {@code inputPath}, each (result, organization) pair
     * together with the set of organizations collected for that organization, maps every row to
     * the communities found in {@code organizationMap} and writes the non-null outcomes as
     * gzip-compressed JSON, overwriting {@code outputPath}.
     */
    private static void prepareInfo(
        SparkSession spark,
        String inputPath,
        String outputPath,
        OrganizationMap organizationMap) {

        readPath(spark, inputPath, Relation.class).createOrReplaceTempView("relation");

        // Sub-query: result -> organization relations not deleted by inference.
        final String resultOrganization = "(SELECT source, target "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND relClass = '"
            + RELATION_RESULT_ORGANIZATION_REL_CLASS
            + "') result_organization ";

        // Sub-query: for each source, the set of targets of the second relation class.
        final String organizationOrganization = "(SELECT source, collect_set(target) org_set "
            + " FROM relation "
            + " WHERE datainfo.deletedbyinference = false "
            + " AND relClass = '"
            + RELATION_REPRESENTATIVERESULT_RESULT_CLASS
            + "' "
            + " GROUP BY source) organization_organization ";

        final String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges "
            + "FROM "
            + resultOrganization
            + "LEFT JOIN "
            + organizationOrganization
            + "ON result_organization.target = organization_organization.source ";

        final Dataset<ResultOrganizations> resultOrganizations = spark
            .sql(query)
            .as(Encoders.bean(ResultOrganizations.class));

        resultOrganizations
            .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
            .filter(Objects::nonNull)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Builds the function that collects the communities associated, in {@code organizationMap},
     * with the organization of a row and with each of its merged organizations.
     *
     * @return a function yielding the result id with its community list, or {@code null} when
     *         no community was found
     */
    private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
        OrganizationMap organizationMap) {
        return row -> {
            final Set<String> communities = new HashSet<>();

            final String organizationId = row.getOrgId();
            if (organizationMap.containsKey(organizationId)) {
                communities.addAll(organizationMap.get(organizationId));
            }

            // The merged organizations are absent when the left join found no match.
            final List<String> merged = row.getMerges();
            if (merged != null) {
                for (String mergedId : merged) {
                    if (organizationMap.containsKey(mergedId)) {
                        communities.addAll(organizationMap.get(mergedId));
                    }
                }
            }

            if (communities.isEmpty()) {
                return null;
            }

            final ResultCommunityList resultCommunities = new ResultCommunityList();
            resultCommunities.setResultId(row.getResultId());
            resultCommunities.setCommunityList(new ArrayList<>(communities));
            return resultCommunities;
        };
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
import java.io.Serializable;
import java.util.ArrayList;
/**
 * Serializable bean pairing a result identifier with a list of community identifiers.
 *
 * <p>Both properties are {@code null} until explicitly set; no defensive copies are made.
 */
public class ResultCommunityList implements Serializable {

    /** Identifier of the result. */
    private String resultId;

    /** Identifiers of the communities associated with {@link #resultId}. */
    private ArrayList<String> communityList;

    /** @return the result identifier, or {@code null} when it has not been set */
    public String getResultId() {
        return this.resultId;
    }

    /** @param id the result identifier to store */
    public void setResultId(String id) {
        this.resultId = id;
    }

    /** @return the stored list of community identifiers (the same instance passed to the setter) */
    public ArrayList<String> getCommunityList() {
        return this.communityList;
    }

    /** @param communities the list of community identifiers to store, kept by reference */
    public void setCommunityList(ArrayList<String> communities) {
        this.communityList = communities;
    }
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
import java.io.Serializable;
import java.util.ArrayList;
/**
 * Serializable bean holding a result identifier, an organization identifier and a list of
 * identifiers stored under the {@code merges} property.
 *
 * <p>All properties are {@code null} until explicitly set; no defensive copies are made.
 */
public class ResultOrganizations implements Serializable {

    /** Identifier of the result. */
    private String resultId;

    /** Identifier of the organization. */
    private String orgId;

    /** Identifiers stored under the {@code merges} property. */
    private ArrayList<String> merges;

    /** @return the result identifier, or {@code null} when it has not been set */
    public String getResultId() {
        return this.resultId;
    }

    /** @param id the result identifier to store */
    public void setResultId(String id) {
        this.resultId = id;
    }

    /** @return the organization identifier, or {@code null} when it has not been set */
    public String getOrgId() {
        return this.orgId;
    }

    /** @param id the organization identifier to store */
    public void setOrgId(String id) {
        this.orgId = id;
    }

    /** @return the stored {@code merges} list (the same instance passed to the setter) */
    public ArrayList<String> getMerges() {
        return this.merges;
    }

    /** @param mergedIds the list to store under {@code merges}, kept by reference */
    public void setMerges(ArrayList<String> mergedIds) {
        this.merges = mergedIds;
    }
}

View File

@ -0,0 +1,137 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
 * Spark job that enriches results with the community contexts listed for them in the prepared
 * {@link ResultCommunityList} dataset.
 */
public class SparkResultToCommunityFromOrganizationJob {

    private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob.class);

    /**
     * Reads the job parameters and, when {@code saveGraph} is true (the default when the
     * parameter is absent), runs the propagation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameters cannot be read or parsed, the result class cannot be
     *         loaded, or the job fails
     */
    public static void main(String[] args) throws Exception {
        final String parameterSpec = IOUtils
            .toString(
                SparkResultToCommunityFromOrganizationJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String possibleupdatespath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", possibleupdatespath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final String saveGraphParameter = parser.get("saveGraph");
        final Boolean saveGraph = saveGraphParameter == null
            ? Boolean.TRUE
            : Boolean.valueOf(saveGraphParameter);
        log.info("saveGraph: {}", saveGraph);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                if (saveGraph) {
                    execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
                }
            });
    }

    /**
     * Left-outer joins the results read from {@code inputPath} with the possible updates on the
     * result identifier, applies {@link #resultCommunityFn()} to every pair and writes the
     * outcome as gzip-compressed JSON, overwriting {@code outputPath}.
     */
    private static <R extends Result> void execPropagation(
        SparkSession spark,
        String inputPath,
        String outputPath,
        Class<R> resultClazz,
        String possibleUpdatesPath) {

        final Dataset<ResultCommunityList> possibleUpdates = readPath(
            spark, possibleUpdatesPath, ResultCommunityList.class);
        final Dataset<R> result = readPath(spark, inputPath, resultClazz);

        final Dataset<Tuple2<R, ResultCommunityList>> joined = result
            .joinWith(
                possibleUpdates,
                result.col("id").equalTo(possibleUpdates.col("resultId")),
                "left_outer");

        joined
            .map(resultCommunityFn(), Encoders.bean(resultClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Builds the function that adds to a result one new context for every community of the
     * paired {@link ResultCommunityList} whose id is not already among the result contexts.
     *
     * <p>Results without a paired community list (no join match) are returned untouched. The new
     * contexts are attached by merging a temporary {@link Result} carrying them into the
     * original one.
     */
    private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> resultCommunityFn() {
        return pair -> {
            final R enriched = pair._1();
            final ResultCommunityList update = pair._2();
            if (update == null) {
                return enriched;
            }

            final Set<String> existingContextIds = enriched
                .getContext()
                .stream()
                .map(Context::getId)
                .collect(Collectors.toSet());

            final List<Context> propagatedContexts = update
                .getCommunityList()
                .stream()
                .filter(communityId -> !existingContextIds.contains(communityId))
                .map(communityId -> {
                    final Context propagated = new Context();
                    propagated.setId(communityId);
                    propagated
                        .setDataInfo(
                            Arrays
                                .asList(
                                    getDataInfo(
                                        PROPAGATION_DATA_INFO_TYPE,
                                        PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID,
                                        PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME)));
                    return propagated;
                })
                .collect(Collectors.toList());

            final Result carrier = new Result();
            carrier.setId(enriched.getId());
            carrier.setContext(propagatedContexts);
            enriched.mergeFrom(carrier);
            return enriched;
        };
    }
}

View File

@ -0,0 +1,167 @@
package eu.dnetlib.dhp.resulttocommunityfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * First preparation step of the community propagation through semantic relations: for one
 * result type it computes, for each target of an allowed relation, the list of communities it
 * could inherit from the sources of those relations.
 */
public class PrepareResultCommunitySetStep1 {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class);

    /**
     * XQuery returning the ids of the contexts of type {@code community} or {@code ri} whose
     * {@code status} parameter is different from {@code hidden}.
     */
    private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')"
        + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']"
        + " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'"
        + " return $x//CONFIGURATION/context/@id/string()";

    /**
     * Associates to each result the set of community contexts it belongs to, then associates to
     * each target of a relation with allowed semantics the set of community contexts it could
     * possibly inherit from the source of the relation.
     *
     * <p>The two {@code %s} placeholders receive, in order, the constraint on the context ids
     * and the constraint on the relation classes.
     */
    // TODO
    private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
        + "from (select id, collect_set(co.id) community_context "
        + " from result "
        + " lateral view explode (context) c as co "
        + " where datainfo.deletedbyinference = false %s group by id) p "
        + " JOIN "
        + " (select source, target from relation "
        + " where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";

    /**
     * A dataset, for example, could be linked to more than one publication. For each publication
     * linked to that dataset the previous query produces a row (target id, set of community
     * contexts the target could inherit). This query collapses those rows so that there is a
     * single row for each result linked to more than one result of the type being processed.
     */
    // TODO
    private static final String RESULT_COMMUNITY_LIST_QUERY = "select resultId , collect_set(co) communityList "
        + "from result_context "
        + "lateral view explode (community_context) c as co "
        + "where length(co) > 0 "
        + "group by resultId";

    /**
     * Reads the job parameters, fetches the community ids through the lookup service and runs
     * the preparation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameters cannot be read or parsed, the lookup fails, the result
     *         class cannot be loaded or is not a {@link Result} subclass, or the job fails
     */
    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareResultCommunitySetStep1.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
        log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

        final String isLookupUrl = parser.get("isLookUpUrl");
        log.info("isLookupUrl: {}", isLookupUrl);

        final List<String> communityIdList = getCommunityList(isLookupUrl);
        log.info("communityIdList: {}", new Gson().toJson(communityIdList));

        // The simple class name, lower-cased, is used as the sub-folder of the result type.
        final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
        log.info("resultType: {}", resultType);

        // Checked narrowing: fails immediately when the configured class is not a Result.
        Class<? extends Result> resultClazz = Class.forName(resultClassName).asSubclass(Result.class);

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                prepareInfo(
                    spark,
                    inputPath,
                    outputPath,
                    allowedsemrel,
                    resultClazz,
                    resultType,
                    communityIdList);
            });
    }

    /**
     * Registers the relations and the results of the given type as temporary views, runs the two
     * queries above and writes the resulting {@link ResultCommunityList} rows as gzip-compressed
     * JSON, overwriting {@code outputPath/resultType}.
     *
     * @param spark the active Spark session
     * @param inputPath root folder containing the {@code relation} and result type sub-folders
     * @param outputPath root folder of the output
     * @param allowedsemrel relation classes through which communities are propagated
     * @param resultClazz class of the results being processed
     * @param resultType name of the sub-folder of the result type
     * @param communityIdList ids of the communities that can be propagated
     */
    private static <R extends Result> void prepareInfo(
        SparkSession spark,
        String inputPath,
        String outputPath,
        List<String> allowedsemrel,
        Class<R> resultClazz,
        String resultType,
        List<String> communityIdList) {

        final String inputResultPath = inputPath + "/" + resultType;
        log.info("Reading Graph table from: {}", inputResultPath);

        final String inputRelationPath = inputPath + "/relation";
        log.info("Reading relation table from: {}", inputRelationPath);

        Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class);
        relation.createOrReplaceTempView("relation");

        Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
        result.createOrReplaceTempView("result");

        final String outputResultPath = outputPath + "/" + resultType;
        log.info("writing output results to: {}", outputResultPath);

        String resultContextQuery = String
            .format(
                RESULT_CONTEXT_QUERY_TEMPLATE,
                getConstraintList(" co.id = '", communityIdList),
                getConstraintList(" relClass = '", allowedsemrel));

        Dataset<Row> result_context = spark.sql(resultContextQuery);
        result_context.createOrReplaceTempView("result_context");

        spark
            .sql(RESULT_COMMUNITY_LIST_QUERY)
            .as(Encoders.bean(ResultCommunityList.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputResultPath);
    }

    /**
     * Retrieves the community ids by running {@link #COMMUNITY_LIST_XQUERY} against the lookup
     * service available at the given URL.
     *
     * @param isLookupUrl address of the lookup service
     * @return the values returned by the query
     * @throws ISLookUpException if the lookup service reports an error
     */
    public static List<String> getCommunityList(final String isLookupUrl) throws ISLookUpException {
        ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
        return isLookUp.quickSearchProfile(COMMUNITY_LIST_XQUERY);
    }
}

View File

@ -0,0 +1,101 @@
package eu.dnetlib.dhp.resulttocommunityfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import scala.Tuple2;
/**
 * Second preparation step: merges the per-result-type outputs of the first step into a single
 * set with one {@link ResultCommunityList} per result identifier.
 */
public class PrepareResultCommunitySetStep2 {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep2.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Reads the job parameters and runs the merge inside a Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameters cannot be read or parsed, or the job fails
     */
    public static void main(String[] args) throws Exception {
        final String parameterSpec = IOUtils
            .toString(
                PrepareResultCommunitySetStep2.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                mergeInfo(spark, inputPath, outputPath);
            });
    }

    /**
     * Unions the community lists found under the {@code publication}, {@code dataset},
     * {@code otherresearchproduct} and {@code software} sub-folders of {@code inputPath}, merges
     * the entries sharing the same result id and saves them as gzip-compressed text, one JSON
     * document per line, under {@code outputPath}.
     */
    private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {
        Dataset<ResultCommunityList> communityLists = readPath(
            spark, inputPath + "/publication", ResultCommunityList.class);
        for (String subFolder : new String[] {
            "/dataset", "/otherresearchproduct", "/software"
        }) {
            communityLists = communityLists
                .union(readPath(spark, inputPath + subFolder, ResultCommunityList.class));
        }

        communityLists
            .toJavaRDD()
            .mapToPair(entry -> new Tuple2<>(entry.getResultId(), entry))
            .reduceByKey(PrepareResultCommunitySetStep2::mergeCommunityLists)
            .map(keyed -> OBJECT_MAPPER.writeValueAsString(keyed._2()))
            .saveAsTextFile(outputPath, GzipCodec.class);
    }

    /**
     * Appends to the community list of {@code a} the communities of {@code b} it does not
     * contain yet, and returns {@code a}. When one of the two arguments is {@code null} the
     * other one is returned as is.
     */
    private static ResultCommunityList mergeCommunityLists(ResultCommunityList a, ResultCommunityList b) {
        if (a == null) {
            return b;
        }
        if (b == null) {
            return a;
        }
        final Set<String> seen = new HashSet<>();
        for (String community : a.getCommunityList()) {
            seen.add(community);
        }
        for (String community : b.getCommunityList()) {
            // Set.add is true only for values not seen so far.
            if (seen.add(community)) {
                a.getCommunityList().add(community);
            }
        }
        return a;
    }
}

View File

@ -0,0 +1,143 @@
package eu.dnetlib.dhp.resulttocommunityfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
 * Spark job that enriches results with the community contexts listed for them in the prepared
 * {@link ResultCommunityList} dataset obtained through semantic relations.
 */
public class SparkResultToCommunityThroughSemRelJob {

    private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityThroughSemRelJob.class);

    /**
     * Reads the job parameters and, when {@code saveGraph} is true (the default when the
     * parameter is absent), runs the propagation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameters cannot be read or parsed, the result class cannot be
     *         loaded, or the job fails
     */
    public static void main(String[] args) throws Exception {
        final String parameterSpec = IOUtils
            .toString(
                SparkResultToCommunityThroughSemRelJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String preparedInfoPath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", preparedInfoPath);

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final String saveGraphParameter = parser.get("saveGraph");
        final Boolean saveGraph = saveGraphParameter == null
            ? Boolean.TRUE
            : Boolean.valueOf(saveGraphParameter);
        log.info("saveGraph: {}", saveGraph);

        Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                if (isTest(parser)) {
                    removeOutputDir(spark, outputPath);
                }
                if (saveGraph) {
                    execPropagation(
                        spark, inputPath, outputPath, preparedInfoPath, resultClazz);
                }
            });
    }

    /**
     * Left-outer joins the results read from {@code inputPath} with the prepared community lists
     * on the result identifier, applies {@link #contextUpdaterFn()} to every pair and writes the
     * outcome as gzip-compressed JSON, overwriting {@code outputPath}.
     */
    private static <R extends Result> void execPropagation(
        SparkSession spark,
        String inputPath,
        String outputPath,
        String preparedInfoPath,
        Class<R> resultClazz) {

        final Dataset<ResultCommunityList> possibleUpdates = readPath(
            spark, preparedInfoPath, ResultCommunityList.class);
        final Dataset<R> result = readPath(spark, inputPath, resultClazz);

        final Dataset<Tuple2<R, ResultCommunityList>> joined = result
            .joinWith(
                possibleUpdates,
                result.col("id").equalTo(possibleUpdates.col("resultId")),
                "left_outer");

        joined
            .map(contextUpdaterFn(), Encoders.bean(resultClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    /**
     * Builds the function that adds to a result one new context for every community of the
     * paired {@link ResultCommunityList} whose id is not already among the result contexts.
     *
     * <p>Results without a paired community list (no join match) are returned untouched. The new
     * contexts are attached by merging a temporary {@link Result} carrying them into the
     * original one.
     */
    private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() {
        return pair -> {
            final R updated = pair._1();
            final ResultCommunityList candidates = pair._2();
            if (candidates == null) {
                return updated;
            }

            final Set<String> existingContextIds = new HashSet<>();
            for (Context existing : updated.getContext()) {
                existingContextIds.add(existing.getId());
            }

            final List<Context> additions = new ArrayList<>();
            for (String communityId : candidates.getCommunityList()) {
                if (existingContextIds.contains(communityId)) {
                    continue;
                }
                final Context addition = new Context();
                addition.setId(communityId);
                addition
                    .setDataInfo(
                        Arrays
                            .asList(
                                getDataInfo(
                                    PROPAGATION_DATA_INFO_TYPE,
                                    PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID,
                                    PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME)));
                additions.add(addition);
            }

            final Result carrier = new Result();
            carrier.setId(updated.getId());
            carrier.setContext(additions);
            updated.mergeFrom(carrier);
            return updated;
        };
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import java.io.Serializable;
/**
 * Serializable bean pairing a datasource identifier with an organization identifier.
 *
 * <p>Both properties are {@code null} until explicitly set.
 */
public class DatasourceOrganization implements Serializable {

    /** Identifier of the datasource. */
    private String datasourceId;

    /** Identifier of the organization. */
    private String organizationId;

    /** @return the datasource identifier, or {@code null} when it has not been set */
    public String getDatasourceId() {
        return this.datasourceId;
    }

    /** @param id the datasource identifier to store */
    public void setDatasourceId(String id) {
        this.datasourceId = id;
    }

    /** @return the organization identifier, or {@code null} when it has not been set */
    public String getOrganizationId() {
        return this.organizationId;
    }

    /** @param id the organization identifier to store */
    public void setOrganizationId(String id) {
        this.organizationId = id;
    }
}

View File

@ -0,0 +1,122 @@
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Preparation step producing two datasets: the (datasource, organization) pairs for
 * datasources of the institutional repository type, and the set of organizations each result
 * is already linked to.
 */
public class PrepareResultInstRepoAssociation {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Reads the job parameters and runs the preparation inside a Hive-enabled Spark session.
     *
     * @param args command line arguments understood by {@link ArgumentApplicationParser}
     * @throws Exception if the parameters cannot be read or parsed, or the job fails
     */
    public static void main(String[] args) throws Exception {
        final String parameterSpec = IOUtils
            .toString(
                PrepareResultInstRepoAssociation.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
        parser.parseArgument(args);

        final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath");
        log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath);

        final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
        log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);

        final SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                readNeededResources(spark, inputPath);
                prepareDatasourceOrganization(spark, datasourceOrganizationPath);
                prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
            });
    }

    /**
     * Groups the result-organization relations not deleted by inference by their source and
     * saves, for each result, the set of linked organizations as gzip-compressed text (one JSON
     * document per line) under {@code alreadyLinkedPath}.
     */
    private static void prepareAlreadyLinkedAssociation(
        SparkSession spark, String alreadyLinkedPath) {
        final String relClassConstraint = "and relClass = '"
            + RELATION_RESULT_ORGANIZATION_REL_CLASS
            + "' ";
        final String query = "Select source resultId, collect_set(target) organizationSet "
            + "from relation "
            + "where datainfo.deletedbyinference = false "
            + relClassConstraint
            + "group by source";

        final Dataset<ResultOrganizationSet> alreadyLinked = spark
            .sql(query)
            .as(Encoders.bean(ResultOrganizationSet.class));

        // TODO retry to stick with datasets
        alreadyLinked
            .toJavaRDD()
            .map(association -> OBJECT_MAPPER.writeValueAsString(association))
            .saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
    }

    /**
     * Registers the {@code datasource}, {@code relation} and {@code organization} tables read
     * from the sub-folders of {@code inputPath} as temporary views with the same names.
     */
    private static void readNeededResources(SparkSession spark, String inputPath) {
        readPath(spark, inputPath + "/datasource", Datasource.class)
            .createOrReplaceTempView("datasource");

        readPath(spark, inputPath + "/relation", Relation.class)
            .createOrReplaceTempView("relation");

        readPath(spark, inputPath + "/organization", Organization.class)
            .createOrReplaceTempView("organization");
    }

    /**
     * Joins the datasources of the institutional repository type with their datasource-organization
     * relations (both not deleted by inference) and writes the resulting
     * {@link DatasourceOrganization} rows as gzip-compressed JSON, overwriting
     * {@code datasourceOrganizationPath}.
     */
    private static void prepareDatasourceOrganization(
        SparkSession spark, String datasourceOrganizationPath) {

        // Sub-query: ids of the institutional repositories.
        final String institutionalRepositories = "( SELECT id "
            + "FROM datasource "
            + "WHERE datasourcetype.classid = '"
            + INSTITUTIONAL_REPO_TYPE
            + "' "
            + "AND datainfo.deletedbyinference = false ) d ";

        // Sub-query: datasource -> organization relations.
        final String datasourceOrganizationRelations = "( SELECT source, target "
            + "FROM relation "
            + "WHERE relclass = '"
            + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
            + "' "
            + "AND datainfo.deletedbyinference = false ) rel ";

        final String query = "SELECT source datasourceId, target organizationId "
            + "FROM "
            + institutionalRepositories
            + "JOIN "
            + datasourceOrganizationRelations
            + "ON d.id = rel.source ";

        final Dataset<DatasourceOrganization> datasourceOrganization = spark
            .sql(query)
            .as(Encoders.bean(DatasourceOrganization.class));

        datasourceOrganization
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(datasourceOrganizationPath);
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import java.io.Serializable;
import java.util.ArrayList;
/**
 * Serializable bean pairing a result identifier with a list of organization identifiers.
 *
 * <p>Both properties are {@code null} until explicitly set; no defensive copies are made.
 */
public class ResultOrganizationSet implements Serializable {

    /** Identifier of the result. */
    private String resultId;

    /** Identifiers of the organizations associated with {@link #resultId}. */
    private ArrayList<String> organizationSet;

    /** @return the result identifier, or {@code null} when it has not been set */
    public String getResultId() {
        return this.resultId;
    }

    /** @param id the result identifier to store */
    public void setResultId(String id) {
        this.resultId = id;
    }

    /** @return the stored list of organization identifiers (the same instance passed to the setter) */
    public ArrayList<String> getOrganizationSet() {
        return this.organizationSet;
    }

    /** @param organizations the list of organization identifiers to store, kept by reference */
    public void setOrganizationSet(ArrayList<String> organizations) {
        this.organizationSet = organizations;
    }
}

View File

@ -0,0 +1,193 @@
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
public class SparkResultToOrganizationFromIstRepoJob {
/** Class-wide SLF4J logger. */
private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromIstRepoJob.class);
/**
 * Spark SQL query producing one row per result id with the set of organization ids reachable through its
 * datasources.
 *
 * <p>It reads two temporary views:
 * <ul>
 * <li>{@code rels}: registered in {@code getPotentialRelations} from the datasource-organization dataset;
 * expected to expose {@code datasourceId} and {@code organizationId};</li>
 * <li>{@code cfhb}: expected to expose {@code id}, {@code cf} and {@code hb}. NOTE(review): presumably
 * registered by {@code createCfHbforResult}, which is not visible here; confirm.</li>
 * </ul>
 *
 * <p>The two branches of the UNION ALL match the datasource once through {@code cf} and once through
 * {@code hb}; {@code collect_set} then removes duplicate organization ids per result. The output columns are
 * aliased {@code resultId} and {@code organizationSet} so that the rows can be mapped onto
 * {@link ResultOrganizationSet} by a bean encoder.
 */
private static final String RESULT_ORGANIZATIONSET_QUERY = "SELECT id resultId, collect_set(organizationId) organizationSet "
+ "FROM ( SELECT id, organizationId "
+ "FROM rels "
+ "JOIN cfhb "
+ " ON cf = datasourceId "
+ "UNION ALL "
+ "SELECT id , organizationId "
+ "FROM rels "
+ "JOIN cfhb "
+ " ON hb = datasourceId ) tmp "
+ "GROUP BY id";
/**
 * Job entry point: parses the command line arguments, then runs the propagation inside a Spark session with
 * Hive support.
 *
 * <p>Arguments read from the parser: {@code sourcePath}, {@code outputPath}, {@code datasourceOrganizationPath},
 * {@code alreadyLinkedPath}, {@code resultTableName} (fully qualified name of a {@link Result} subclass),
 * {@code saveGraph} (optional, defaults to {@code true}) and {@code hive_metastore_uris}.
 *
 * <p>When {@code isTest(parser)} holds, {@code removeOutputDir} is invoked on the output path first. The
 * propagation itself only runs when {@code saveGraph} is true.
 *
 * @param args command line arguments, validated against the bundled JSON parameter definition
 * @throws NullPointerException if the bundled parameter definition is not on the classpath
 * @throws ClassNotFoundException if {@code resultTableName} cannot be loaded
 * @throws ClassCastException if {@code resultTableName} does not denote a subclass of {@link Result}
 * @throws Exception on any argument parsing or Spark failure
 */
public static void main(String[] args) throws Exception {
	final String parametersResource = "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json";

	final String jsonConfiguration;
	// try-with-resources closes the classpath stream; the charset is pinned so that parsing does not
	// depend on the platform default encoding
	try (java.io.InputStream parametersStream = SparkResultToOrganizationFromIstRepoJob.class
		.getResourceAsStream(parametersResource)) {
		Objects.requireNonNull(parametersStream, "classpath resource not found: " + parametersResource);
		jsonConfiguration = IOUtils.toString(parametersStream, java.nio.charset.StandardCharsets.UTF_8);
	}

	final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
	parser.parseArgument(args);

	final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
	log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

	final String inputPath = parser.get("sourcePath");
	log.info("inputPath: {}", inputPath);

	final String outputPath = parser.get("outputPath");
	log.info("outputPath: {}", outputPath);

	final String datasourceorganization = parser.get("datasourceOrganizationPath");
	log.info("datasourceOrganizationPath: {}", datasourceorganization);

	final String alreadylinked = parser.get("alreadyLinkedPath");
	log.info("alreadyLinkedPath: {}", alreadylinked);

	final String resultClassName = parser.get("resultTableName");
	log.info("resultTableName: {}", resultClassName);

	// a missing saveGraph argument means "save"
	final boolean saveGraph = Optional
		.ofNullable(parser.get("saveGraph"))
		.map(Boolean::valueOf)
		.orElse(Boolean.TRUE);
	log.info("saveGraph: {}", saveGraph);

	// asSubclass performs a checked conversion: a class that is not a Result is rejected here instead of
	// failing later inside the Spark job
	final Class<? extends Result> resultClazz = Class.forName(resultClassName).asSubclass(Result.class);

	final SparkConf conf = new SparkConf();
	conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

	runWithSparkHiveSession(
		conf,
		isSparkSessionManaged,
		spark -> {
			if (isTest(parser)) {
				removeOutputDir(spark, outputPath);
			}
			if (saveGraph) {
				execPropagation(
					spark,
					datasourceorganization,
					alreadylinked,
					inputPath,
					outputPath,
					resultClazz);
			}
		});
}
/**
 * Computes the new result-organization relations and appends them, as gzip-compressed JSON, to
 * {@code outputPath}.
 *
 * <p>The potential (result, organizations) pairs are left-outer joined on {@code resultId} with the pairs
 * that are already linked, so that potential updates without an already-linked counterpart are kept (their
 * right side is {@code null}); each joined pair is then expanded by {@code createRelationFn()}.
 *
 * @param spark the active Spark session
 * @param datasourceorganization path of the serialized {@code DatasourceOrganization} records
 * @param alreadyLinkedPath path of the serialized {@link ResultOrganizationSet} records already linked
 * @param inputPath path of the serialized results of type {@code clazz}
 * @param outputPath path the new relations are appended to
 * @param clazz concrete {@link Result} subclass stored under {@code inputPath}
 */
private static void execPropagation(
	SparkSession spark,
	String datasourceorganization,
	String alreadyLinkedPath,
	String inputPath,
	String outputPath,
	Class<? extends Result> clazz) {

	final Dataset<ResultOrganizationSet> candidates = getPotentialRelations(
		spark,
		inputPath,
		clazz,
		readPath(spark, datasourceorganization, DatasourceOrganization.class));

	final Dataset<ResultOrganizationSet> linked = readPath(spark, alreadyLinkedPath, ResultOrganizationSet.class);

	final Column sameResult = candidates.col("resultId").equalTo(linked.col("resultId"));

	final Dataset<Tuple2<ResultOrganizationSet, ResultOrganizationSet>> joined = candidates
		.joinWith(linked, sameResult, "left_outer");

	final Dataset<Relation> newRelations = joined.flatMap(createRelationFn(), Encoders.bean(Relation.class));

	newRelations
		.write()
		.mode(SaveMode.Append)
		.option("compression", "gzip")
		.json(outputPath);
}
/**
 * Builds the function that expands a joined (potential update, already linked) pair into the relations to
 * be added.
 *
 * <p>For every organization id of the potential update that is not listed in the already-linked record,
 * two relations are emitted through {@code getRelation}: first one built from (organization id, result id)
 * with {@code RELATION_ORGANIZATION_RESULT_REL_CLASS}, then one built from (result id, organization id) with
 * {@code RELATION_RESULT_ORGANIZATION_REL_CLASS}.
 *
 * <p>The right element of the pair is {@code null} when the left outer join found no already-linked record;
 * a {@code null} organization list on either side is treated as empty. The input beans are not modified.
 *
 * @return the flat-map function producing the new relations for one joined pair
 */
private static FlatMapFunction<Tuple2<ResultOrganizationSet, ResultOrganizationSet>, Relation> createRelationFn() {
	return pair -> {
		final ResultOrganizationSet potentialUpdate = pair._1();
		final ResultOrganizationSet alreadyLinked = pair._2();

		// hash set: constant-time membership test instead of List.contains/remove per element
		final Set<String> linkedOrganizations = new HashSet<>();
		if (alreadyLinked != null && alreadyLinked.getOrganizationSet() != null) {
			linkedOrganizations.addAll(alreadyLinked.getOrganizationSet());
		}

		final List<Relation> newRelations = new ArrayList<>();
		final List<String> candidateOrganizations = potentialUpdate.getOrganizationSet();
		if (candidateOrganizations == null) {
			return newRelations.iterator();
		}

		final String resultId = potentialUpdate.getResultId();
		for (String orgId : candidateOrganizations) {
			if (linkedOrganizations.contains(orgId)) {
				// this organization is already linked to the result: nothing to add
				continue;
			}
			newRelations
				.add(
					getRelation(
						orgId,
						resultId,
						RELATION_ORGANIZATION_RESULT_REL_CLASS,
						RELATION_RESULTORGANIZATION_REL_TYPE,
						RELATION_RESULTORGANIZATION_SUBREL_TYPE,
						PROPAGATION_DATA_INFO_TYPE,
						PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
						PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
			newRelations
				.add(
					getRelation(
						resultId,
						orgId,
						RELATION_RESULT_ORGANIZATION_REL_CLASS,
						RELATION_RESULTORGANIZATION_REL_TYPE,
						RELATION_RESULTORGANIZATION_SUBREL_TYPE,
						PROPAGATION_DATA_INFO_TYPE,
						PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
						PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
		}
		return newRelations.iterator();
	};
}
/**
 * Computes, for each result, the set of organizations it could be linked to.
 *
 * <p>Registers the results as the temporary view {@code result}, calls {@code createCfHbforResult(spark)},
 * registers {@code ds_org} as the temporary view {@code rels} and finally runs
 * {@code RESULT_ORGANIZATIONSET_QUERY}, mapping its rows onto {@link ResultOrganizationSet} beans.
 *
 * @param spark the active Spark session
 * @param inputPath path of the serialized results
 * @param resultClazz concrete {@link Result} subclass stored under {@code inputPath}
 * @param ds_org datasource-organization associations, exposed to the query as {@code rels}
 * @param <R> the result type
 * @return one {@link ResultOrganizationSet} per result id returned by the query
 */
private static <R extends Result> Dataset<ResultOrganizationSet> getPotentialRelations(
	SparkSession spark,
	String inputPath,
	Class<R> resultClazz,
	Dataset<DatasourceOrganization> ds_org) {

	// the view has to exist before createCfHbforResult runs, hence the order of the statements
	readPath(spark, inputPath, resultClazz).createOrReplaceTempView("result");
	createCfHbforResult(spark);

	ds_org.createOrReplaceTempView("rels");

	final Dataset<Row> resultOrganizations = spark.sql(RESULT_ORGANIZATIONSET_QUERY);
	return resultOrganizations.as(Encoders.bean(ResultOrganizationSet.class));
}
}

Some files were not shown because too many files have changed in this diff Show More