master #11

Manually merged
claudio.atzori merged 275 commits from :master into enrichment_wfs 2020-05-11 15:14:56 +02:00
32 changed files with 2764 additions and 271 deletions
Showing only changes of commit 90c768dde6 - Show all commits

View File

@ -0,0 +1,52 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  dhp-shaded-libs: packages Guava into a shaded jar with its classes moved
  under a "shaded." package prefix by the maven-shade-plugin.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-build</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.1.7-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-shaded-libs</artifactId>
    <dependencies>
        <!-- The library whose classes are bundled and relocated below. -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>23.3-jre</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <relocations>
                                <!--
                                  Relocation patterns are matched against Java package
                                  names, not Maven coordinates. Guava's groupId is
                                  com.google.guava, but its classes are declared in
                                  com.google.common, so that is the prefix to relocate.
                                -->
                                <relocation>
                                    <pattern>com.google.common</pattern>
                                    <shadedPattern>shaded.com.google.common</shadedPattern>
                                </relocation>
                            </relocations>
                            <!-- Bundle every dependency artifact into the shaded jar. -->
                            <artifactSet>
                                <includes>
                                    <include>*:*</include>
                                </includes>
                            </artifactSet>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

View File

@ -1,7 +1,5 @@
package eu.dnetlib.dhp; package eu.dnetlib.dhp;
import java.io.File;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -10,32 +8,36 @@ import eu.dnetlib.dhp.community.ProtoMap;
import eu.dnetlib.dhp.community.QueryInformationSystem; import eu.dnetlib.dhp.community.QueryInformationSystem;
import eu.dnetlib.dhp.community.ResultTagger; import eu.dnetlib.dhp.community.ResultTagger;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import java.io.File;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
public class SparkBulkTagJob { public class SparkBulkTagJob {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkBulkTagJob.class.getResourceAsStream("/eu/dnetlib/dhp/input_bulktag_parameters.json"))); final ArgumentApplicationParser parser =
new ArgumentApplicationParser(
IOUtils.toString(
SparkBulkTagJob.class.getResourceAsStream(
"/eu/dnetlib/dhp/input_bulktag_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
final SparkSession spark = SparkSession final SparkSession spark =
.builder() SparkSession.builder()
.appName(SparkBulkTagJob.class.getSimpleName()) .appName(SparkBulkTagJob.class.getSimpleName())
.master(parser.get("master")) .master(parser.get("master"))
.enableHiveSupport() .enableHiveSupport()
.getOrCreate(); .getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath"); final String inputPath = parser.get("sourcePath");
final String outputPath = "/tmp/provision/bulktagging"; final String outputPath = "/tmp/provision/bulktagging";
final ResultTagger resultTagger = new ResultTagger(); final ResultTagger resultTagger = new ResultTagger();
ProtoMap protoMappingParams = new Gson().fromJson(parser.get("mappingProto"),ProtoMap.class);; ProtoMap protoMappingParams =
new Gson().fromJson(parser.get("mappingProto"), ProtoMap.class);
;
File directory = new File(outputPath); File directory = new File(outputPath);
@ -43,31 +45,28 @@ public class SparkBulkTagJob {
directory.mkdirs(); directory.mkdirs();
} }
CommunityConfiguration cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookupUrl")); CommunityConfiguration cc =
QueryInformationSystem.getCommunityConfiguration(parser.get("isLookupUrl"));
sc.textFile(inputPath + "/publication")
sc.sequenceFile(inputPath + "/publication", Text.class, Text.class) .map(item -> new ObjectMapper().readValue(item, Publication.class))
.map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p)) .map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/publication"); .saveAsTextFile(outputPath + "/publication");
sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class) sc.textFile(inputPath + "/dataset")
.map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class)) .map(item -> new ObjectMapper().readValue(item, Dataset.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p)) .map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/dataset"); .saveAsTextFile(outputPath + "/dataset");
sc.sequenceFile(inputPath + "/software", Text.class, Text.class) sc.textFile(inputPath + "/software")
.map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class)) .map(item -> new ObjectMapper().readValue(item, Software.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p)) .map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/software"); .saveAsTextFile(outputPath + "/software");
sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class) sc.textFile(inputPath + "/otherresearchproduct")
.map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class)) .map(item -> new ObjectMapper().readValue(item, OtherResearchProduct.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p)) .map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/otherresearchproduct"); .saveAsTextFile(outputPath + "/otherresearchproduct");
} }
} }

View File

@ -0,0 +1,161 @@
package eu.dnetlib.dhp;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.community.*;
import eu.dnetlib.dhp.schema.oaf.*;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Spark job that reads the entities of one {@link Result} subtype from {@code sourcePath},
 * enriches each of them through {@link ResultTagger#enrichContextCriteria}, and writes the
 * outcome as gzip-compressed JSON text to {@code outputPath}.
 *
 * <p>The accepted command line arguments are described by the classpath resource
 * {@code /eu/dnetlib/dhp/input_bulktag_parameters.json}.
 */
public class SparkBulkTagJob2 {

    private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob2.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Classpath location of the JSON document handed to {@link ArgumentApplicationParser}. */
    private static final String PARAMETERS_RESOURCE =
            "/eu/dnetlib/dhp/input_bulktag_parameters.json";

    /**
     * Job entry point: parses the arguments, resolves the community configuration and runs the
     * tagging inside a Spark session.
     *
     * @param args command line arguments, parsed with {@link ArgumentApplicationParser}
     * @throws Exception if the arguments cannot be parsed, the result class cannot be loaded or
     *     is not a {@link Result} subtype, or the job itself fails
     */
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser =
                new ArgumentApplicationParser(loadParameterSpec());
        parser.parseArgument(args);

        Boolean isSparkSessionManaged =
                Optional.ofNullable(parser.get("isSparkSessionManaged"))
                        .map(Boolean::valueOf)
                        .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        Boolean isTest =
                Optional.ofNullable(parser.get("isTest"))
                        .map(Boolean::valueOf)
                        .orElse(Boolean.FALSE);
        log.info("isTest: {} ", isTest);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final ProtoMap protoMappingParams =
                new Gson().fromJson(parser.get("protoMap"), ProtoMap.class);
        log.info("protoMap: {}", new Gson().toJson(protoMappingParams));

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        // NOTE: saveGraph is only logged here; nothing in this class acts on it yet.
        final Boolean saveGraph =
                Optional.ofNullable(parser.get("saveGraph"))
                        .map(Boolean::valueOf)
                        .orElse(Boolean.TRUE);
        log.info("saveGraph: {}", saveGraph);

        // asSubclass verifies the loaded class really is a Result subtype and fails right
        // here with a ClassCastException instead of deferring the problem to the Spark stages.
        final Class<? extends Result> resultClazz =
                Class.forName(resultClassName).asSubclass(Result.class);

        final SparkConf conf = new SparkConf();

        // In test mode the configuration comes from the taggingConf argument; otherwise it is
        // fetched through the lookup service address given by isLookupUrl.
        final String taggingConf = parser.get("taggingConf");
        final CommunityConfiguration cc =
                isTest
                        ? CommunityConfigurationFactory.fromJson(taggingConf)
                        : QueryInformationSystem.getCommunityConfiguration(
                                parser.get("isLookupUrl"));

        runWithSparkSession(
                conf,
                isSparkSessionManaged,
                spark ->
                        execBulkTag(
                                spark, inputPath, outputPath, protoMappingParams, resultClazz, cc));
    }

    /**
     * Reads the argument specification bundled on the classpath.
     *
     * @return the content of {@link #PARAMETERS_RESOURCE}, decoded as UTF-8
     * @throws IOException if the resource cannot be read
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private static String loadParameterSpec() throws IOException {
        try (InputStream in = SparkBulkTagJob2.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
            if (in == null) {
                throw new IllegalStateException(
                        "classpath resource not found: " + PARAMETERS_RESOURCE);
            }
            // Decode explicitly as UTF-8 rather than with the platform default charset.
            return IOUtils.toString(in, StandardCharsets.UTF_8);
        }
    }

    /**
     * Tags every entity found under {@code inputPath} and stores the result under
     * {@code outputPath}, overwriting whatever is already there.
     *
     * @param spark the active Spark session
     * @param inputPath location of the serialized input entities
     * @param outputPath destination of the gzip-compressed JSON output
     * @param protoMappingParams criteria map forwarded to the tagger
     * @param resultClazz concrete {@link Result} subtype being processed
     * @param communityConfiguration configuration forwarded to the tagger
     * @param <R> the {@link Result} subtype
     */
    private static <R extends Result> void execBulkTag(
            SparkSession spark,
            String inputPath,
            String outputPath,
            ProtoMap protoMappingParams,
            Class<R> resultClazz,
            CommunityConfiguration communityConfiguration) {

        final ResultTagger resultTagger = new ResultTagger();
        final Dataset<R> result = readPathEntity(spark, inputPath, resultClazz);

        // The explicit MapFunction cast selects the Java overload of Dataset.map, exactly as
        // readPathEntity does; a bare lambda can be ambiguous with the Scala overload.
        result.map(
                        (MapFunction<R, R>)
                                value ->
                                        resultTagger.enrichContextCriteria(
                                                value, communityConfiguration, protoMappingParams),
                        Encoders.bean(resultClazz))
                .toJSON()
                .write()
                .mode(SaveMode.Overwrite)
                .option("compression", "gzip")
                .text(outputPath);
    }

    /**
     * Loads the entities stored as text under {@code inputEntityPath}, parsing each text value
     * as the JSON serialization of one {@code resultClazz} instance.
     *
     * @param spark the active Spark session
     * @param inputEntityPath location of the serialized entities
     * @param resultClazz concrete {@link Result} subtype to deserialize into
     * @param <R> the {@link Result} subtype
     * @return a dataset of the deserialized entities
     */
    private static <R extends Result> Dataset<R> readPathEntity(
            SparkSession spark, String inputEntityPath, Class<R> resultClazz) {

        return spark.read()
                .textFile(inputEntityPath)
                .map(
                        (MapFunction<String, R>)
                                value -> OBJECT_MAPPER.readValue(value, resultClazz),
                        Encoders.bean(resultClazz));
    }
}

View File

@ -1,16 +1,14 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
import com.google.gson.Gson; import com.google.gson.Gson;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import java.util.ArrayList; /** Created by miriam on 01/08/2018. */
import java.util.List; public class Community implements Serializable {
/**
* Created by miriam on 01/08/2018.
*/
public class Community {
private static final Log log = LogFactory.getLog(Community.class); private static final Log log = LogFactory.getLog(Community.class);
@ -19,14 +17,15 @@ public class Community {
private List<Datasource> datasources = new ArrayList<>(); private List<Datasource> datasources = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>(); private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
public String toJson() { public String toJson() {
final Gson g = new Gson(); final Gson g = new Gson();
return g.toJson(this); return g.toJson(this);
} }
public boolean isValid() { public boolean isValid() {
return !getSubjects().isEmpty() || !getDatasources().isEmpty() || !getZenodoCommunities().isEmpty(); return !getSubjects().isEmpty()
|| !getDatasources().isEmpty()
|| !getZenodoCommunities().isEmpty();
} }
public String getId() { public String getId() {
@ -60,5 +59,4 @@ public class Community {
public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) { public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
this.zenodoCommunities = zenodoCommunities; this.zenodoCommunities = zenodoCommunities;
} }
} }

View File

@ -3,38 +3,58 @@ package eu.dnetlib.dhp.community;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection; import eu.dnetlib.dhp.selectioncriteria.Selection;
import org.apache.commons.logging.Log; import java.io.Serializable;
import org.apache.commons.logging.LogFactory;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/** /** Created by miriam on 02/08/2018. */
* Created by miriam on 02/08/2018. public class CommunityConfiguration implements Serializable {
*/
public class CommunityConfiguration {
private static final Log log = LogFactory.getLog(CommunityConfiguration.class); private static final Log log = LogFactory.getLog(CommunityConfiguration.class);
private Map<String, Community> communities;
private Map<String,Community> communities; // map subject -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> subjectMap = new HashMap<>();
// map datasourceid -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>();
// map zenodocommunityid -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap =
new HashMap<>();
public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() {
return subjectMap;
}
//map subject -> communityid public void setSubjectMap(Map<String, List<Pair<String, SelectionConstraints>>> subjectMap) {
private transient Map<String,List<Pair<String,SelectionConstraints>>> subjectMap = new HashMap<>(); this.subjectMap = subjectMap;
//map datasourceid -> communityid }
private transient Map<String,List<Pair<String,SelectionConstraints>>> datasourceMap = new HashMap<>();
//map zenodocommunityid -> communityid public Map<String, List<Pair<String, SelectionConstraints>>> getDatasourceMap() {
private transient Map<String,List<Pair<String,SelectionConstraints>>> zenodocommunityMap = new HashMap<>(); return datasourceMap;
}
public void setDatasourceMap(
Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap) {
this.datasourceMap = datasourceMap;
}
public Map<String, List<Pair<String, SelectionConstraints>>> getZenodocommunityMap() {
return zenodocommunityMap;
}
public void setZenodocommunityMap(
Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap) {
this.zenodocommunityMap = zenodocommunityMap;
}
CommunityConfiguration(final Map<String, Community> communities) { CommunityConfiguration(final Map<String, Community> communities) {
this.communities = communities; this.communities = communities;
@ -53,65 +73,67 @@ public class CommunityConfiguration {
zenodocommunityMap = Maps.newHashMap(); zenodocommunityMap = Maps.newHashMap();
} }
for (Community c : getCommunities().values()) {
for(Community c : getCommunities().values()) { // get subjects
//get subjects
final String id = c.getId(); final String id = c.getId();
for(String sbj : c.getSubjects()){ for (String sbj : c.getSubjects()) {
Pair<String,SelectionConstraints> p = new Pair<>(id,new SelectionConstraints()); Pair<String, SelectionConstraints> p = new Pair<>(id, new SelectionConstraints());
add(sbj.toLowerCase().trim() , p, subjectMap); add(sbj.toLowerCase().trim(), p, subjectMap);
} }
//get datasources // get datasources
for(Datasource d: c.getDatasources()){ for (Datasource d : c.getDatasources()) {
add(d.getOpenaireId(),new Pair<>(id,d.getSelectionConstraints()),datasourceMap); add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
} }
//get zenodo communities // get zenodo communities
for(ZenodoCommunity zc : c.getZenodoCommunities()){ for (ZenodoCommunity zc : c.getZenodoCommunities()) {
add(zc.getZenodoCommunityId(),new Pair<>(id,zc.getSelCriteria()),zenodocommunityMap); add(
zc.getZenodoCommunityId(),
new Pair<>(id, zc.getSelCriteria()),
zenodocommunityMap);
} }
} }
} }
private void add(String key, Pair<String,SelectionConstraints> value, Map<String,List<Pair<String,SelectionConstraints>>> map){ private void add(
List<Pair<String,SelectionConstraints>> values = map.get(key); String key,
Pair<String, SelectionConstraints> value,
Map<String, List<Pair<String, SelectionConstraints>>> map) {
List<Pair<String, SelectionConstraints>> values = map.get(key);
if (values == null){ if (values == null) {
values = new ArrayList<>(); values = new ArrayList<>();
map.put(key,values); map.put(key, values);
} }
values.add(value); values.add(value);
} }
public List<Pair<String,SelectionConstraints>> getCommunityForSubject(String sbj){ public List<Pair<String, SelectionConstraints>> getCommunityForSubject(String sbj) {
return subjectMap.get(sbj); return subjectMap.get(sbj);
} }
public List<Pair<String,SelectionConstraints>> getCommunityForDatasource(String dts){ public List<Pair<String, SelectionConstraints>> getCommunityForDatasource(String dts) {
return datasourceMap.get(dts); return datasourceMap.get(dts);
} }
public List<String> getCommunityForDatasource(
final String dts, final Map<String, List<String>> param) {
List<Pair<String, SelectionConstraints>> lp = datasourceMap.get(dts);
if (lp == null) return Lists.newArrayList();
public List<String> getCommunityForDatasource(final String dts, final Map<String, List<String>> param) { return lp.stream()
List<Pair<String,SelectionConstraints>> lp = datasourceMap.get(dts); .map(
if (lp==null) p -> {
return Lists.newArrayList(); if (p.getSnd() == null) return p.getFst();
if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
return lp.stream().map(p -> { return p.getFst();
if (p.getSnd() == null) else return null;
return p.getFst(); })
if (((SelectionConstraints) p.getSnd()).verifyCriteria(param)) .filter(st -> (st != null))
return p.getFst(); .collect(Collectors.toList());
else
return null;
}).filter(st->(st!=null)).collect(Collectors.toList());
} }
public List<Pair<String,SelectionConstraints>> getCommunityForZenodoCommunity(String zc){ public List<Pair<String, SelectionConstraints>> getCommunityForZenodoCommunity(String zc) {
return zenodocommunityMap.get(zc); return zenodocommunityMap.get(zc);
} }
@ -125,7 +147,7 @@ public class CommunityConfiguration {
return getContextIds(datasourceMap.get(value.toLowerCase())); return getContextIds(datasourceMap.get(value.toLowerCase()));
} }
public List<String> getCommunityForZenodoCommunityValue(String value){ public List<String> getCommunityForZenodoCommunityValue(String value) {
return getContextIds(zenodocommunityMap.get(value.toLowerCase())); return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
} }
@ -137,7 +159,6 @@ public class CommunityConfiguration {
return Lists.newArrayList(); return Lists.newArrayList();
} }
public Map<String, Community> getCommunities() { public Map<String, Community> getCommunities() {
return communities; return communities;
} }
@ -158,7 +179,7 @@ public class CommunityConfiguration {
return communities.keySet().size(); return communities.keySet().size();
} }
public Community getCommunityById(String id){ public Community getCommunityById(String id) {
return communities.get(id); return communities.get(id);
} }

View File

@ -1,24 +1,20 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver; import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import java.io.Serializable;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.dom4j.Node; import org.dom4j.Node;
/** /** Created by miriam on 01/08/2018. */
* Created by miriam on 01/08/2018. public class Datasource implements Serializable {
*/
public class Datasource {
private static final Log log = LogFactory.getLog(Datasource.class); private static final Log log = LogFactory.getLog(Datasource.class);
private String openaireId; private String openaireId;
private SelectionConstraints selectionConstraints; private SelectionConstraints selectionConstraints;
public SelectionConstraints getSelCriteria() { public SelectionConstraints getSelCriteria() {
return selectionConstraints; return selectionConstraints;
} }
@ -43,23 +39,19 @@ public class Datasource {
this.openaireId = openaireId; this.openaireId = openaireId;
} }
private void setSelCriteria(String json, VerbResolver resolver){ private void setSelCriteria(String json, VerbResolver resolver) {
log.info("Selection constraints for datasource = " + json); log.info("Selection constraints for datasource = " + json);
selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class); selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class);
selectionConstraints.setSelection(resolver); selectionConstraints.setSelection(resolver);
} }
public void setSelCriteria(Node n, VerbResolver resolver){ public void setSelCriteria(Node n, VerbResolver resolver) {
try{ try {
setSelCriteria(n.getText(),resolver); setSelCriteria(n.getText(), resolver);
}catch(Exception e) { } catch (Exception e) {
log.info("not set selection criteria... "); log.info("not set selection criteria... ");
selectionConstraints =null; selectionConstraints = null;
} }
} }
} }

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
import com.google.gson.Gson; import com.google.gson.Gson;
import java.io.Serializable;
/** /** Created by miriam on 03/08/2018. */
* Created by miriam on 03/08/2018. public class Pair<A, B> implements Serializable {
*/
public class Pair<A,B> {
private A fst; private A fst;
private B snd; private B snd;
@ -27,12 +26,12 @@ public class Pair<A,B> {
return this; return this;
} }
public Pair(A a, B b){ public Pair(A a, B b) {
fst = a; fst = a;
snd = b; snd = b;
} }
public String toJson(){ public String toJson() {
return new Gson().toJson(this); return new Gson().toJson(this);
} }
} }

View File

@ -1,10 +1,11 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
public class ProtoMap extends HashMap<String,String> { public class ProtoMap extends HashMap<String, String> implements Serializable {
public ProtoMap(){ public ProtoMap() {
super(); super();
} }
} }

View File

@ -1,67 +1,68 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
import static eu.dnetlib.dhp.community.TagginConstants.*;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import static eu.dnetlib.dhp.community.TagginConstants.*; /** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable {
/**
* Created by miriam on 02/08/2018.
*/
public class ResultTagger {
private String trust = "0.8"; private String trust = "0.8";
private boolean clearContext(Result result) {
private boolean clearContext(Result result){
int tmp = result.getContext().size(); int tmp = result.getContext().size();
List<Context> clist = result.getContext().stream() List<Context> clist =
.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR))).collect(Collectors.toList()); result.getContext().stream()
.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
.collect(Collectors.toList());
result.setContext(clist); result.setContext(clist);
return (tmp != clist.size()); return (tmp != clist.size());
} }
private Map<String,List<String>> getParamMap(final Result result, Map<String,String> params) { private Map<String, List<String>> getParamMap(final Result result, Map<String, String> params) {
Map<String,List<String>> param = new HashMap<>(); Map<String, List<String>> param = new HashMap<>();
String json = new Gson().toJson(result,Result.class); String json = new Gson().toJson(result, Result.class);
DocumentContext jsonContext = JsonPath.parse(json); DocumentContext jsonContext = JsonPath.parse(json);
if (params == null){ if (params == null) {
params = new HashMap<>(); params = new HashMap<>();
} }
for(String key : params.keySet()) { for (String key : params.keySet()) {
try { try {
param.put(key, jsonContext.read(params.get(key))); param.put(key, jsonContext.read(params.get(key)));
} catch (com.jayway.jsonpath.PathNotFoundException e) { } catch (com.jayway.jsonpath.PathNotFoundException e) {
param.put(key, new ArrayList<>()); param.put(key, new ArrayList<>());
// throw e; // throw e;
} }
} }
return param; return param;
} }
public <R extends Result> R enrichContextCriteria(
final R result, final CommunityConfiguration conf, final Map<String, String> criteria) {
public Result enrichContextCriteria(final Result result, final CommunityConfiguration conf, final Map<String,String> criteria) { // }
// public Result enrichContextCriteria(final Result result, final CommunityConfiguration
// conf, final Map<String,String> criteria) {
final Map<String, List<String>> param = getParamMap(result, criteria); final Map<String, List<String>> param = getParamMap(result, criteria);
//Verify if the entity is deletedbyinference. In case verify if to clean the context list from all the zenodo communities // Verify if the entity is deletedbyinference. In case verify if to clean the context list
if(result.getDataInfo().getDeletedbyinference()){ // from all the zenodo communities
return result; if (result.getDataInfo().getDeletedbyinference()) {
if (clearContext(result)) return result;
} }
//communities contains all the communities to be added as context for the result // communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>(); final Set<String> communities = new HashSet<>();
// tagging for Subject
//tagging for Subject
final Set<String> subjects = new HashSet<>(); final Set<String> subjects = new HashSet<>();
result.getSubject().stream() result.getSubject().stream()
.map(subject -> subject.getValue()) .map(subject -> subject.getValue())
@ -73,89 +74,115 @@ public class ResultTagger {
communities.addAll(subjects); communities.addAll(subjects);
// Tagging for datasource
//Tagging for datasource
final Set<String> datasources = new HashSet<>(); final Set<String> datasources = new HashSet<>();
final Set<String> tmp = new HashSet<>(); final Set<String> tmp = new HashSet<>();
for(Instance i : result.getInstance()){ for (Instance i : result.getInstance()) {
tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(),"|")); tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(),"|")); tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
} }
result.getInstance() result.getInstance().stream()
.stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey())) .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd())) .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|")) .map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new)) .collect(Collectors.toCollection(HashSet::new))
.forEach(dsId -> datasources.addAll(conf.getCommunityForDatasource(dsId,param))); .forEach(dsId -> datasources.addAll(conf.getCommunityForDatasource(dsId, param)));
communities.addAll(datasources); communities.addAll(datasources);
/*Tagging for Zenodo Communities*/ /*Tagging for Zenodo Communities*/
final Set<String> czenodo = new HashSet<>(); final Set<String> czenodo = new HashSet<>();
//final ResultProtos.Result.Metadata.Builder mBuilder = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder(); result.getContext().stream()
result.getContext()
.stream()
.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR)) .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
.collect(Collectors.toList()) .collect(Collectors.toList())
.forEach(c->czenodo.addAll(conf.getCommunityForZenodoCommunityValue(c.getId().substring(c.getId().lastIndexOf("/")+1).trim()))); .forEach(
c ->
czenodo.addAll(
conf.getCommunityForZenodoCommunityValue(
c.getId()
.substring(c.getId().lastIndexOf("/") + 1)
.trim())));
communities.addAll(czenodo); communities.addAll(czenodo);
clearContext(result); clearContext(result);
/*Verify if there is something to bulktag*/ /*Verify if there is something to bulktag*/
if(communities.isEmpty()){ if (communities.isEmpty()) {
return result; return result;
} }
result.getContext() result.getContext().stream()
.stream() .map(
.map(c -> { c -> {
if(communities.contains(c.getId())){ if (communities.contains(c.getId())) {
List<DataInfo> dataInfoList = c.getDataInfo(); List<DataInfo> dataInfoList = c.getDataInfo();
if (subjects.contains(c.getId())) if (subjects.contains(c.getId()))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT)); dataInfoList.add(
if (datasources.contains(c.getId())) getDataInfo(
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE)); BULKTAG_DATA_INFO_TYPE,
if (czenodo.contains(c.getId())) CLASS_ID_SUBJECT,
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO)); CLASS_NAME_BULKTAG_SUBJECT));
} if (datasources.contains(c.getId()))
return c; dataInfoList.add(
}) getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
}
return c;
})
.collect(Collectors.toList()); .collect(Collectors.toList());
communities.removeAll(
result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));
communities.removeAll(result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet())); if (communities.isEmpty()) return result;
if(communities.isEmpty()) List<Context> toaddcontext =
return result; communities.stream()
.map(
List<Context> toaddcontext = communities c -> {
.stream() Context context = new Context();
.map(c -> { context.setId(c);
Context context = new Context(); List<DataInfo> dataInfoList = Arrays.asList();
context.setId(c); if (subjects.contains(c))
List<DataInfo> dataInfoList = Arrays.asList(); dataInfoList.add(
if (subjects.contains(c)) getDataInfo(
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT)); BULKTAG_DATA_INFO_TYPE,
if (datasources.contains(c)) CLASS_ID_SUBJECT,
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE)); CLASS_NAME_BULKTAG_SUBJECT));
if (czenodo.contains(c)) if (datasources.contains(c))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO)); dataInfoList.add(
context.setDataInfo(dataInfoList); getDataInfo(
return context; BULKTAG_DATA_INFO_TYPE,
}) CLASS_ID_DATASOURCE,
.collect(Collectors.toList()); CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
context.setDataInfo(dataInfoList);
return context;
})
.collect(Collectors.toList());
result.getContext().addAll(toaddcontext); result.getContext().addAll(toaddcontext);
return result; return result;
} }
public static DataInfo getDataInfo(String inference_provenance, String inference_class_id, String inference_class_name){ public static DataInfo getDataInfo(
String inference_provenance, String inference_class_id, String inference_class_name) {
DataInfo di = new DataInfo(); DataInfo di = new DataInfo();
di.setInferred(true); di.setInferred(true);
di.setInferenceprovenance(inference_provenance); di.setInferenceprovenance(inference_provenance);
@ -171,5 +198,4 @@ public class ResultTagger {
pa.setSchemename(DNET_SCHEMA_NAME); pa.setSchemename(DNET_SCHEMA_NAME);
return pa; return pa;
} }
} }

View File

@ -1,27 +1,23 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
public class TagginConstants { public class TagginConstants {
public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";
public final static String BULKTAG_DATA_INFO_TYPE = "bulktagging"; public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
public final static String DNET_SCHEMA_NAME = "dnet:provenanceActions"; public static final String CLASS_ID_SUBJECT = "community:subject";
public final static String DNET_SCHEMA_ID = "dnet:provenanceActions"; public static final String CLASS_ID_DATASOURCE = "community:datasource";
public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
public final static String CLASS_ID_SUBJECT = "bulktagging:community:subject";
public final static String CLASS_ID_DATASOURCE = "bulktagging:community:datasource";
public final static String CLASS_ID_CZENODO = "bulktagging:community:zenodocommunity";
public final static String SCHEMA_ID = "dnet:provenanceActions";
public final static String COUNTER_GROUP = "Bulk Tagging";
public final static String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public final static String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
public final static String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
public final static String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
public static final String SCHEMA_ID = "dnet:provenanceActions";
public static final String COUNTER_GROUP = "Bulk Tagging";
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
public static final String CLASS_NAME_BULKTAG_DATASOURCE =
"Bulktagging for Community - Datasource";
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
} }

View File

@ -1,13 +1,11 @@
package eu.dnetlib.dhp.community; package eu.dnetlib.dhp.community;
import com.google.gson.Gson; import com.google.gson.Gson;
import java.io.Serializable;
import org.dom4j.Node; import org.dom4j.Node;
/** Created by miriam on 01/08/2018. */
/** public class ZenodoCommunity implements Serializable {
* Created by miriam on 01/08/2018.
*/
public class ZenodoCommunity {
private String zenodoCommunityId; private String zenodoCommunityId;
@ -29,18 +27,16 @@ public class ZenodoCommunity {
this.selCriteria = selCriteria; this.selCriteria = selCriteria;
} }
private void setSelCriteria(String json){ private void setSelCriteria(String json) {
//Type collectionType = new TypeToken<Collection<Constraints>>(){}.getType(); // Type collectionType = new TypeToken<Collection<Constraints>>(){}.getType();
selCriteria = new Gson().fromJson(json, SelectionConstraints.class); selCriteria = new Gson().fromJson(json, SelectionConstraints.class);
} }
public void setSelCriteria(Node n){ public void setSelCriteria(Node n) {
if (n==null){ if (n == null) {
selCriteria = null; selCriteria = null;
}else{ } else {
setSelCriteria(n.getText()); setSelCriteria(n.getText());
} }
} }
} }

View File

@ -7,7 +7,7 @@ import java.lang.annotation.Target;
@Retention(RetentionPolicy.RUNTIME) @Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE) @Target(ElementType.TYPE)
public @interface VerbClass { @interface VerbClass {
public String value(); String value();
} }

View File

@ -5,12 +5,6 @@
"paramDescription": "URL of the isLookUp Service", "paramDescription": "URL of the isLookUp Service",
"paramRequired": true "paramRequired": true
}, },
{
"paramName":"mt",
"paramLongName":"master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{ {
"paramName":"s", "paramName":"s",
"paramLongName":"sourcePath", "paramLongName":"sourcePath",
@ -22,6 +16,36 @@
"paramLongName":"protoMap", "paramLongName":"protoMap",
"paramDescription": "the json path associated to each selection field", "paramDescription": "the json path associated to each selection field",
"paramRequired": true "paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "test",
"paramLongName": "isTest",
"paramDescription": "true if the job is executed in test mode, false otherwise",
"paramRequired": false
},
{
"paramName": "tg",
"paramLongName": "taggingConf",
"paramDescription": "the community tagging configuration, as a JSON string",
"paramRequired": false
} }
] ]

View File

@ -19,4 +19,28 @@
<name>hive_metastore_uris</name> <name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value> <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property> </property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>1</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration> </configuration>

View File

@ -1,13 +1,9 @@
<workflow-app name="result_to_community_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="bulk_tagging" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sourcePath</name> <name>sourcePath</name>
<description>the source path</description> <description>the source path</description>
</property> </property>
<property>
<name>allowedsemrels</name>
<description>the semantic relationships allowed for propagation</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -24,38 +20,163 @@
<name>isLookupUrl</name> <name>isLookupUrl</name>
<description>the isLookup service endpoint</description> <description>the isLookup service endpoint</description>
</property> </property>
<property>
<name>protoMap</name>
<description>the json path associated to each selection field</description>
</property>
</parameters> </parameters>
<start to="ResultToCommunityFromSemRelPropagation"/> <start to="reset-outputpath"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="ResultToCommunityFromSemRelPropagation"> <action name="reset-outputpath">
<spark xmlns="uri:oozie:spark-action:0.2"> <fs>
<job-tracker>${jobTracker}</job-tracker> <delete path='${workingDir}/relation'/>
<name-node>${nameNode}</name-node> <delete path='${workingDir}/publication'/>
<master>yarn-cluster</master> <delete path='${workingDir}/dataset'/>
<mode>cluster</mode> <delete path='${workingDir}/otherresearchproduct'/>
<name>ResultToCommunitySemRelPropagation</name> <delete path='${workingDir}/software'/>
<class>eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob</class> </fs>
<jar>dhp-propagation-${projectVersion}.jar</jar> <ok to="copy_relation"/>
<spark-opts>--executor-memory ${sparkExecutorMemory} <error to="Kill"/>
--executor-cores ${sparkExecutorCores} </action>
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<!-- <arg>-allowedsemrels</arg><arg>${allowedsemrels}</arg>-->
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<!-- <arg>-isLookupUrl</arg><arg>${isLookupUrl}</arg>-->
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="copy_relation">
<!-- Copies the relation directory from the source path to the working directory via distcp, then hands over to fork_exec_bulktag; any failure goes to Kill. -->
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${workingDir}/relation</arg>
</distcp>
<ok to="fork_exec_bulktag"/>
<error to="Kill"/>
</action>
<!-- Starts the bulk tagging of the four result types in parallel. Each branch ends in the "wait" join node. This must be a fork control node: path elements are not valid children of an action node. -->
<fork name="fork_exec_bulktag">
    <path start="join_bulktag_publication"/>
    <path start="join_bulktag_dataset"/>
    <path start="join_bulktag_otherresearchproduct"/>
    <path start="join_bulktag_software"/>
</fork>
<action name="join_bulktag_publication">
    <!-- Runs SparkBulkTagJob on the publication directory of the source path; the output path is the publication directory of the working dir. -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>bulkTagging</name>
        <class>eu.dnetlib.dhp.SparkBulkTagJob</class>
        <jar>dhp-bulktag-${projectVersion}.jar</jar>
        <spark-opts>
            --num-executors=${sparkExecutorNumber}
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
        <!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
        <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
        <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
        <!-- The long option name must match the paramLongName "protoMap" declared in the job parameter file. -->
        <arg>--protoMap</arg><arg>${protoMap}</arg>
        <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
    </spark>
    <ok to="wait"/>
    <error to="Kill"/>
</action>
<action name="join_bulktag_dataset">
    <!-- Runs SparkBulkTagJob on the dataset directory of the source path; the output path is the dataset directory of the working dir. -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>bulkTagging</name>
        <class>eu.dnetlib.dhp.SparkBulkTagJob</class>
        <jar>dhp-bulktag-${projectVersion}.jar</jar>
        <spark-opts>
            --num-executors=${sparkExecutorNumber}
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
        <!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
        <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
        <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
        <!-- The long option name must match the paramLongName "protoMap" declared in the job parameter file. -->
        <arg>--protoMap</arg><arg>${protoMap}</arg>
        <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
    </spark>
    <ok to="wait"/>
    <error to="Kill"/>
</action>
<action name="join_bulktag_otherresearchproduct">
    <!-- Runs SparkBulkTagJob on the otherresearchproduct directory of the source path; the output path is the otherresearchproduct directory of the working dir. -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>bulkTagging</name>
        <class>eu.dnetlib.dhp.SparkBulkTagJob</class>
        <jar>dhp-bulktag-${projectVersion}.jar</jar>
        <spark-opts>
            --num-executors=${sparkExecutorNumber}
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
        <!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
        <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
        <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
        <!-- The long option name must match the paramLongName "protoMap" declared in the job parameter file. -->
        <arg>--protoMap</arg><arg>${protoMap}</arg>
        <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
    </spark>
    <ok to="wait"/>
    <error to="Kill"/>
</action>
<action name="join_bulktag_software">
    <!-- Runs SparkBulkTagJob on the software directory of the source path; the output path is the software directory of the working dir. -->
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>bulkTagging</name>
        <class>eu.dnetlib.dhp.SparkBulkTagJob</class>
        <jar>dhp-bulktag-${projectVersion}.jar</jar>
        <spark-opts>
            --num-executors=${sparkExecutorNumber}
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
        <!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
        <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
        <arg>--outputPath</arg><arg>${workingDir}/software</arg>
        <!-- The long option name must match the paramLongName "protoMap" declared in the job parameter file. -->
        <arg>--protoMap</arg><arg>${protoMap}</arg>
        <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
    </spark>
    <ok to="wait"/>
    <error to="Kill"/>
</action>
<join name="wait" to="End"/>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -0,0 +1,233 @@
package eu.dnetlib.dhp;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.mortbay.util.IO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Runs the bulk tagging Spark job on a local Spark session, feeding it the sample dataset and the
 * tagging configuration shipped with the test resources.
 */
public class BulkTagJobTest {

    /** Classpath location of the tagging configuration passed to the job. */
    private static final String TAGGING_CONF_RESOURCE =
            "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json";

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class);

    private static SparkSession spark;

    private static Path workingDir;

    private static String taggingConf = "";

    /**
     * Loads the tagging configuration, creates the temporary working directory and starts the
     * local Spark session shared by the tests.
     *
     * @throws IOException if the tagging configuration cannot be read or the working directory
     *     cannot be created
     */
    @BeforeAll
    public static void beforeAll() throws IOException {
        // Fail here, with a clear cause, instead of running the job with an empty configuration.
        taggingConf = loadTaggingConf();

        workingDir = Files.createTempDirectory(BulkTagJobTest.class.getSimpleName());
        log.info("using work dir {}", workingDir);

        SparkConf conf = new SparkConf();
        conf.setAppName(BulkTagJobTest.class.getSimpleName());

        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

        spark =
                SparkSession.builder()
                        .appName(BulkTagJobTest.class.getSimpleName())
                        .config(conf)
                        .getOrCreate();
    }

    /**
     * Stops the Spark session and removes the working directory.
     *
     * @throws IOException if the working directory cannot be deleted
     */
    @AfterAll
    public static void afterAll() throws IOException {
        // The session is configured to use workingDir as its warehouse dir, so stop it before
        // deleting the directory; this also guarantees the session is stopped if deletion fails.
        spark.stop();
        FileUtils.deleteDirectory(workingDir.toFile());
    }

    /**
     * Executes the bulk tagging job on the sample dataset.
     *
     * <p>NOTE(review): the test only checks that the job completes without throwing; no
     * assertion is made on the content written to the output path.
     *
     * @throws Exception if the job fails
     */
    @Test
    public void test1() throws Exception {
        SparkBulkTagJob2.main(
                new String[] {
                    "-isTest",
                    Boolean.TRUE.toString(),
                    "-isSparkSessionManaged",
                    Boolean.FALSE.toString(),
                    "-sourcePath",
                    getClass().getResource("/eu/dnetlib/dhp/sample/dataset").getPath(),
                    "-taggingConf",
                    taggingConf,
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Dataset",
                    "-outputPath",
                    workingDir.toString() + "/dataset",
                    "-isLookupUrl",
                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
                    "-protoMap",
                    "{ \"author\" : \"$['author'][*]['fullname']\","
                            + " \"title\" : \"$['title'][*]['value']\","
                            + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
                            + " \"contributor\" : \"$['contributor'][*]['value']\","
                            + " \"description\" : \"$['description'][*]['value']\"}"
                });
    }

    /**
     * Reads the tagging configuration from the classpath.
     *
     * @return the content of the tagging configuration resource
     * @throws IOException if the resource is missing or cannot be read
     */
    private static String loadTaggingConf() throws IOException {
        try (java.io.InputStream in =
                BulkTagJobTest.class.getResourceAsStream(TAGGING_CONF_RESOURCE)) {
            if (in == null) {
                throw new IOException("missing test resource " + TAGGING_CONF_RESOURCE);
            }
            return IO.toString(in);
        }
    }
}
/*
import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest;
import eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob2;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.apache.spark.sql.functions.desc;
@Test
public void test1() throws Exception {
SparkResultToCommunityThroughSemRelJob4.main(new String[]{
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample").getPath(),
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-preparedInfoPath", getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath()
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc.textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count());
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
verificationDataset.createOrReplaceTempView("dataset");
String query = "select id, MyT.id community " +
"from dataset " +
"lateral view explode(context) c as MyT " +
"lateral view explode(MyT.datainfo) d as MyD " +
"where MyD.inferenceprovenance = 'propagation'";
org.apache.spark.sql.Dataset<Row> resultExplodedProvenance = spark.sql(query);
Assertions.assertEquals(5, resultExplodedProvenance.count());
Assertions.assertEquals(0, resultExplodedProvenance.filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'").count());
Assertions.assertEquals(1, resultExplodedProvenance.filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'").count());
Assertions.assertEquals("dh-ch",resultExplodedProvenance.select("community")
.where(resultExplodedProvenance.col("id").equalTo("50|dedup_wf_001::0489ae524201eedaa775da282dce35e7"))
.collectAsList().get(0).getString(0));
Assertions.assertEquals(3, resultExplodedProvenance.filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'").count());
List<Row> rowList = resultExplodedProvenance.select("community")
.where(resultExplodedProvenance.col("id")
.equalTo("50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28"))
.sort(desc("community")).collectAsList();
Assertions.assertEquals("mes", rowList.get(0).getString(0));
Assertions.assertEquals("fam", rowList.get(1).getString(0));
Assertions.assertEquals("ee", rowList.get(2).getString(0));
Assertions.assertEquals(1, resultExplodedProvenance.filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'").count());
Assertions.assertEquals("aginfra", resultExplodedProvenance.select("community")
.where(resultExplodedProvenance.col("id")
.equalTo("50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc"))
.collectAsList().get(0).getString(0));
query = "select id, MyT.id community " +
"from dataset " +
"lateral view explode(context) c as MyT " +
"lateral view explode(MyT.datainfo) d as MyD ";
org.apache.spark.sql.Dataset<Row> resultCommunityId = spark.sql(query);
Assertions.assertEquals(10, resultCommunityId.count());
Assertions.assertEquals(2, resultCommunityId.filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'").count());
rowList = resultCommunityId.select("community")
.where(resultCommunityId.col("id").equalTo("50|dedup_wf_001::0489ae524201eedaa775da282dce35e7"))
.sort(desc("community"))
.collectAsList();
Assertions.assertEquals("dh-ch", rowList.get(0).getString(0));
Assertions.assertEquals("beopen", rowList.get(1).getString(0));
Assertions.assertEquals(3, resultCommunityId.filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'").count());
rowList = resultCommunityId.select("community")
.where(resultCommunityId.col("id").equalTo("50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28"))
.sort(desc("community"))
.collectAsList();
Assertions.assertEquals("mes", rowList.get(0).getString(0));
Assertions.assertEquals("fam", rowList.get(1).getString(0));
Assertions.assertEquals("ee", rowList.get(2).getString(0));
Assertions.assertEquals(2, resultCommunityId.filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'").count());
rowList = resultCommunityId.select("community")
.where(resultCommunityId.col("id").equalTo("50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc"))
.sort(desc("community"))
.collectAsList();
Assertions.assertEquals("beopen", rowList.get(0).getString(0));
Assertions.assertEquals("aginfra", rowList.get(1).getString(0));
Assertions.assertEquals(2, resultCommunityId.filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'").count());
rowList = resultCommunityId.select("community")
.where(resultCommunityId.col("id").equalTo("50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b"))
.sort(desc("community"))
.collectAsList();
Assertions.assertEquals("euromarine", rowList.get(1).getString(0));
Assertions.assertEquals("ni", rowList.get(0).getString(0));
Assertions.assertEquals(1, resultCommunityId.filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'").count());
Assertions.assertEquals("euromarine", resultCommunityId.select("community")
.where(resultCommunityId.col("id")
.equalTo("50|doajarticles::8d817039a63710fcf97e30f14662c6c8"))
.collectAsList().get(0).getString(0));
}
*/

View File

@ -0,0 +1,147 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import eu.dnetlib.dhp.community.CommunityConfiguration;
import eu.dnetlib.dhp.community.CommunityConfigurationFactory;
import eu.dnetlib.dhp.community.Constraint;
import eu.dnetlib.dhp.community.SelectionConstraints;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.DocumentException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/** Created by miriam on 03/08/2018. */
/**
 * Tests the parsing of community configurations (XML and JSON) and the evaluation of selection
 * criteria.
 *
 * <p>Created by miriam on 03/08/2018.
 */
public class CommunityConfigurationFactoryTest {

    /** Classpath directory holding the configuration files used by the tests. */
    private static final String CONF_DIR = "/eu/dnetlib/dhp/communityconfiguration/";

    private final VerbResolver resolver = new VerbResolver();

    /** The XML configuration must yield 5 communities, each with a non-blank id. */
    @Test
    public void parseTest() throws DocumentException, IOException {
        final CommunityConfiguration cc =
                CommunityConfigurationFactory.newInstance(readConf("community_configuration.xml"));
        Assertions.assertEquals(5, cc.size());
        cc.getCommunityList()
                .forEach(c -> Assertions.assertTrue(StringUtils.isNotBlank(c.getId())));
    }

    /** A "not_contains" constraint must not be satisfied by a text containing its value. */
    @Test
    public void applyVerb()
            throws InvocationTargetException, IllegalAccessException, NoSuchMethodException,
                    InstantiationException {
        Constraint sc = new Constraint();
        sc.setVerb("not_contains");
        sc.setField("contributor");
        sc.setValue("DARIAH");
        sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue()));
        String metadata = "This work has been partially supported by DARIAH-EU infrastructure";
        Assertions.assertFalse(sc.verifyCriteria(metadata));
    }

    /** The given datasource and parameters must be associated with the "dariah" community only. */
    @Test
    public void loadSelCriteriaTest() throws DocumentException, IOException {
        final CommunityConfiguration cc =
                CommunityConfigurationFactory.newInstance(
                        readConf("community_configuration_selcrit.xml"));
        Map<String, List<String>> param = new HashMap<>();
        param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi")));
        param.put(
                "description",
                new ArrayList<>(
                        Collections.singletonList(
                                "This work has been partially supported by DARIAH-EU infrastructure")));
        param.put(
                "contributor",
                new ArrayList<>(
                        Collections.singletonList(
                                "Pallino ha aiutato a scrivere il paper. Pallino lavora per DARIAH")));
        List<String> comm =
                cc.getCommunityForDatasource(
                        "openaire____::1cfdb2e14977f31a98e0118283401f32", param);
        Assertions.assertEquals(1, comm.size());
        Assertions.assertEquals("dariah", comm.get(0));
    }

    /** The JSON configuration with selection criteria must be parsed and printable. */
    @Test
    public void test4() throws DocumentException, IOException {
        final CommunityConfiguration cc =
                CommunityConfigurationFactory.fromJson(
                        readConf("community_configuration_selcrit.json"));
        Assertions.assertNotNull(cc);
        // toString() is invoked only to check that it does not throw on a parsed configuration.
        cc.toString();
    }

    /** The plain JSON configuration must be parsed and serialized back to JSON. */
    @Test
    public void test5() throws IOException, DocumentException {
        final CommunityConfiguration cc =
                CommunityConfigurationFactory.fromJson(readConf("community_configuration.json"));
        Assertions.assertNotNull(cc);
        System.out.println(cc.toJson());
    }

    /** Gson must be able to deserialize a single constraint and a full set of criteria. */
    @Test
    public void test6() {
        String json =
                "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}";

        String step1 = "{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}";

        Constraint c = new Gson().fromJson(step1, Constraint.class);
        Assertions.assertNotNull(c);

        SelectionConstraints sl = new Gson().fromJson(json, SelectionConstraints.class);
        Assertions.assertNotNull(sl);
    }

    /** The tagging configuration must be parsed and serialized back to JSON. */
    @Test
    public void test7() throws IOException {
        final CommunityConfiguration cc =
                CommunityConfigurationFactory.fromJson(readConf("tagging_conf.json"));
        Assertions.assertNotNull(cc);
        System.out.println(cc.toJson());
    }

    /**
     * Reads a configuration file from the test resources as UTF-8 text.
     *
     * @param fileName name of the file inside the configuration directory
     * @return the file content
     * @throws IOException if the resource cannot be read
     */
    private String readConf(String fileName) throws IOException {
        try (java.io.InputStream in = getClass().getResourceAsStream(CONF_DIR + fileName)) {
            Assertions.assertNotNull(in, "missing test resource " + CONF_DIR + fileName);
            // Explicit charset: the single-argument overload depends on the platform default.
            return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }
}

View File

@ -0,0 +1,694 @@
{"communities": {
"clarin": {
"id": "clarin",
"subjects": [],
"datasources": [
{
"openaireId": "re3data_____::a507cdacc5bbcc08761c92185dee5cab"
}
],
"zenodoCommunities": [
]
},
"ee": {
"id": "ee",
"subjects": [
"SDG13 - Climate action",
"SDG8 - Decent work and economic\n\t\t\t\t\tgrowth",
"SDG15 - Life on land",
"SDG2 - Zero hunger",
"SDG17 - Partnerships for the\n\t\t\t\t\tgoals",
"SDG10 - Reduced inequalities",
"SDG5 - Gender equality",
"SDG12 - Responsible\n\t\t\t\t\tconsumption and production",
"SDG14 - Life below water",
"SDG6 - Clean water and\n\t\t\t\t\tsanitation",
"SDG11 - Sustainable cities and communities",
"SDG1 - No poverty",
"SDG3 -\n\t\t\t\t\tGood health and well being",
"SDG7 - Affordable and clean energy",
"SDG4 - Quality\n\t\t\t\t\teducation",
"SDG9 - Industry innovation and infrastructure",
"SDG16 - Peace justice\n\t\t\t\t\tand strong institutions"
],
"datasources": [
],
"zenodoCommunities": [
]
},
"aginfra": {
"id": "aginfra",
"subjects": [
"animal production and health",
"fisheries and aquaculture",
"food safety and human nutrition",
"information management",
"food technology",
"agri-food education and extension",
"natural resources and environment",
"food system",
"engineering technology and Research",
"agriculture",
"food safety risk assessment",
"food security",
"farming practices and systems",
"plant production and protection",
"agri-food economics and policy",
"food distribution",
"forestry"
],
"datasources": [
{
"openaireId": "opendoar____::1a551829d50f1400b0dab21fdd969c04"
},
{
"openaireId": "opendoar____::49af6c4e558a7569d80eee2e035e2bd7"
},
{
"openaireId": "opendoar____::0266e33d3f546cb5436a10798e657d97"
},
{
"openaireId": "opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06"
},
{
"openaireId": "opendoar____::41bfd20a38bb1b0bec75acf0845530a7"
},
{
"openaireId": "opendoar____::87ae6fb631f7c8a627e8e28785d9992d"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "edenis"
},
{
"zenodoCommunityId": "efsa-pilot"
},
{
"zenodoCommunityId": "egene3"
},
{
"zenodoCommunityId": "efsa-kj"
},
{
"zenodoCommunityId": "euromixproject"
},
{
"zenodoCommunityId": "discardless"
},
{
"zenodoCommunityId": "sedinstcjfst"
},
{
"zenodoCommunityId": "afinet-kc"
},
{
"zenodoCommunityId": "2231-4784"
},
{
"zenodoCommunityId": "2231-0606"
},
{
"zenodoCommunityId": "solace"
},
{
"zenodoCommunityId": "pa17"
},
{
"zenodoCommunityId": "smartakis"
},
{
"zenodoCommunityId": "sedinstcjae"
},
{
"zenodoCommunityId": "phenology_camera"
},
{
"zenodoCommunityId": "aginfra"
},
{
"zenodoCommunityId": "erosa"
},
{
"zenodoCommunityId": "bigdatagrapes"
}
]
},
"fam": {
"id": "fam",
"subjects": [
"Stock Assessment",
"pelagic",
"Fish farming",
"EMFF",
"Fisheries",
"Fishermen",
"maximum sustainable yield",
"trawler",
"Fishing vessel",
"Fisherman",
"Fishing gear",
"RFMO",
"Fish Aggregating Device",
"Bycatch",
"Fishery",
"common fisheries policy",
"Fishing fleet",
"Aquaculture"
],
"datasources": [
{
"openaireId": "doajarticles::8cec81178926caaca531afbd8eb5d64c"
},
{
"openaireId": "doajarticles::0f7a7f30b5400615cae1829f3e743982"
},
{
"openaireId": "doajarticles::9740f7f5af3e506d2ad2c215cdccd51a"
},
{
"openaireId": "doajarticles::9f3fbaae044fa33cb7069b72935a3254"
},
{
"openaireId": "doajarticles::cb67f33eb9819f5c624ce0313957f6b3"
},
{
"openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906"
},
{
"openaireId": "doajarticles::554cde3be9e5c4588b4c4f9f503120cb"
},
{
"openaireId": "tubitakulakb::11e22f49e65b9fd11d5b144b93861a1b"
},
{
"openaireId": "doajarticles::57c5d3837da943e93b28ec4db82ec7a5"
},
{
"openaireId": "doajarticles::a186f5ddb8e8c7ecc992ef51cf3315b1"
},
{
"openaireId": "doajarticles::e21c97cbb7a209afc75703681c462906"
},
{
"openaireId": "doajarticles::dca64612dfe0963fffc119098a319957"
},
{
"openaireId": "doajarticles::dd70e44479f0ade25aa106aef3e87a0a"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "discardless"
},
{
"zenodoCommunityId": "farfish2020"
},
{
"zenodoCommunityId": "facts"
},
{
"zenodoCommunityId": "climefish"
},
{
"zenodoCommunityId": "proeel"
},
{
"zenodoCommunityId": "primefish"
},
{
"zenodoCommunityId": "h2020_vicinaqua"
},
{
"zenodoCommunityId": "meece"
},
{
"zenodoCommunityId": "rlsadb"
}
]
},
"instruct": {
"id": "instruct",
"subjects": [
],
"datasources": [
],
"zenodoCommunities": [
{
"zenodoCommunityId": "instruct"
},
{
"zenodoCommunityId": "west-life"
}
]
},
"mes": {
"id": "mes",
"subjects": [
"marine",
"ocean",
"fish",
"aqua",
"sea"
],
"datasources": [
],
"zenodoCommunities": [
{
"zenodoCommunityId": "adriplan"
},
{
"zenodoCommunityId": "devotes-project"
},
{
"zenodoCommunityId": "euro-basin"
},
{
"zenodoCommunityId": "naclim"
},
{
"zenodoCommunityId": "discardless"
},
{
"zenodoCommunityId": "assisibf"
},
{
"zenodoCommunityId": "meece"
},
{
"zenodoCommunityId": "facts"
},
{
"zenodoCommunityId": "proeel"
},
{
"zenodoCommunityId": "aquatrace"
},
{
"zenodoCommunityId": "myfish"
},
{
"zenodoCommunityId": "atlas"
},
{
"zenodoCommunityId": "blue-actionh2020"
},
{
"zenodoCommunityId": "sponges"
},
{
"zenodoCommunityId": "merces_project"
},
{
"zenodoCommunityId": "bigdataocean"
},
{
"zenodoCommunityId": "columbus"
},
{
"zenodoCommunityId": "h2020-aquainvad-ed"
},
{
"zenodoCommunityId": "aquarius"
},
{
"zenodoCommunityId": "southern-ocean-observing-system"
},
{
"zenodoCommunityId": "eawag"
},
{
"zenodoCommunityId": "mossco"
},
{
"zenodoCommunityId": "onc"
},
{
"zenodoCommunityId": "oceanbiogeochemistry"
},
{
"zenodoCommunityId": "oceanliteracy"
},
{
"zenodoCommunityId": "openearth"
},
{
"zenodoCommunityId": "ocean"
},
{
"zenodoCommunityId": "calcifierraman"
},
{
"zenodoCommunityId": "bermudabream"
},
{
"zenodoCommunityId": "brcorp1"
},
{
"zenodoCommunityId": "mce"
},
{
"zenodoCommunityId": "biogeochem"
},
{
"zenodoCommunityId": "ecc2014"
},
{
"zenodoCommunityId": "fisheries"
},
{
"zenodoCommunityId": "sedinstcjfas"
},
{
"zenodoCommunityId": "narmada"
},
{
"zenodoCommunityId": "umr-entropie"
},
{
"zenodoCommunityId": "farfish2020"
},
{
"zenodoCommunityId": "primefish"
},
{
"zenodoCommunityId": "zf-ilcs"
},
{
"zenodoCommunityId": "climefish"
},
{
"zenodoCommunityId": "afrimed_eu"
},
{
"zenodoCommunityId": "spi-ace"
},
{
"zenodoCommunityId": "cice-consortium"
},
{
"zenodoCommunityId": "nemo-ocean"
},
{
"zenodoCommunityId": "mesopp-h2020"
},
{
"zenodoCommunityId": "marxiv"
}
]
},
"ni": {
"id": "ni",
"subjects": [
"brain mapping",
"brain imaging",
"electroencephalography",
"arterial spin labelling",
"brain fingerprinting",
"brain",
"neuroimaging",
"Multimodal Brain Image Analysis",
"fMRI",
"neuroinformatics",
"fetal brain",
"brain ultrasonic imaging",
"topographic brain mapping",
"diffusion tensor imaging",
"computerized knowledge assessment",
"connectome mapping",
"brain magnetic resonance imaging",
"brain abnormalities"
],
"datasources": [
{
"openaireId": "re3data_____::5b9bf9171d92df854cf3c520692e9122"
},
{
"openaireId": "doajarticles::c7d3de67dc77af72f6747157441252ec"
},
{
"openaireId": "re3data_____::8515794670370f49c1d176c399c714f5"
},
{
"openaireId": "doajarticles::d640648c84b10d425f96f11c3de468f3"
},
{
"openaireId": "doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"
},
{
"openaireId": "rest________::fb1a3d4523c95e63496e3bc7ba36244b"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "neuroinformatics"
},
{
"zenodoCommunityId": "hbp"
},
{
"zenodoCommunityId": "from_neuroscience_to_machine_learning"
},
{
"zenodoCommunityId": "ci2c"
},
{
"zenodoCommunityId": "opensourcebrain"
},
{
"zenodoCommunityId": "brainspeak"
},
{
"zenodoCommunityId": "braincom"
},
{
"zenodoCommunityId": "nextgenvis"
},
{
"zenodoCommunityId": "meso-brain"
},
{
"zenodoCommunityId": "neuroplasticity-workshop"
},
{
"zenodoCommunityId": "bionics"
},
{
"zenodoCommunityId": "brainmattrain-676408"
},
{
"zenodoCommunityId": "repronim"
},
{
"zenodoCommunityId": "affectiveneuro"
},
{
"zenodoCommunityId": "con"
},
{
"zenodoCommunityId": "lab_neurol_sperim_irfmn_irccs_milano_it"
}
]
},
"dariah": {
"id": "dariah",
"subjects": [
],
"datasources": [
{
"openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a",
"sc": {
"cl": {
"criteria": [
{
"ce": {
"constraint": [
{
"verb": "contains",
"field": "contributor",
"value": "DARIAH"
}
]
}
}
]
}
}
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "dimpo"
}
]
},
"rda": {
"id": "rda",
"subjects": [
],
"datasources": [
],
"zenodoCommunities": [
{
"zenodoCommunityId": "rda"
}
]
},
"dh-ch": {
"id": "dh-ch",
"subjects": [
"modern art",
"metadata",
"monuments",
"sites",
"field walking",
"frescoes",
"excavation",
"ontologies",
"mapping",
"cities",
"temples",
"lithics",
"roads",
"digital cultural heritage",
"interoperability",
"archaeological reports",
"churches",
"standards",
"archaeological stratigraphy",
"buidings",
"digital humanities",
"survey",
"archaeological sites",
"CIDOC CRM",
"decorations",
"classic art",
"stratigraphy",
"digital archaeology",
"walls",
"data science",
"chapels",
"paintings",
"archaeology",
"fair data",
"mosaics",
"data visualization",
"burials",
"medieval art",
"castles",
"statues",
"natural language processing",
"inscriptions",
"vaults",
"open data",
"contemporary art",
"3D",
"pottery",
"site",
"metadata schema",
"architectural",
"vessels"
],
"datasources": [
{
"openaireId": "re3data_____::9ebe127e5f3a0bf401875690f3bb6b81"
},
{
"openaireId": "doajarticles::c6cd4b532e12868c1d760a8d7cda6815"
},
{
"openaireId": "doajarticles::a6de4499bb87bf3c01add0a9e2c9ed0b"
},
{
"openaireId": "doajarticles::6eb31d13b12bc06bbac06aef63cf33c9"
},
{
"openaireId": "doajarticles::0da84e9dfdc8419576169e027baa8028"
},
{
"openaireId": "re3data_____::84e123776089ce3c7a33db98d9cd15a8"
},
{
"openaireId": "openaire____::c5502a43e76feab55dd00cf50f519125"
},
{
"openaireId": "re3data_____::a48f09c562b247a9919acfe195549b47"
},
{
"openaireId": "opendoar____::97275a23ca44226c9964043c8462be96"
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "storm"
},
{
"zenodoCommunityId": "crosscult"
},
{
"zenodoCommunityId": "wholodance_eu"
},
{
"zenodoCommunityId": "digcur2013"
},
{
"zenodoCommunityId": "gravitate"
},
{
"zenodoCommunityId": "dipp2014"
},
{
"zenodoCommunityId": "digitalhumanities"
},
{
"zenodoCommunityId": "dimpo"
},
{
"zenodoCommunityId": "adho"
},
{
"zenodoCommunityId": "chc"
},
{
"zenodoCommunityId": "wahr"
},
{
"zenodoCommunityId": "ibe"
},
{
"zenodoCommunityId": "ariadne"
},
{
"zenodoCommunityId": "parthenos-hub"
},
{
"zenodoCommunityId": "parthenos-training"
},
{
"zenodoCommunityId": "gandhara"
},
{
"zenodoCommunityId": "cmsouthasia"
},
{
"zenodoCommunityId": "nilgirihills"
},
{
"zenodoCommunityId": "shamsa_mustecio"
},
{
"zenodoCommunityId": "bodhgaya"
}
]
}
}
}

View File

@ -0,0 +1,176 @@
<communities>
<community id="fet-fp7">
<oacommunity/>
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fet-h2020">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="oa-pg">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ee">
<subjects>
<subject>SDG13 - Climate action</subject>
<subject>SDG8 - Decent work and economic growth</subject>
<subject>SDG15 - Life on land</subject>
<subject>SDG2 - Zero hunger</subject>
<subject>SDG17 - Partnerships for the goals</subject>
<subject>SDG10 - Reduced inequalities</subject>
<subject>SDG5 - Gender equality</subject>
<subject>SDG12 - Responsible consumption and production</subject>
<subject>SDG14 - Life below water</subject>
<subject>SDG6 - Clean water and sanitation</subject>
<subject>SDG11 - Sustainable cities and communities</subject>
<subject>SDG1 - No poverty</subject>
<subject>SDG3 - Good health and well being</subject>
<subject>SDG7 - Affordable and clean energy</subject>
<subject>SDG4 - Quality education</subject>
<subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject>
</subjects>
<datasources/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>123</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
</community>
<community id="dh-ch">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fam">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ni">
<subjects>
<subject>brain mapping</subject>
<subject>brain imaging</subject>
<subject>electroencephalography</subject>
<subject>arterial spin labelling</subject>
<subject>brain fingerprinting</subject>
<subject>brain</subject>
<subject>neuroimaging</subject>
<subject>Multimodal Brain Image Analysis</subject>
<subject>fMRI</subject>
<subject>neuroinformatics</subject>
<subject>fetal brain</subject>
<subject>brain ultrasonic imaging</subject>
<subject>topographic brain mapping</subject>
<subject>diffusion tensor imaging</subject>
<subject>computerized knowledge assessment</subject>
<subject>connectome mapping</subject>
<subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::c7d3de67dc77af72f6747157441252ec</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>re3data_____::8515794670370f49c1d176c399c714f5</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::d640648c84b10d425f96f11c3de468f3</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="mes">
<subjects>
<subject>marine</subject>
<subject>ocean</subject>
<subject>fish</subject>
<subject>aqua</subject>
<subject>sea</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="aginfra">
<subjects>
<subject>animal production and health</subject>
<subject>fisheries and aquaculture</subject>
<subject>food safety and human nutrition</subject>
<subject>information management</subject>
<subject>food technology</subject>
<subject>agri-food education and extension</subject>
<subject>natural resources and environment</subject>
<subject>food system</subject>
<subject>engineering technology and Research</subject>
<subject>agriculture</subject>
<subject>food safety risk assessment</subject>
<subject>food security</subject>
<subject>farming practices and systems</subject>
<subject>plant production and protection</subject>
<subject>agri-food economics and policy</subject>
<subject>food distribution</subject>
<subject>forestry</subject>
</subjects>
<datasources>
<datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::49af6c4e558a7569d80eee2e035e2bd7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::0266e33d3f546cb5436a10798e657d97</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::41bfd20a38bb1b0bec75acf0845530a7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="clarin">
<oacommunity>oac_clarin</oacommunity>
<subjects/>
<datasources>
<datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
</communities>

View File

@ -0,0 +1,37 @@
{
"communities": {
"dariah": {
"id": "dariah",
"subjects": [
],
"datasources": [
{
"openaireId": "opendoar____::7e7757b1e12abcb736ab9a754ffb617a",
"sc": {
"cl": {
"criteria": [
{
"ce": {
"constraint": [
{
"verb": "contains",
"field": "contributor",
"value": "DARIAH"
}
]
}
}
]
}
}
}
],
"zenodoCommunities": [
{
"zenodoCommunityId": "dimpo"
}
]
}
}
}

View File

@ -0,0 +1,193 @@
<communities>
<community id="fet-fp7">
<oacommunity/>
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fet-h2020">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="oa-pg">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ee">
<subjects>
<subject>SDG13 - Climate action</subject>
<subject>SDG8 - Decent work and economic growth</subject>
<subject>SDG15 - Life on land</subject>
<subject>SDG2 - Zero hunger</subject>
<subject>SDG17 - Partnerships for the goals</subject>
<subject>SDG10 - Reduced inequalities</subject>
<subject>SDG5 - Gender equality</subject>
<subject>SDG12 - Responsible consumption and production</subject>
<subject>SDG14 - Life below water</subject>
<subject>SDG6 - Clean water and sanitation</subject>
<subject>SDG11 - Sustainable cities and communities</subject>
<subject>SDG1 - No poverty</subject>
<subject>SDG3 - Good health and well being</subject>
<subject>SDG7 - Affordable and clean energy</subject>
<subject>SDG4 - Quality education</subject>
<subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject>
</subjects>
<datasources/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>123</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
</community>
<community id="dh-ch">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="fam">
<subjects/>
<datasources/>
<zenodocommunities/>
</community>
<community id="ni">
<subjects>
<subject>brain mapping</subject>
<subject>brain imaging</subject>
<subject>electroencephalography</subject>
<subject>arterial spin labelling</subject>
<subject>brain fingerprinting</subject>
<subject>brain</subject>
<subject>neuroimaging</subject>
<subject>Multimodal Brain Image Analysis</subject>
<subject>fMRI</subject>
<subject>neuroinformatics</subject>
<subject>fetal brain</subject>
<subject>brain ultrasonic imaging</subject>
<subject>topographic brain mapping</subject>
<subject>diffusion tensor imaging</subject>
<subject>computerized knowledge assessment</subject>
<subject>connectome mapping</subject>
<subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::c7d3de67dc77af72f6747157441252ec</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>re3data_____::8515794670370f49c1d176c399c714f5</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::d640648c84b10d425f96f11c3de468f3</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="mes">
<subjects>
<subject>marine</subject>
<subject>ocean</subject>
<subject>fish</subject>
<subject>aqua</subject>
<subject>sea</subject>
</subjects>
<datasources>
<datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="aginfra">
<subjects>
<subject>animal production and health</subject>
<subject>fisheries and aquaculture</subject>
<subject>food safety and human nutrition</subject>
<subject>information management</subject>
<subject>food technology</subject>
<subject>agri-food education and extension</subject>
<subject>natural resources and environment</subject>
<subject>food system</subject>
<subject>engineering technology and Research</subject>
<subject>agriculture</subject>
<subject>food safety risk assessment</subject>
<subject>food security</subject>
<subject>farming practices and systems</subject>
<subject>plant production and protection</subject>
<subject>agri-food economics and policy</subject>
<subject>food distribution</subject>
<subject>forestry</subject>
</subjects>
<datasources>
<datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::49af6c4e558a7569d80eee2e035e2bd7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::0266e33d3f546cb5436a10798e657d97</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::41bfd20a38bb1b0bec75acf0845530a7</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="clarin">
<oacommunity>oac_clarin</oacommunity>
<subjects/>
<datasources>
<datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities/>
</community>
<community id="dariah">
<oacommunity>oaa_dariah</oacommunity>
<subjects/>
<datasources>
<datasource>
<openaireId>openaire____::1cfdb2e14977f31a98e0118283401f32</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
</selcriteria>
</datasource>
</datasources>
<zenodocommunities>
<zenodocommunity>
<zenodoid>dimpo</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
</community>
</communities>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,473 @@
package eu.dnetlib.dhp.resulttocommunityfromsemrel;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.QueryInformationSystem;
import eu.dnetlib.dhp.TypedRow;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.*;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.PropagationConstant.*;
public class SparkResultToCommunityThroughSemRelJob3 {
/**
 * Entry point of the Spark job that propagates community contexts from results to the
 * results they are linked to through the allowed semantic relations.
 *
 * <p>Steps performed by the visible code:
 * <ol>
 *   <li>parse the command line against the bundled JSON parameter description;</li>
 *   <li>create a Hive-enabled {@link SparkSession};</li>
 *   <li>read publications, datasets, other research products, software and relations as
 *       JSON text lines from {@code sourcePath} and register them as temp views;</li>
 *   <li>compute with Spark SQL, for each relation target, the set of communities to add;</li>
 *   <li>hand the result to the {@code updateFor*} helpers, which save under
 *       {@code outputPath}.</li>
 * </ol>
 *
 * @param args command line arguments; the code reads {@code hive_metastore_uris},
 *             {@code master}, {@code sourcePath}, {@code allowedsemrels} and {@code isLookupUrl}
 * @throws Exception on any failure while parsing the arguments or running the job
 */
public static void main(String[] args) throws Exception {
// Argument definitions are loaded from a JSON resource on the classpath.
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils
.toString(SparkResultToCommunityThroughSemRelJob3.class
.getResourceAsStream("/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_propagationresultcommunityfromsemrel_parameters.json")));
parser.parseArgument(args);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final SparkSession spark = SparkSession
.builder()
.appName(SparkResultToCommunityThroughSemRelJob3.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
// NOTE(review): the output location is hard-coded rather than read from the arguments.
final String outputPath = "/tmp/provision/propagation/resulttocommunityfromsemrel";
// "allowedsemrels" is a ';'-separated list.
final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
final List<String> communityIdList = QueryInformationSystem.getCommunityList(parser.get("isLookupUrl"));
createOutputDirs(outputPath, FileSystem.get(spark.sparkContext().hadoopConfiguration()));
// Each input is a text file with one JSON-serialized entity per line.
JavaRDD<Publication> publication_rdd = sc.textFile(inputPath + "/publication")
.map(item -> new ObjectMapper().readValue(item, Publication.class));
JavaRDD<Dataset> dataset_rdd = sc.textFile(inputPath + "/dataset")
.map(item -> new ObjectMapper().readValue(item, Dataset.class));
JavaRDD<OtherResearchProduct> orp_rdd = sc.textFile(inputPath + "/otherresearchproduct")
.map(item -> new ObjectMapper().readValue(item, OtherResearchProduct.class));
JavaRDD<Software> software_rdd = sc.textFile(inputPath + "/software")
.map(item -> new ObjectMapper().readValue(item, Software.class));
JavaRDD<Relation> relation_rdd = sc.textFile(inputPath + "/relation")
.map(item -> new ObjectMapper().readValue(item, Relation.class));
// Typed Spark SQL datasets built with bean encoders, so they can be queried by name below.
org.apache.spark.sql.Dataset<Publication> publication = spark.createDataset(publication_rdd.rdd(),
Encoders.bean(Publication.class));
org.apache.spark.sql.Dataset<Relation> relation = spark.createDataset(relation_rdd.rdd(),
Encoders.bean(Relation.class));
org.apache.spark.sql.Dataset<Dataset> dataset = spark.createDataset(dataset_rdd.rdd(),
Encoders.bean(Dataset.class));
org.apache.spark.sql.Dataset<OtherResearchProduct> other = spark.createDataset(orp_rdd.rdd(),
Encoders.bean(OtherResearchProduct.class));
org.apache.spark.sql.Dataset<Software> software = spark.createDataset(software_rdd.rdd(),
Encoders.bean(Software.class));
publication.createOrReplaceTempView("publication");
relation.createOrReplaceTempView("relation");
dataset.createOrReplaceTempView("dataset");
software.createOrReplaceTempView("software");
other.createOrReplaceTempView("other");
// getConstraintList is defined outside this view; presumably it renders an SQL condition
// fragment from the given prefix and values, which is appended to the WHERE clauses below.
String communitylist = getConstraintList(" co.id = '", communityIdList);
String semrellist = getConstraintList(" relClass = '", allowedsemrel );
// For every non-deleted publication: collect its community ids and join them with the
// non-deleted relations that start from that publication.
String query = "Select source, community_context, target " +
"from (select id, collect_set(co.id) community_context " +
"from publication " +
"lateral view explode (context) c as co " +
"where datainfo.deletedbyinference = false "+ communitylist +
" group by id) p " +
"JOIN " +
"(select * " +
"from relation " +
"where datainfo.deletedbyinference = false " + semrellist + ") r " +
"ON p.id = r.source";
org.apache.spark.sql.Dataset<Row> publication_context = spark.sql( query);
publication_context.createOrReplaceTempView("publication_context");
// Row shape example: ( source, (mes, dh-ch, ni), target )
// NOTE(review): the join below matches publications on pc.source, although the rows are then
// grouped by target; confirm whether "p.id = pc.target" was intended.
query = "select target , collect_set(co) " +
"from (select target, community_context " +
"from publication_context pc join publication p on " +
"p.id = pc.source) tmp " +
"lateral view explode (community_context) c as co " +
"group by target";
org.apache.spark.sql.Dataset<Row> toupdatepublicationreresult = spark.sql(query);
// NOTE(review): getUpdateCommunitiesForTable queries the views context_software, dataset_context
// and other_context, but only publication_context is registered in this method - verify.
org.apache.spark.sql.Dataset<Row> toupdatesoftwareresult = getUpdateCommunitiesForTable(spark, "software");
org.apache.spark.sql.Dataset<Row> toupdatedatasetresult = getUpdateCommunitiesForTable(spark, "dataset");
org.apache.spark.sql.Dataset<Row> toupdateotherresult = getUpdateCommunitiesForTable(spark, "other");
// NOTE(review): the RDDs returned by the four calls below are discarded here.
createUpdateForResultDatasetWrite(toupdatesoftwareresult.toJavaRDD(), outputPath, "software_update",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
createUpdateForResultDatasetWrite(toupdatedatasetresult.toJavaRDD(), outputPath, "dataset_update",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
createUpdateForResultDatasetWrite(toupdatepublicationreresult.toJavaRDD(), outputPath, "publication_update",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
createUpdateForResultDatasetWrite(toupdateotherresult.toJavaRDD(), outputPath, "other_update",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
// Apply the computed communities to each result type and save it under outputPath/<type>.
updateForDatasetDataset(toupdatedatasetresult.toJavaRDD(), dataset.toJavaRDD(), outputPath, "dataset",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
updateForOtherDataset(toupdateotherresult.toJavaRDD(), other.toJavaRDD(), outputPath, "otherresearchproduct",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
updateForSoftwareDataset(toupdatesoftwareresult.toJavaRDD(), software.toJavaRDD(), outputPath, "software",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
updateForPublicationDataset(toupdatepublicationreresult.toJavaRDD(), publication.toJavaRDD(), outputPath, "publication",
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, communityIdList);
//
/*
Earlier RDD-based implementation, kept for reference only (not compiled).
JavaPairRDD<String, TypedRow> resultLinkedToCommunities = publication
.map(p -> getTypedRow(communityIdList, p.getContext(), p.getId(),"publication"))
.filter(p -> !(p == null))
.mapToPair(toPair())
.union(datasets
.map(p -> getTypedRow(communityIdList, p.getContext(), p.getId(),"dataset"))
.filter(p -> !(p == null))
.mapToPair(toPair())
)
.union(software
.map(p -> getTypedRow(communityIdList, p.getContext(), p.getId(),"software"))
.filter(p -> !(p == null))
.mapToPair(toPair())
)
.union(other
.map(p -> getTypedRow(communityIdList, p.getContext(), p.getId(),"otherresearchproduct"))
.filter(p -> !(p == null))
.mapToPair(toPair())
);
JavaPairRDD<String, TypedRow> to_add_result_communities = resultLinkedToCommunities.join(result_result).map(r -> r._2()._1().setSourceId(r._2()._2().getTargetId()))
.mapToPair(toPair());
JavaPairRDD<String, Result> pubs = publications.mapToPair(p -> new Tuple2<>(p.getId(),p));
JavaPairRDD<String, Result> dss = datasets.mapToPair(p -> new Tuple2<>(p.getId(),p));
JavaPairRDD<String, Result> sfw = software.mapToPair(p -> new Tuple2<>(p.getId(),p));
JavaPairRDD<String, Result> orp = other.mapToPair(p -> new Tuple2<>(p.getId(),p));
updateResultForCommunity(pubs, to_add_result_communities, outputPath, "publication", PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME);
updateResultForCommunity(dss, to_add_result_communities, outputPath, "dataset", PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME);
updateResultForCommunity(sfw, to_add_result_communities, outputPath, "software", PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME);
updateResultForCommunity(orp, to_add_result_communities, outputPath, "otherresearchproduct", PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME);
//left outer join of result with to_add_result_communities (result = JavaPairRDD of Result) [left outer join because all results are wanted, including those that were not updated]
//for the matching ones, check in the result whether the contexts to add are already present. If they are not, add them; otherwise do nothing
*/
}
/**
 * Computes, for the records of {@code table}, the community contexts reaching them
 * through the context views.
 *
 * <p>The query joins {@code table} (alias {@code t}) with each of the views
 * {@code context_software}, {@code dataset_context}, {@code publication_context} and
 * {@code other_context} on {@code target = t.id}, unions the four branches, explodes
 * {@code source_context} and groups by target id collecting {@code co.id}.
 *
 * <p>NOTE(review): among the code visible in this file, {@code main} registers only the
 * {@code publication_context} view, and that view exposes {@code community_context}
 * rather than {@code context}. The view name {@code context_software} also breaks the
 * {@code <type>_context} pattern of the other three. Confirm where these views and
 * columns are defined before relying on this query.
 *
 * @param spark session used to run the query; the views above must be visible in it
 * @param table name of the temp view holding the candidate targets; it is concatenated
 *              into the SQL text, so it must be a trusted identifier
 * @return rows with columns {@code target_id} and {@code context_id}
 */
private static org.apache.spark.sql.Dataset<Row> getUpdateCommunitiesForTable(SparkSession spark, String table){
    String query = "SELECT target_id, collect_set(co.id) context_id " +
        " FROM (SELECT t.id target_id, s.context source_context " +
        " FROM context_software s " +
        " JOIN " + table + " t " +
        " ON s.target = t.id " +
        " UNION ALL " +
        " SELECT t.id target_id, d.context source_context " +
        " FROM dataset_context d " +
        " JOIN " + table + " t" +
        // This branch must join on its own alias "d"; the alias "s" belongs to the
        // first branch and is not in scope here.
        " ON d.target = t.id " +
        " UNION ALL " +
        " SELECT t.id target_id, p.context source_context " +
        " FROM publication_context p" +
        " JOIN " + table +" t " +
        " on p.target = t.id " +
        " UNION ALL " +
        " SELECT t.id target_id, o.context source_context " +
        " FROM other_context o " +
        " JOIN " + table + " t " +
        " ON o.target = t.id) TMP " +
        " LATERAL VIEW EXPLODE(source_context) MyT as co " +
        " GROUP BY target_id" ;

    return spark.sql(query);
}
/**
 * Turns the rows describing which communities must be added to which result into
 * minimal {@link Result} objects carrying only the id and the new contexts.
 *
 * <p>Each input row is read as (result id at index 0, list of community ids at index 1).
 * Only community ids contained in {@code communityIdList} are kept; every kept id becomes
 * a {@link Context} whose data info is the value returned by
 * {@code getDataInfo(PROPAGATION_DATA_INFO_TYPE, class_id, class_name)}. Rows that yield
 * no context are dropped.
 *
 * <p>NOTE(review): despite the method name, nothing is written here. {@code outputPath}
 * and {@code type} are not used, and the transformation is lazy, so it only runs if the
 * caller triggers an action on the returned RDD.
 *
 * @param toupdateresult  rows of (result id, community ids to add)
 * @param outputPath      unused
 * @param type            unused
 * @param class_id        provenance class id passed to {@code getDataInfo}
 * @param class_name      provenance class name passed to {@code getDataInfo}
 * @param communityIdList community ids that are allowed to be propagated
 * @return one {@link Result} per row that produced at least one context
 */
private static JavaRDD<Result> createUpdateForResultDatasetWrite(JavaRDD<Row> toupdateresult, String outputPath, String type, String class_id, String class_name, List<String> communityIdList){
    return toupdateresult.map(r -> {
        // Diamond instead of the raw ArrayList, so the list stays type-checked.
        List<Context> contextList = new ArrayList<>();
        List<String> toAddContext = r.getList(1);
        for (String cId : toAddContext) {
            if (communityIdList.contains(cId)) {
                Context newContext = new Context();
                newContext.setId(cId);
                newContext.setDataInfo(Arrays.asList(getDataInfo(PROPAGATION_DATA_INFO_TYPE, class_id, class_name)));
                contextList.add(newContext);
            }
        }

        if (contextList.isEmpty()) {
            // Nothing to propagate for this row; removed by the filter below.
            return null;
        }

        Result ret = new Result();
        ret.setId(r.getString(0));
        ret.setContext(contextList);
        return ret;
    }).filter(r -> r != null);
}
/**
 * Keys the software records by id, computes their community updates through
 * getUpdateForResultDataset and stores the outcome as JSON lines under {@code outputPath/type}.
 *
 * NOTE(review): getUpdateForResultDataset also emits objects created with {@code new Result()};
 * the cast to Software below would presumably fail for those elements - confirm.
 */
private static void updateForSoftwareDataset(JavaRDD<Row> toupdateresult, JavaRDD<Software> result, String outputPath, String type, String class_id, String class_name, List<String> communityIdList){
    JavaPairRDD<String, Result> softwareById = result.mapToPair(sw -> new Tuple2<String, Result>(sw.getId(), sw));
    JavaRDD<Result> updates = getUpdateForResultDataset(toupdateresult, softwareById, outputPath, type, class_id, class_name, communityIdList);
    String destination = outputPath + "/" + type;
    updates
            .map(r -> (Software) r)
            .map(sw -> new ObjectMapper().writeValueAsString(sw))
            .saveAsTextFile(destination);
}
/**
 * Keys the dataset records by id, computes their community updates through
 * getUpdateForResultDataset and stores the outcome as JSON lines under {@code outputPath/type}.
 *
 * NOTE(review): getUpdateForResultDataset also emits objects created with {@code new Result()};
 * the cast to Dataset below would presumably fail for those elements - confirm.
 */
private static void updateForDatasetDataset(JavaRDD<Row> toupdateresult, JavaRDD<Dataset> result, String outputPath, String type, String class_id, String class_name, List<String> communityIdList){
    JavaPairRDD<String, Result> datasetsById = result.mapToPair(ds -> new Tuple2<String, Result>(ds.getId(), ds));
    JavaRDD<Result> updates = getUpdateForResultDataset(toupdateresult, datasetsById, outputPath, type, class_id, class_name, communityIdList);
    String destination = outputPath + "/" + type;
    updates
            .map(r -> (Dataset) r)
            .map(ds -> new ObjectMapper().writeValueAsString(ds))
            .saveAsTextFile(destination);
}
/**
 * Keys the publication records by id, computes their community updates through
 * getUpdateForResultDataset and stores the outcome as JSON lines under {@code outputPath/type}.
 *
 * NOTE(review): getUpdateForResultDataset also emits objects created with {@code new Result()};
 * the cast to Publication below would presumably fail for those elements - confirm.
 */
private static void updateForPublicationDataset(JavaRDD<Row> toupdateresult, JavaRDD<Publication> result, String outputPath, String type, String class_id, String class_name, List<String> communityIdList){
    JavaPairRDD<String, Result> publicationsById = result.mapToPair(pub -> new Tuple2<String, Result>(pub.getId(), pub));
    JavaRDD<Result> updates = getUpdateForResultDataset(toupdateresult, publicationsById, outputPath, type, class_id, class_name, communityIdList);
    String destination = outputPath + "/" + type;
    updates
            .map(r -> (Publication) r)
            .map(pub -> new ObjectMapper().writeValueAsString(pub))
            .saveAsTextFile(destination);
}
/**
 * Keys the other-research-product records by id, computes their community updates through
 * getUpdateForResultDataset and stores the outcome as JSON lines under {@code outputPath/type}.
 *
 * NOTE(review): getUpdateForResultDataset also emits objects created with {@code new Result()};
 * the cast to OtherResearchProduct below would presumably fail for those elements - confirm.
 */
private static void updateForOtherDataset(JavaRDD<Row> toupdateresult, JavaRDD<OtherResearchProduct> result, String outputPath, String type, String class_id, String class_name, List<String> communityIdList){
    JavaPairRDD<String, Result> othersById = result.mapToPair(orp -> new Tuple2<String, Result>(orp.getId(), orp));
    JavaRDD<Result> updates = getUpdateForResultDataset(toupdateresult, othersById, outputPath, type, class_id, class_name, communityIdList);
    String destination = outputPath + "/" + type;
    updates
            .map(r -> (OtherResearchProduct) r)
            .map(orp -> new ObjectMapper().writeValueAsString(orp))
            .saveAsTextFile(destination);
}
/**
 * Left-outer-joins the results with the communities to propagate and computes, per result, the
 * contexts that still have to be added.
 *
 * Behaviour per joined element:
 * - no update row for the result: the original result is returned untouched;
 * - update row present: the community ids are restricted to {@code communityIdList}, the ids
 *   already present among the result's contexts are discarded, and a new {@code Result} holding
 *   only the id and the missing contexts is returned; when nothing is missing the element is dropped.
 *
 * {@code outputPath} and {@code type} are not used here.
 *
 * @param toupdateresult rows with the result id in column 0 and the list of community ids in column 1
 * @param result the results keyed by id
 * @param class_id provenance class id passed to getDataInfo for every new context
 * @param class_name provenance class name passed to getDataInfo for every new context
 * @param communityIdList the community ids accepted for propagation
 * @return untouched results plus the update records described above
 */
private static JavaRDD<Result> getUpdateForResultDataset(JavaRDD<Row> toupdateresult, JavaPairRDD<String, Result> result, String outputPath, String type, String class_id, String class_name, List<String> communityIdList){
    return result.leftOuterJoin(toupdateresult.mapToPair(r -> new Tuple2<>(r.getString(0), r.getList(1))))
            .map(c -> {
                if(! c._2()._2().isPresent()){
                    return c._2()._1();
                }

                List<Object> toAddContext = c._2()._2().get();
                Set<String> context_set = new HashSet<>();
                for(Object cId: toAddContext){
                    String id = (String)cId;
                    if (communityIdList.contains(id)){
                        context_set.add(id);
                    }
                }
                for (Context context: c._2()._1().getContext()){
                    // fixed: the set holds ids (String); it used to be probed with the Context object
                    // itself, which never matched, so already-present communities were added again
                    context_set.remove(context.getId());
                }

                List<Context> contextList = context_set.stream().map(co -> {
                    Context newContext = new Context();
                    newContext.setId(co);
                    newContext.setDataInfo(Arrays.asList(getDataInfo(PROPAGATION_DATA_INFO_TYPE, class_id, class_name)));
                    return newContext;
                }).collect(Collectors.toList());

                if(contextList.size() > 0 ){
                    Result r = new Result();
                    r.setId(c._1());
                    r.setContext(contextList);
                    return r;
                }
                // every requested community is already associated with the result
                return null;
            }).filter(r -> r != null);
}
/**
 * Enriches every software record with the communities propagated to it.
 *
 * The software records are left-outer-joined with the (id, accepted community ids) pairs built by
 * getStringResultJavaPairRDD. For a matched record:
 * - a context whose id is among the communities to add gets a propagation DataInfo appended,
 *   unless one of its DataInfo already has PROPAGATION_DATA_INFO_TYPE as inference provenance;
 * - every community id not yet present among the record's contexts is appended as a new context.
 * Records without a match are returned unchanged.
 *
 * @param toupdateresult rows with the result id in column 0 and the list of community ids in column 1
 * @param communityList the community ids accepted for propagation
 * @param result the software records to enrich
 * @param class_id provenance class id passed to getDataInfo
 * @param class_name provenance class name passed to getDataInfo
 * @return all software records, enriched where an update was available
 */
private static JavaRDD<Software> createUpdateForSoftwareDataset(JavaRDD<Row> toupdateresult, List<String> communityList,
                                                                JavaRDD<Software> result, String class_id, String class_name) {
    return result
            .mapToPair(s -> new Tuple2<>(s.getId(), s)).leftOuterJoin(getStringResultJavaPairRDD(toupdateresult, communityList))
            .map(c -> {
                Software oaf = c._2()._1();
                if (c._2()._2().isPresent()) {
                    HashSet<String> contexts = new HashSet<>(c._2()._2().get());

                    for (Context context : oaf.getContext()) {
                        if (contexts.contains(context.getId())){
                            if (!context.getDataInfo().stream().map(di -> di.getInferenceprovenance())
                                    .collect(Collectors.toSet()).contains(PROPAGATION_DATA_INFO_TYPE)){
                                context.getDataInfo().add(getDataInfo(PROPAGATION_DATA_INFO_TYPE, class_id, class_name));
                            }
                            // community id already in the context of the result: remove it from the set that
                            // has to be added. Fixed: this used to happen only when the DataInfo had just been
                            // appended, so a context already carrying propagation provenance was duplicated.
                            contexts.remove(context.getId());
                        }
                    }
                    List<Context> cc = oaf.getContext();
                    for(String cId: contexts){
                        Context context = new Context();
                        context.setId(cId);
                        context.setDataInfo(Arrays.asList(getDataInfo(PROPAGATION_DATA_INFO_TYPE, class_id, class_name)));
                        cc.add(context);
                    }
                    oaf.setContext(cc);

                }
                return oaf;
            });
}
/**
 * Maps each row to a (result id, community ids) pair, keeping only the community ids that are
 * contained in {@code communityList}. Rows whose filtered list is empty are still emitted.
 *
 * @param toupdateresult rows with the result id in column 0 and the list of community ids in column 1
 * @param communityList the community ids accepted for propagation
 * @return the pairs keyed by result id
 */
private static JavaPairRDD<String, List<String>> getStringResultJavaPairRDD(JavaRDD<Row> toupdateresult, List<String> communityList) {
    return toupdateresult.mapToPair(row -> {
        List<String> candidateIds = row.getList(1);
        List<String> acceptedIds = candidateIds.stream()
                .filter(id -> communityList.contains(id))
                .collect(Collectors.toList());
        return new Tuple2<>(row.getString(0), acceptedIds);
    });
}
/**
 * Runs a Spark SQL query joining {@code table} with the "relation" table on {@code id = source}
 * and selecting the relation source, the table's context column and the relation target.
 *
 * @param spark the session on which the query is executed
 * @param table the name of the table/view to join with "relation" (inserted into the SQL text)
 * @return the rows produced by the query
 */
private static org.apache.spark.sql.Dataset<Row> getContext(SparkSession spark, String table){
    // %1$s is substituted twice with the table name; the resulting text is the same as the concatenated form
    String sql = String.format(
            "SELECT relation.source, %1$s.context , relation.target FROM %1$s JOIN relation ON id = source",
            table);
    return spark.sql(sql);
}
/**
 * Tells whether at least one context id of the result is contained in {@code communityIdList}.
 *
 * @param r the result whose contexts are inspected
 * @param communityIdList the community ids of interest
 * @return true when a context id of {@code r} appears in {@code communityIdList}, false otherwise
 */
private static Boolean relatedToCommunities(Result r, List<String> communityIdList) {
    // distinct context ids first, exactly as many lookups as distinct ids at most
    Set<String> contextIds = r.getContext()
            .stream()
            .map(ctx -> ctx.getId())
            .collect(Collectors.toSet());
    return contextIds.stream().anyMatch(id -> communityIdList.contains(id));
}
/**
 * Enriches the results with the communities propagated through semantic relations and writes them
 * as JSON lines under {@code outputPath/type}.
 *
 * The results are left-outer-joined with the update rows. For a matched result:
 * - a context whose id is among the communities to add gets a propagation DataInfo appended,
 *   unless one of its DataInfo already has PROPAGATION_DATA_INFO_TYPE as inference provenance;
 * - every community id not yet present among the result's contexts is appended as a new context.
 * Results without a match are written unchanged.
 *
 * @param results the results keyed by id
 * @param toupdateresult the rows, keyed by result id, whose accumulator holds the community ids to add
 * @param outputPath base path of the output
 * @param type sub-directory of {@code outputPath} the text files are saved to
 */
private static void updateResult(JavaPairRDD<String, Result> results, JavaPairRDD<String, TypedRow> toupdateresult, String outputPath, String type) {
    results.leftOuterJoin(toupdateresult)
            .map(p -> {
                Result r = p._2()._1();
                if (p._2()._2().isPresent()){
                    // copy: ids are removed below and the joined TypedRow should not be altered as a side effect
                    Set<String> communityList = new HashSet<>(p._2()._2().get().getAccumulator());
                    for(Context c: r.getContext()){
                        if (communityList.contains(c.getId())){
                            //verify if the datainfo for this context contains propagation
                            if (!c.getDataInfo().stream().map(di -> di.getInferenceprovenance()).collect(Collectors.toSet()).contains(PROPAGATION_DATA_INFO_TYPE)){
                                c.getDataInfo().add(getDataInfo(PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME));
                            }
                            // community id already in the context of the result: remove it from the set that
                            // has to be added. Fixed: this used to happen only when the DataInfo had just been
                            // appended, so a context already carrying propagation provenance was duplicated.
                            communityList.remove(c.getId());
                        }
                    }
                    List<Context> cc = r.getContext();
                    for(String cId: communityList){
                        Context context = new Context();
                        context.setId(cId);
                        context.setDataInfo(Arrays.asList(getDataInfo(PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME)));
                        cc.add(context);
                    }
                    r.setContext(cc);
                }
                return r;
            })
            .map(p -> new ObjectMapper().writeValueAsString(p))
            .saveAsTextFile(outputPath+"/"+type);
}
/**
 * Builds a {@code TypedRow} for a result, adding to it every distinct context id that is
 * contained in {@code communityIdList}.
 *
 * @param communityIdList the community ids accepted for propagation
 * @param context the contexts of the result
 * @param id the result id, stored as the row's source id
 * @param type the result type, stored on the row
 * @return the row, or null when its accumulator is null after the ids have been added
 */
private static TypedRow getTypedRow(List<String> communityIdList, List<Context> context, String id, String type) {
    Set<String> contextIds = context
            .stream()
            .map(ctx -> ctx.getId())
            .collect(Collectors.toSet());

    TypedRow row = new TypedRow();
    row.setSourceId(id);
    row.setType(type);

    contextIds.stream()
            .filter(candidate -> communityIdList.contains(candidate))
            .forEach(accepted -> row.add(accepted));

    return row.getAccumulator() != null ? row : null;
}
}

View File

@ -0,0 +1,40 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": true
}
]

View File

@ -0,0 +1,50 @@
[
{
"paramName":"is",
"paramLongName":"isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName":"as",
"paramLongName":"allowedsemrels",
"paramDescription": "the allowed semantic relations for propagation",
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramRequired": true
},
{
"paramName":"sg",
"paramLongName":"saveGraph",
"paramDescription": "true if the new version of the graph must be saved",
"paramRequired": false
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
}
]

View File

@ -28,6 +28,18 @@
"paramLongName":"hive_metastore_uris", "paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris", "paramDescription": "the hive metastore uris",
"paramRequired": true "paramRequired": true
},
{
"paramName":"wu",
"paramLongName":"writeUpdate",
"paramDescription": "true if the update must be written. No double check is made on whether the information is already present",
"paramRequired": true
},
{
"paramName":"sg",
"paramLongName":"saveGraph",
"paramDescription": "true if the new version of the graph must be saved",
"paramRequired": true
} }
] ]

View File

@ -0,0 +1,4 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
/**
 * Placeholder test class: it currently declares no test methods.
 */
public class ResultToCommunityJobTest {
}

View File

@ -0,0 +1,4 @@
package eu.dnetlib.dhp.resulttocommunityfromsemrel;
/**
 * Placeholder test class: it currently declares no test methods.
 */
public class ResultToCommunityJobTest {
}