forked from D-Net/dnet-hadoop
[Enrichment Step] get rid of hive
This commit is contained in:
parent
aecea5a095
commit
d1519fa28f
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.countrypropagation;
|
package eu.dnetlib.dhp;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -23,4 +23,5 @@ public class KeyValueSet implements Serializable {
|
||||||
public void setValueSet(ArrayList<String> valueSet) {
|
public void setValueSet(ArrayList<String> valueSet) {
|
||||||
this.valueSet = valueSet;
|
this.valueSet = valueSet;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,22 +4,21 @@ package eu.dnetlib.dhp;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.Row;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
|
|
||||||
public class PropagationConstant {
|
public class PropagationConstant {
|
||||||
|
|
||||||
|
@ -221,9 +220,28 @@ public class PropagationConstant {
|
||||||
.orElse(Boolean.FALSE);
|
.orElse(Boolean.FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void createCfHbforResult(SparkSession spark) {
|
// of the results collects the distinct keys for collected from (at the level of the result) and hosted by
|
||||||
org.apache.spark.sql.Dataset<Row> cfhb = spark.sql(cfHbforResultQuery);
|
// and produces pairs resultId, key for each distinct key associated to the result
|
||||||
cfhb.createOrReplaceTempView("cfhb");
|
public static <R extends Result> void createCfHbforResult(SparkSession spark, String inputPath, String outputPath,
|
||||||
|
Class<R> resultClazz) {
|
||||||
|
readPath(spark, inputPath, resultClazz)
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||||
|
!r.getDataInfo().getInvisible())
|
||||||
|
.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
|
||||||
|
Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
|
||||||
|
cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
|
||||||
|
return cfhb
|
||||||
|
.stream()
|
||||||
|
.map(value -> EntityEntityRel.newInstance(r.getId(), value))
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.iterator();
|
||||||
|
}, Encoders.bean(EntityEntityRel.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
|
|
|
@ -10,7 +10,6 @@ import java.util.List;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.ForeachFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
@ -19,8 +18,7 @@ import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import eu.dnetlib.dhp.EntityEntityRel;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
|
@ -56,8 +54,8 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
String inputPath = parser.get("sourcePath");
|
String inputPath = parser.get("sourcePath");
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
log.info("outputPath {}: ", outputPath);
|
log.info("workingPath {}: ", workingPath);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
@ -65,13 +63,13 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, workingPath);
|
||||||
prepareDatasourceCountryAssociation(
|
prepareDatasourceCountryAssociation(
|
||||||
spark,
|
spark,
|
||||||
Arrays.asList(parser.get("whitelist").split(";")),
|
Arrays.asList(parser.get("whitelist").split(";")),
|
||||||
Arrays.asList(parser.get("allowedtypes").split(";")),
|
Arrays.asList(parser.get("allowedtypes").split(";")),
|
||||||
inputPath,
|
inputPath,
|
||||||
outputPath);
|
workingPath + "/datasourceCountry");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,17 +2,14 @@
|
||||||
package eu.dnetlib.dhp.countrypropagation;
|
package eu.dnetlib.dhp.countrypropagation;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
@ -23,6 +20,8 @@ import org.apache.spark.sql.Dataset;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.EntityEntityRel;
|
||||||
|
import eu.dnetlib.dhp.PropagationConstant;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
@ -49,15 +48,18 @@ public class PrepareResultCountrySet {
|
||||||
String inputPath = parser.get("sourcePath");
|
String inputPath = parser.get("sourcePath");
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
String outputPath = parser.get("outputPath");
|
|
||||||
log.info("outputPath: {}", outputPath);
|
|
||||||
|
|
||||||
final String datasourcecountrypath = parser.get("preparedInfoPath");
|
|
||||||
log.info("preparedInfoPath: {}", datasourcecountrypath);
|
|
||||||
|
|
||||||
final String resultClassName = parser.get("resultTableName");
|
final String resultClassName = parser.get("resultTableName");
|
||||||
log.info("resultTableName: {}", resultClassName);
|
log.info("resultTableName: {}", resultClassName);
|
||||||
|
|
||||||
|
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
|
||||||
|
log.info("resultType: {}", resultType);
|
||||||
|
|
||||||
|
String outputPath = workingPath + "/" + resultType; // parser.get("outputPath");
|
||||||
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
final String datasourcecountrypath = workingPath + "/datasourceCountry";// parser.get("preparedInfoPath");
|
||||||
|
log.info("preparedInfoPath: {}", datasourcecountrypath);
|
||||||
|
|
||||||
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
@ -72,7 +74,7 @@ public class PrepareResultCountrySet {
|
||||||
inputPath,
|
inputPath,
|
||||||
outputPath,
|
outputPath,
|
||||||
datasourcecountrypath,
|
datasourcecountrypath,
|
||||||
workingPath,
|
workingPath + "/resultCfHb/" + resultType,
|
||||||
resultClazz);
|
resultClazz);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -85,31 +87,11 @@ public class PrepareResultCountrySet {
|
||||||
String workingPath,
|
String workingPath,
|
||||||
Class<R> resultClazz) {
|
Class<R> resultClazz) {
|
||||||
|
|
||||||
// selects all the results non deleted by inference and non invisible
|
PropagationConstant.createCfHbforResult(spark, inputPath, workingPath, resultClazz);
|
||||||
Dataset<R> result = readPath(spark, inputPath, resultClazz)
|
|
||||||
.filter(
|
|
||||||
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
|
||||||
!r.getDataInfo().getInvisible());
|
|
||||||
|
|
||||||
// of the results collects the distinct keys for collected from (at the level of the result) and hosted by
|
|
||||||
// and produces pairs resultId, key for each distinct key associated to the result
|
|
||||||
result.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
|
|
||||||
Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
|
|
||||||
cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
|
|
||||||
return cfhb
|
|
||||||
.stream()
|
|
||||||
.map(value -> EntityEntityRel.newInstance(r.getId(), value))
|
|
||||||
.collect(Collectors.toList())
|
|
||||||
.iterator();
|
|
||||||
}, Encoders.bean(EntityEntityRel.class))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingPath + "/resultCfHb");
|
|
||||||
|
|
||||||
Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
|
Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
|
||||||
|
|
||||||
Dataset<EntityEntityRel> cfhb = readPath(spark, workingPath + "/resultCfHb", EntityEntityRel.class);
|
Dataset<EntityEntityRel> cfhb = readPath(spark, workingPath, EntityEntityRel.class);
|
||||||
|
|
||||||
datasource_country
|
datasource_country
|
||||||
.joinWith(
|
.joinWith(
|
||||||
|
|
|
@ -47,8 +47,8 @@ public class SparkCountryPropagationJob {
|
||||||
String sourcePath = parser.get("sourcePath");
|
String sourcePath = parser.get("sourcePath");
|
||||||
log.info("sourcePath: {}", sourcePath);
|
log.info("sourcePath: {}", sourcePath);
|
||||||
|
|
||||||
String preparedInfoPath = parser.get("preparedInfoPath");
|
String workingPath = parser.get("workingPath");
|
||||||
log.info("preparedInfoPath: {}", preparedInfoPath);
|
log.info("workingPath: {}", workingPath);
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
@ -67,7 +67,7 @@ public class SparkCountryPropagationJob {
|
||||||
execPropagation(
|
execPropagation(
|
||||||
spark,
|
spark,
|
||||||
sourcePath,
|
sourcePath,
|
||||||
preparedInfoPath,
|
workingPath,
|
||||||
outputPath,
|
outputPath,
|
||||||
resultClazz);
|
resultClazz);
|
||||||
});
|
});
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
||||||
|
|
||||||
public class AutoritativeAuthor {
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
public class AutoritativeAuthor implements Serializable {
|
||||||
|
|
||||||
private String name;
|
private String name;
|
||||||
private String surname;
|
private String surname;
|
||||||
|
|
|
@ -13,6 +13,7 @@ import java.util.stream.Collectors;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.ForeachFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
@ -22,6 +23,7 @@ import org.apache.spark.sql.sources.v2.reader.InputPartition;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
@ -57,8 +59,10 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
final String resultClassName = parser.get("resultTableName");
|
final String resultClassName = parser.get("resultTableName");
|
||||||
log.info("resultTableName: {}", resultClassName);
|
log.info("resultTableName: {}", resultClassName);
|
||||||
|
|
||||||
final List<String> allowedsemrel = Arrays.stream(parser.get("allowedsemrels").split(";"))
|
final List<String> allowedsemrel = Arrays
|
||||||
.map(s -> s.toLowerCase()).collect(Collectors.toList());
|
.stream(parser.get("allowedsemrels").split(";"))
|
||||||
|
.map(s -> s.toLowerCase())
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
|
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
|
||||||
|
|
||||||
|
@ -124,29 +128,32 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
|
|
||||||
Dataset<R> result = readPath(spark, outputPath + "/resultSubset", resultClazz);
|
Dataset<R> result = readPath(spark, outputPath + "/resultSubset", resultClazz);
|
||||||
|
|
||||||
|
result.foreach((ForeachFunction<R>) r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
|
||||||
|
|
||||||
result
|
result
|
||||||
.joinWith(relation, result.col("id").equalTo(relation.col("source")))
|
.joinWith(relation, result.col("id").equalTo(relation.col("source")))
|
||||||
.map((MapFunction<Tuple2<R, Relation>, ResultOrcidList>) t2 -> {
|
.map((MapFunction<Tuple2<R, Relation>, ResultOrcidList>) t2 -> {
|
||||||
ResultOrcidList rol = new ResultOrcidList();
|
ResultOrcidList rol = new ResultOrcidList();
|
||||||
rol.setResultId(t2._2().getTarget());
|
rol.setResultId(t2._2().getTarget());
|
||||||
List<AutoritativeAuthor> aal = new ArrayList<>();
|
List<AutoritativeAuthor> aal = new ArrayList<>();
|
||||||
t2._1().getAuthor().stream().forEach(a -> {
|
t2._1().getAuthor().stream().forEach(a -> {
|
||||||
a.getPid().stream().forEach(p -> {
|
a.getPid().stream().forEach(p -> {
|
||||||
if (allowedPids.contains(p.getQualifier().getClassid().toLowerCase())) {
|
if (allowedPids.contains(p.getQualifier().getClassid().toLowerCase())) {
|
||||||
aal
|
aal
|
||||||
.add(
|
.add(
|
||||||
AutoritativeAuthor
|
AutoritativeAuthor
|
||||||
.newInstance(a.getName(), a.getSurname(), a.getFullname(), p.getValue()));
|
.newInstance(a.getName(), a.getSurname(), a.getFullname(), p.getValue()));
|
||||||
}
|
}
|
||||||
});
|
|
||||||
});
|
});
|
||||||
return rol;
|
});
|
||||||
}, Encoders.bean(ResultOrcidList.class)).write()
|
rol.setAuthorList(aal);
|
||||||
|
return rol;
|
||||||
|
}, Encoders.bean(ResultOrcidList.class))
|
||||||
|
.write()
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.json(outputPath + "/" + resultType);
|
.json(outputPath + "/" + resultType);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasAllowedPid(Author a, List<String> allowedPids) {
|
private static boolean hasAllowedPid(Author a, List<String> allowedPids) {
|
||||||
|
|
|
@ -65,30 +65,31 @@ public class PrepareResultOrcidAssociationStep2 {
|
||||||
.union(readPath(spark, inputPath + "/software", ResultOrcidList.class));
|
.union(readPath(spark, inputPath + "/software", ResultOrcidList.class));
|
||||||
|
|
||||||
resultOrcidAssoc
|
resultOrcidAssoc
|
||||||
.groupByKey((MapFunction<ResultOrcidList, String>) rol -> rol.getResultId(), Encoders.STRING())
|
.groupByKey((MapFunction<ResultOrcidList, String>) rol -> rol.getResultId(), Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, ResultOrcidList, ResultOrcidList>) (k, it) ->{
|
.mapGroups((MapGroupsFunction<String, ResultOrcidList, ResultOrcidList>) (k, it) -> {
|
||||||
ResultOrcidList resultOrcidList = it.next();
|
ResultOrcidList resultOrcidList = it.next();
|
||||||
if(it.hasNext())
|
if (it.hasNext()) {
|
||||||
{
|
|
||||||
Set<String> orcid_set = new HashSet<>();
|
Set<String> orcid_set = new HashSet<>();
|
||||||
resultOrcidList.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
|
resultOrcidList.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
|
||||||
it.forEachRemaining(val -> val
|
it
|
||||||
.getAuthorList()
|
.forEachRemaining(
|
||||||
.stream()
|
val -> val
|
||||||
.forEach(
|
.getAuthorList()
|
||||||
|
.stream()
|
||||||
|
.forEach(
|
||||||
aa -> {
|
aa -> {
|
||||||
if (!orcid_set.contains(aa.getOrcid())) {
|
if (!orcid_set.contains(aa.getOrcid())) {
|
||||||
resultOrcidList.getAuthorList().add(aa);
|
resultOrcidList.getAuthorList().add(aa);
|
||||||
orcid_set.add(aa.getOrcid());
|
orcid_set.add(aa.getOrcid());
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
return resultOrcidList;
|
return resultOrcidList;
|
||||||
},Encoders.bean(ResultOrcidList.class) )
|
}, Encoders.bean(ResultOrcidList.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression","gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -61,7 +60,6 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
|
|
|
@ -23,4 +23,11 @@ public class DatasourceOrganization implements Serializable {
|
||||||
public void setOrganizationId(String organizationId) {
|
public void setOrganizationId(String organizationId) {
|
||||||
this.organizationId = organizationId;
|
this.organizationId = organizationId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static DatasourceOrganization newInstance(String datasourceId, String organizationId) {
|
||||||
|
DatasourceOrganization dso = new DatasourceOrganization();
|
||||||
|
dso.datasourceId = datasourceId;
|
||||||
|
dso.organizationId = organizationId;
|
||||||
|
return dso;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,17 +2,17 @@
|
||||||
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
|
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultInstRepoAssociation {
|
public class PrepareResultInstRepoAssociation {
|
||||||
|
|
||||||
|
@ -49,14 +50,11 @@ public class PrepareResultInstRepoAssociation {
|
||||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
String inputPath = parser.get("sourcePath");
|
final String inputPath = parser.get("sourcePath");
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath);
|
log.info("workingPath: {}", workingPath);
|
||||||
|
|
||||||
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
|
|
||||||
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
|
|
||||||
|
|
||||||
List<String> blacklist = Optional
|
List<String> blacklist = Optional
|
||||||
.ofNullable(parser.get("blacklist"))
|
.ofNullable(parser.get("blacklist"))
|
||||||
|
@ -64,82 +62,92 @@ public class PrepareResultInstRepoAssociation {
|
||||||
.orElse(new ArrayList<>());
|
.orElse(new ArrayList<>());
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
|
||||||
|
|
||||||
runWithSparkHiveSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
readNeededResources(spark, inputPath);
|
readNeededResources(spark, inputPath, workingPath, blacklist);
|
||||||
|
|
||||||
removeOutputDir(spark, datasourceOrganizationPath);
|
prepareDatasourceOrganization(spark, workingPath);
|
||||||
prepareDatasourceOrganization(spark, datasourceOrganizationPath, blacklist);
|
|
||||||
|
|
||||||
removeOutputDir(spark, alreadyLinkedPath);
|
prepareAlreadyLinkedAssociation(spark, workingPath);
|
||||||
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void readNeededResources(SparkSession spark, String inputPath) {
|
private static void readNeededResources(SparkSession spark, String inputPath, String workingPath,
|
||||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
List<String> blacklist) {
|
||||||
datasource.createOrReplaceTempView("datasource");
|
readPath(spark, inputPath + "/datasource", Datasource.class)
|
||||||
|
.filter(
|
||||||
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
|
(FilterFunction<Datasource>) ds -> !blacklist.contains(ds.getId()) &&
|
||||||
relation.createOrReplaceTempView("relation");
|
!ds.getDataInfo().getDeletedbyinference() &&
|
||||||
|
ds.getDatasourcetype().getClassid().equals(INSTITUTIONAL_REPO_TYPE))
|
||||||
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
|
|
||||||
organization.createOrReplaceTempView("organization");
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void prepareDatasourceOrganization(
|
|
||||||
SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
|
|
||||||
|
|
||||||
final String blacklisted = blacklist
|
|
||||||
.stream()
|
|
||||||
.map(s -> " AND id != '" + s + "'")
|
|
||||||
.collect(Collectors.joining());
|
|
||||||
|
|
||||||
String query = "SELECT source datasourceId, target organizationId "
|
|
||||||
+ "FROM ( SELECT id "
|
|
||||||
+ "FROM datasource "
|
|
||||||
+ "WHERE datasourcetype.classid = '"
|
|
||||||
+ INSTITUTIONAL_REPO_TYPE
|
|
||||||
+ "' "
|
|
||||||
+ "AND datainfo.deletedbyinference = false " + blacklisted + " ) d "
|
|
||||||
+ "JOIN ( SELECT source, target "
|
|
||||||
+ "FROM relation "
|
|
||||||
+ "WHERE lower(relclass) = '"
|
|
||||||
+ ModelConstants.IS_PROVIDED_BY.toLowerCase()
|
|
||||||
+ "' "
|
|
||||||
+ "AND datainfo.deletedbyinference = false ) rel "
|
|
||||||
+ "ON d.id = rel.source ";
|
|
||||||
|
|
||||||
spark
|
|
||||||
.sql(query)
|
|
||||||
.as(Encoders.bean(DatasourceOrganization.class))
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(datasourceOrganizationPath);
|
.json(workingPath + "/datasource");
|
||||||
|
|
||||||
|
readPath(spark, inputPath + "/relation", Relation.class)
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||||
|
(r.getRelClass().toLowerCase().equals(ModelConstants.IS_PROVIDED_BY.toLowerCase()) ||
|
||||||
|
r.getRelClass().toLowerCase().equals(ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase())))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingPath + "/relation");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void prepareDatasourceOrganization(
|
||||||
|
SparkSession spark, String workingPath) {
|
||||||
|
|
||||||
|
Dataset<Datasource> datasource = readPath(spark, workingPath + "/datasource", Datasource.class);
|
||||||
|
|
||||||
|
Dataset<Relation> relation = readPath(spark, workingPath + "/relation", Relation.class)
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Relation>) r -> r
|
||||||
|
.getRelClass()
|
||||||
|
.toLowerCase()
|
||||||
|
.equals(ModelConstants.IS_PROVIDED_BY.toLowerCase()));
|
||||||
|
|
||||||
|
datasource
|
||||||
|
.joinWith(relation, datasource.col("id").equalTo(relation.col("source")))
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<Datasource, Relation>, DatasourceOrganization>) t2 -> DatasourceOrganization
|
||||||
|
.newInstance(t2._2().getSource(), t2._2().getTarget()),
|
||||||
|
Encoders.bean(DatasourceOrganization.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingPath + "/ datasourceOrganization");
|
||||||
|
;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void prepareAlreadyLinkedAssociation(
|
private static void prepareAlreadyLinkedAssociation(
|
||||||
SparkSession spark, String alreadyLinkedPath) {
|
SparkSession spark, String workingPath) {
|
||||||
String query = "Select source key, collect_set(target) valueSet "
|
|
||||||
+ "from relation "
|
readPath(spark, workingPath + "/relation", Relation.class)
|
||||||
+ "where datainfo.deletedbyinference = false "
|
.filter(
|
||||||
+ "and lower(relClass) = '"
|
(FilterFunction<Relation>) r -> r
|
||||||
+ ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase()
|
.getRelClass()
|
||||||
+ "' "
|
.toLowerCase()
|
||||||
+ "group by source";
|
.equals(ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase()))
|
||||||
|
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Relation, KeyValueSet>) (k, it) -> {
|
||||||
|
Set<String> values = new HashSet<>();
|
||||||
|
KeyValueSet kvs = new KeyValueSet();
|
||||||
|
kvs.setKey(k);
|
||||||
|
values.add(it.next().getTarget());
|
||||||
|
it.forEachRemaining(r -> values.add(r.getTarget()));
|
||||||
|
kvs.setValueSet(new ArrayList<>(values));
|
||||||
|
return kvs;
|
||||||
|
}, Encoders.bean(KeyValueSet.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingPath + "/alreadyLinked");
|
||||||
|
|
||||||
spark
|
|
||||||
.sql(query)
|
|
||||||
.as(Encoders.bean(KeyValueSet.class))
|
|
||||||
// TODO retry to stick with datasets
|
|
||||||
.toJavaRDD()
|
|
||||||
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
|
||||||
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -118,7 +118,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Append)
|
.mode(SaveMode.Append)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(inputPath.substring(0, inputPath.indexOf("/") + 1) + "relation");
|
.json(inputPath.substring(0, inputPath.lastIndexOf("/") + 1) + "relation");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static FlatMapFunction<Tuple2<KeyValueSet, KeyValueSet>, Relation> createRelationFn() {
|
private static FlatMapFunction<Tuple2<KeyValueSet, KeyValueSet>, Relation> createRelationFn() {
|
||||||
|
@ -157,12 +157,14 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
Dataset<R> result = readPath(spark, inputPath, resultClazz);
|
Dataset<R> result = readPath(spark, inputPath, resultClazz);
|
||||||
result.createOrReplaceTempView("result");
|
result.createOrReplaceTempView("result");
|
||||||
|
|
||||||
Dataset<Row> cfhb = spark.sql("select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
|
Dataset<Row> cfhb = spark
|
||||||
+
|
.sql(
|
||||||
"from result r " +
|
"select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
|
||||||
"lateral view explode(instance) i as inst " +
|
+
|
||||||
"where r.datainfo.deletedbyinference=false");
|
"from result r " +
|
||||||
//createCfHbforResult(spark);
|
"lateral view explode(instance) i as inst " +
|
||||||
|
"where r.datainfo.deletedbyinference=false");
|
||||||
|
// createCfHbforResult(spark);
|
||||||
cfhb.createOrReplaceTempView("cfhb");
|
cfhb.createOrReplaceTempView("cfhb");
|
||||||
dsOrg.createOrReplaceTempView("rels");
|
dsOrg.createOrReplaceTempView("rels");
|
||||||
|
|
||||||
|
|
|
@ -18,8 +18,8 @@
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "p",
|
"paramName": "wp",
|
||||||
"paramLongName": "preparedInfoPath",
|
"paramLongName": "workingPath",
|
||||||
"paramDescription": "the path where prepared info have been stored",
|
"paramDescription": "the path where prepared info have been stored",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
|
|
|
@ -5,12 +5,6 @@
|
||||||
"paramDescription": "the path of the sequencial file to read",
|
"paramDescription": "the path of the sequencial file to read",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"paramName":"out",
|
|
||||||
"paramLongName":"outputPath",
|
|
||||||
"paramDescription": "the output path",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName":"w",
|
"paramName":"w",
|
||||||
"paramLongName":"workingPath",
|
"paramLongName":"workingPath",
|
||||||
|
@ -23,12 +17,7 @@
|
||||||
"paramDescription": "the name of the result table we are currently working on",
|
"paramDescription": "the name of the result table we are currently working on",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"paramName": "p",
|
|
||||||
"paramLongName": "preparedInfoPath",
|
|
||||||
"paramDescription": "the path where prepared info have been stored",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName": "ssm",
|
"paramName": "ssm",
|
||||||
"paramLongName": "isSparkSessionManaged",
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
|
|
@ -6,21 +6,9 @@
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName":"h",
|
"paramName":"wp",
|
||||||
"paramLongName":"hive_metastore_uris",
|
"paramLongName":"workingPath",
|
||||||
"paramDescription": "the hive metastore uris",
|
"paramDescription": "path where to store/find prepared/ filtered data",
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName":"dop",
|
|
||||||
"paramLongName":"datasourceOrganizationPath",
|
|
||||||
"paramDescription": "path where to store/find association from datasource and organization",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName":"alp",
|
|
||||||
"paramLongName":"alreadyLinkedPath",
|
|
||||||
"paramDescription": "path where to store/find already linked results and organizations",
|
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -220,10 +220,10 @@
|
||||||
<name>sourcePath</name>
|
<name>sourcePath</name>
|
||||||
<value>${outputPath}</value>
|
<value>${outputPath}</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<!-- <property>-->
|
||||||
<name>outputPath</name>
|
<!-- <name>outputPath</name>-->
|
||||||
<value>${outputPath}</value>
|
<!-- <value>${outputPath}</value>-->
|
||||||
</property>
|
<!-- </property>-->
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="community_organization" />
|
<ok to="community_organization" />
|
||||||
|
|
|
@ -65,7 +65,7 @@
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--whitelist</arg><arg>${whitelist}</arg>
|
<arg>--whitelist</arg><arg>${whitelist}</arg>
|
||||||
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
|
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/country/preparedInfo</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork_join_prepare_result_country"/>
|
<ok to="fork_join_prepare_result_country"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -99,10 +99,8 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/country/publication</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingDir}/country/workingP</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/preparedInfo</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_prepare"/>
|
<ok to="wait_prepare"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -129,10 +127,8 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/country/dataset</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingDir}/country/workingD</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/preparedInfo</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_prepare"/>
|
<ok to="wait_prepare"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -159,10 +155,8 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingDir}/country/workingO</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/preparedInfo</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_prepare"/>
|
<ok to="wait_prepare"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -189,10 +183,8 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/country/software</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingDir}/country/workingS</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/preparedInfo</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_prepare"/>
|
<ok to="wait_prepare"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -228,12 +220,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<<<<<<< HEAD:dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml
|
<arg>--workingPath</arg><arg>${workingDir}/country/publication</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/publication</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
=======
|
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
|
|
||||||
>>>>>>> beta:dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -262,12 +249,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<<<<<<< HEAD:dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml
|
<arg>--workingPath</arg><arg>${workingDir}/country/dataset</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/dataset</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
=======
|
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
|
|
||||||
>>>>>>> beta:dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -296,12 +278,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<<<<<<< HEAD:dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml
|
<arg>--workingPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
=======
|
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
|
||||||
>>>>>>> beta:dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -330,7 +307,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/country/software</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
|
|
@ -112,10 +112,10 @@
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
|
<arg>--allowedpis</arg><arg>${allowedpids}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -140,10 +140,10 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
|
<arg>--allowedpis</arg><arg>${allowedpids}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -168,10 +168,10 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
|
<arg>--allowedpis</arg><arg>${allowedpids}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -196,10 +196,10 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/preparedInfo/targetOrcidAssoc</arg>
|
||||||
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
|
||||||
|
<arg>--allowedpis</arg><arg>${allowedpids}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -263,7 +263,6 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -294,7 +293,6 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -325,7 +323,6 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -356,7 +353,6 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/preparedInfo/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
|
|
@ -45,9 +45,7 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--workingPath</arg><arg>${workingDir}/affiliationInstRepo</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
|
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
|
|
||||||
<arg>--blacklist</arg><arg>${blacklist}</arg>
|
<arg>--blacklist</arg><arg>${blacklist}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork_join_apply_resulttoorganization_propagation"/>
|
<ok to="fork_join_apply_resulttoorganization_propagation"/>
|
||||||
|
|
|
@ -4,10 +4,10 @@
|
||||||
<name>sourcePath</name>
|
<name>sourcePath</name>
|
||||||
<description>the source path</description>
|
<description>the source path</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<!-- <property>-->
|
||||||
<name>outputPath</name>
|
<!-- <name>outputPath</name>-->
|
||||||
<description>sets the outputPath</description>
|
<!-- <description>sets the outputPath</description>-->
|
||||||
</property>
|
<!-- </property>-->
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<global>
|
<global>
|
||||||
|
@ -21,27 +21,27 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="resume_from"/>
|
<start to="prepare_info"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<decision name="resume_from">
|
<!-- <decision name="resume_from">-->
|
||||||
<switch>
|
<!-- <switch>-->
|
||||||
<case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>
|
<!-- <case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>-->
|
||||||
<default to="reset_outputpath"/> <!-- first action to be done when downloadDump is to be performed -->
|
<!-- <default to="reset_outputpath"/> <!– first action to be done when downloadDump is to be performed –>-->
|
||||||
</switch>
|
<!-- </switch>-->
|
||||||
</decision>
|
<!-- </decision>-->
|
||||||
|
|
||||||
<action name="reset_outputpath">
|
<!-- <action name="reset_outputpath">-->
|
||||||
<fs>
|
<!-- <fs>-->
|
||||||
<delete path="${outputPath}"/>
|
<!-- <delete path="${outputPath}"/>-->
|
||||||
<mkdir path="${outputPath}"/>
|
<!-- <mkdir path="${outputPath}"/>-->
|
||||||
</fs>
|
<!-- </fs>-->
|
||||||
<ok to="prepare_info"/>
|
<!-- <ok to="prepare_info"/>-->
|
||||||
<error to="Kill"/>
|
<!-- <error to="Kill"/>-->
|
||||||
</action>
|
<!-- </action>-->
|
||||||
|
|
||||||
|
|
||||||
<action name="prepare_info">
|
<action name="prepare_info">
|
||||||
|
@ -91,7 +91,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--relationPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/relation</arg>
|
<arg>--relationPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--leavesPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/leavesPath</arg>
|
<arg>--leavesPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/leavesPath</arg>
|
||||||
<arg>--childParentPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/childParentPath</arg>
|
<arg>--childParentPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/childParentPath</arg>
|
||||||
<arg>--resultOrgPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/resultOrgPath</arg>
|
<arg>--resultOrgPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/resultOrgPath</arg>
|
||||||
|
|
|
@ -5,7 +5,6 @@ import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.neethi.Assertion;
|
import org.apache.neethi.Assertion;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -26,6 +25,8 @@ import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class PrepareStep1Test {
|
public class PrepareStep1Test {
|
||||||
|
|
||||||
|
@ -89,154 +90,118 @@ public class PrepareStep1Test {
|
||||||
|
|
||||||
Assertions.assertEquals(0, tmp.count());
|
Assertions.assertEquals(0, tmp.count());
|
||||||
|
|
||||||
Assertions.assertEquals(7, sc
|
Assertions
|
||||||
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
|
.assertEquals(
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class)).count());
|
7, sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
|
||||||
|
.count());
|
||||||
|
|
||||||
Assertions.assertEquals(0, sc
|
Assertions
|
||||||
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
|
.assertEquals(
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).count());
|
0, sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
|
||||||
|
.count());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void oneUpdateTest() throws Exception {
|
void matchTest() throws Exception {
|
||||||
SparkOrcidToResultFromSemRelJob
|
|
||||||
|
final String sourcePath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
PrepareResultOrcidAssociationStep1
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
"-isTest",
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
Boolean.TRUE.toString(),
|
"-sourcePath", sourcePath,
|
||||||
"-isSparkSessionManaged",
|
"-resultTableName", Publication.class.getCanonicalName(),
|
||||||
Boolean.FALSE.toString(),
|
"-outputPath", workingDir.toString() + "/preparedInfo",
|
||||||
"-sourcePath",
|
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
|
||||||
getClass()
|
"-allowedpids", "orcid;orcid_pending"
|
||||||
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate")
|
|
||||||
.getPath(),
|
|
||||||
"-hive_metastore_uris",
|
|
||||||
"",
|
|
||||||
"-resultTableName",
|
|
||||||
"eu.dnetlib.dhp.schema.oaf.Dataset",
|
|
||||||
"-outputPath",
|
|
||||||
workingDir.toString() + "/dataset",
|
|
||||||
"-possibleUpdatesPath",
|
|
||||||
getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc")
|
|
||||||
.getPath()
|
|
||||||
});
|
});
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaRDD<Dataset> tmp = sc
|
JavaRDD<ResultOrcidList> tmp = sc
|
||||||
.textFile(workingDir.toString() + "/dataset")
|
.textFile(workingDir.toString() + "/preparedInfo/publication")
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
|
||||||
|
|
||||||
// tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
|
Assertions.assertEquals(1, tmp.count());
|
||||||
|
|
||||||
Assertions.assertEquals(10, tmp.count());
|
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
|
|
||||||
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
|
||||||
|
|
||||||
verificationDataset.createOrReplaceTempView("dataset");
|
|
||||||
|
|
||||||
String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType "
|
|
||||||
+ "from dataset "
|
|
||||||
+ "lateral view explode(author) a as MyT "
|
|
||||||
+ "lateral view explode(MyT.pid) p as MyP "
|
|
||||||
+ "where MyP.datainfo.inferenceprovenance = 'propagation'";
|
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Row> propagatedAuthors = spark.sql(query);
|
|
||||||
|
|
||||||
Assertions.assertEquals(1, propagatedAuthors.count());
|
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
1,
|
1, tmp
|
||||||
propagatedAuthors
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
.filter(
|
.count());
|
||||||
"id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' "
|
Assertions
|
||||||
+ "and name = 'Vajinder' and surname = 'Kumar' and pidType = '" +
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.size());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"0000-0002-5001-6911",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getOrcid());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"Barbarić-Mikočević, Željka",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getFullname());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"Željka",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getName());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"Barbarić-Mikočević",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getSurname());
|
||||||
|
|
||||||
ModelConstants.ORCID_PENDING + "'")
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
7, sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
|
||||||
.count());
|
.count());
|
||||||
|
|
||||||
Assertions.assertEquals(1, propagatedAuthors.filter("pid = '0000-0002-8825-3517'").count());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void twoUpdatesTest() throws Exception {
|
|
||||||
SparkOrcidToResultFromSemRelJob
|
|
||||||
.main(
|
|
||||||
new String[] {
|
|
||||||
"-isTest",
|
|
||||||
Boolean.TRUE.toString(),
|
|
||||||
"-isSparkSessionManaged",
|
|
||||||
Boolean.FALSE.toString(),
|
|
||||||
"-sourcePath",
|
|
||||||
getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates")
|
|
||||||
.getPath(),
|
|
||||||
"-hive_metastore_uris",
|
|
||||||
"",
|
|
||||||
"-resultTableName",
|
|
||||||
"eu.dnetlib.dhp.schema.oaf.Dataset",
|
|
||||||
"-outputPath",
|
|
||||||
workingDir.toString() + "/dataset",
|
|
||||||
"-possibleUpdatesPath",
|
|
||||||
getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc")
|
|
||||||
.getPath()
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
JavaRDD<Dataset> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/dataset")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(10, tmp.count());
|
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
|
|
||||||
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
|
||||||
|
|
||||||
verificationDataset.createOrReplaceTempView("dataset");
|
|
||||||
|
|
||||||
String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType "
|
|
||||||
+ "from dataset "
|
|
||||||
+ "lateral view explode(author) a as MyT "
|
|
||||||
+ "lateral view explode(MyT.pid) p as MyP "
|
|
||||||
+ "where MyP.datainfo.inferenceprovenance = 'propagation'";
|
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Row> propagatedAuthors = spark.sql(query);
|
|
||||||
|
|
||||||
Assertions.assertEquals(2, propagatedAuthors.count());
|
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
|
1, sc
|
||||||
Assertions
|
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
|
||||||
.assertEquals(
|
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class))
|
||||||
1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count());
|
|
||||||
|
|
||||||
query = "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType "
|
|
||||||
+ "from dataset "
|
|
||||||
+ "lateral view explode(author) a as MyT "
|
|
||||||
+ "lateral view explode(MyT.pid) p as MyP ";
|
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Row> authorsExplodedPids = spark.sql(query);
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
authorsExplodedPids
|
|
||||||
.filter(
|
|
||||||
"name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'")
|
|
||||||
.count());
|
.count());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,222 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
|
public class PrepareStep2Test {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(PrepareStep2Test.class);
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
private static Path workingDir;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void beforeAll() throws IOException {
|
||||||
|
workingDir = Files.createTempDirectory(PrepareStep2Test.class.getSimpleName());
|
||||||
|
log.info("using work dir {}", workingDir);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(PrepareStep2Test.class.getSimpleName());
|
||||||
|
|
||||||
|
conf.setMaster("local[*]");
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
conf.set("spark.ui.enabled", "false");
|
||||||
|
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||||
|
|
||||||
|
spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(PrepareStep2Test.class.getSimpleName())
|
||||||
|
.config(conf)
|
||||||
|
.getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void afterAll() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testMatch() throws Exception {
|
||||||
|
|
||||||
|
final String sourcePath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/resultSubset")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
PrepareResultOrcidAssociationStep2
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath", sourcePath,
|
||||||
|
"-outputPath", workingDir.toString() + "/preparedInfo/mergedOrcidAssoc"
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<ResultOrcidList> tmp = sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/mergedOrcidAssoc")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
|
||||||
|
|
||||||
|
Assertions.assertEquals(1, tmp.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1,
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
2, tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertTrue(
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(aa -> aa.getOrcid().equals("0000-0002-1234-5678")));
|
||||||
|
Assertions
|
||||||
|
.assertTrue(
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(aa -> aa.getOrcid().equals("0000-0002-5001-6911")));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void matchTest() throws Exception {
|
||||||
|
|
||||||
|
final String sourcePath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
PrepareResultOrcidAssociationStep1
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath", sourcePath,
|
||||||
|
"-resultTableName", Publication.class.getCanonicalName(),
|
||||||
|
"-outputPath", workingDir.toString() + "/preparedInfo",
|
||||||
|
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
|
||||||
|
"-allowedpids", "orcid;orcid_pending"
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<ResultOrcidList> tmp = sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/publication")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
|
||||||
|
|
||||||
|
Assertions.assertEquals(1, tmp.count());
|
||||||
|
|
||||||
|
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.count());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.size());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"0000-0002-5001-6911",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getOrcid());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"Barbarić-Mikočević, Željka",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getFullname());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"Željka",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getName());
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"Barbarić-Mikočević",
|
||||||
|
tmp
|
||||||
|
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
|
||||||
|
.collect()
|
||||||
|
.get(0)
|
||||||
|
.getAuthorList()
|
||||||
|
.get(0)
|
||||||
|
.getSurname());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
7, sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, sc
|
||||||
|
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1 @@
|
||||||
|
{"resultId":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","authorList":[{"name":"Željka","surname":"Barbarić-Mikočević","fullname":"Barbarić-Mikočević, Željka","orcid":"0000-0002-5001-6911"}]}
|
|
@ -0,0 +1 @@
|
||||||
|
{"resultId":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","authorList":[{"name":"Vesna","surname":"Džimbeg-Malčić","fullname":"Džimbeg-Malčić, Vesna","orcid":"0000-0002-1234-5678"}]}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue