forked from D-Net/dnet-hadoop
Merge pull request 'graph cleaning refactoring' (#282) from graph_cleaning_refactoring into beta
Reviewed-on: D-Net/dnet-hadoop#282
This commit is contained in:
commit
cdd33f7445
|
@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
@ -38,6 +40,127 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
|
||||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||||
|
|
||||||
|
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
|
||||||
|
if (ModelSupport.isSubClass(value, Result.class)) {
|
||||||
|
final Result res = (Result) value;
|
||||||
|
if (shouldCleanContext(res, verifyParam)) {
|
||||||
|
res
|
||||||
|
.setContext(
|
||||||
|
res
|
||||||
|
.getContext()
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
return (T) res;
|
||||||
|
} else {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean shouldCleanContext(Result res, String verifyParam) {
|
||||||
|
boolean titleMatch = res
|
||||||
|
.getTitle()
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
t -> t
|
||||||
|
.getQualifier()
|
||||||
|
.getClassid()
|
||||||
|
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
|
||||||
|
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()));
|
||||||
|
|
||||||
|
return titleMatch && Objects.nonNull(res.getContext());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> T cleanCountry(T value, String[] verifyParam, Set<String> hostedBy,
|
||||||
|
String collectedfrom, String country) {
|
||||||
|
if (ModelSupport.isSubClass(value, Result.class)) {
|
||||||
|
final Result res = (Result) value;
|
||||||
|
if (res.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
|
||||||
|
!res.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
|
||||||
|
return (T) res;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<StructuredProperty> ids = getPidsAndAltIds(res).collect(Collectors.toList());
|
||||||
|
if (ids
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
p -> p
|
||||||
|
.getQualifier()
|
||||||
|
.getClassid()
|
||||||
|
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
|
||||||
|
res
|
||||||
|
.setCountry(
|
||||||
|
res
|
||||||
|
.getCountry()
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
c -> toTakeCountry(c, country))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return (T) res;
|
||||||
|
} else {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
|
||||||
|
final Stream<StructuredProperty> resultPids = Optional
|
||||||
|
.ofNullable(r.getPid())
|
||||||
|
.map(Collection::stream)
|
||||||
|
.orElse(Stream.empty());
|
||||||
|
|
||||||
|
final Stream<StructuredProperty> instancePids = Optional
|
||||||
|
.ofNullable(r.getInstance())
|
||||||
|
.map(
|
||||||
|
instance -> instance
|
||||||
|
.stream()
|
||||||
|
.flatMap(
|
||||||
|
i -> Optional
|
||||||
|
.ofNullable(i.getPid())
|
||||||
|
.map(Collection::stream)
|
||||||
|
.orElse(Stream.empty())))
|
||||||
|
.orElse(Stream.empty());
|
||||||
|
|
||||||
|
final Stream<StructuredProperty> instanceAltIds = Optional
|
||||||
|
.ofNullable(r.getInstance())
|
||||||
|
.map(
|
||||||
|
instance -> instance
|
||||||
|
.stream()
|
||||||
|
.flatMap(
|
||||||
|
i -> Optional
|
||||||
|
.ofNullable(i.getAlternateIdentifier())
|
||||||
|
.map(Collection::stream)
|
||||||
|
.orElse(Stream.empty())))
|
||||||
|
.orElse(Stream.empty());
|
||||||
|
|
||||||
|
return Stream
|
||||||
|
.concat(
|
||||||
|
Stream.concat(resultPids, instancePids),
|
||||||
|
instanceAltIds);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean pidInParam(String value, String[] verifyParam) {
|
||||||
|
for (String s : verifyParam)
|
||||||
|
if (value.startsWith(s))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean toTakeCountry(Country c, String country) {
|
||||||
|
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
|
||||||
|
// inserted via propagation
|
||||||
|
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
|
||||||
|
return true;
|
||||||
|
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
|
||||||
|
return true;
|
||||||
|
return !(c
|
||||||
|
.getClassid()
|
||||||
|
.equalsIgnoreCase(country) &&
|
||||||
|
c.getDataInfo().getInferenceprovenance().equals("propagation"));
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
// nothing to clean here
|
// nothing to clean here
|
||||||
|
|
|
@ -1,122 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class CleanContextSparkJob implements Serializable {
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CleanContextSparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json"));
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
String workingDir = parser.get("workingDir");
|
|
||||||
log.info("workingDir: {}", workingDir);
|
|
||||||
|
|
||||||
String contextId = parser.get("contextId");
|
|
||||||
log.info("contextId: {}", contextId);
|
|
||||||
|
|
||||||
String verifyParam = parser.get("verifyParam");
|
|
||||||
log.info("verifyParam: {}", verifyParam);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
|
||||||
|
|
||||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
|
|
||||||
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingDir);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
|
|
||||||
String inputPath, Class<T> entityClazz, String workingDir) {
|
|
||||||
Dataset<T> res = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz));
|
|
||||||
|
|
||||||
res.map((MapFunction<T, T>) r -> {
|
|
||||||
if (!r
|
|
||||||
.getTitle()
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
t -> t
|
|
||||||
.getQualifier()
|
|
||||||
.getClassid()
|
|
||||||
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
|
|
||||||
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))) {
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
r
|
|
||||||
.setContext(
|
|
||||||
r
|
|
||||||
.getContext()
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
c -> !c.getId().split("::")[0]
|
|
||||||
.equalsIgnoreCase(contextId))
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
return r;
|
|
||||||
}, Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir);
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(workingDir)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(inputPath);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -3,12 +3,16 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.*;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
@ -17,15 +21,22 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class CleanGraphSparkJob {
|
public class CleanGraphSparkJob {
|
||||||
|
|
||||||
|
@ -33,8 +44,13 @@ public class CleanGraphSparkJob {
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
private ArgumentApplicationParser parser;
|
||||||
|
|
||||||
|
/**
 * Creates the job around an already populated argument parser.
 *
 * @param parser the parser stored on this instance; the job parameters are read from it
 */
public CleanGraphSparkJob(ArgumentApplicationParser parser) {
|
||||||
|
this.parser = parser;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
String jsonConfiguration = IOUtils
|
String jsonConfiguration = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
CleanGraphSparkJob.class
|
CleanGraphSparkJob.class
|
||||||
|
@ -49,30 +65,70 @@ public class CleanGraphSparkJob {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
String isLookupUrl = parser.get("isLookupUrl");
|
||||||
|
log.info("isLookupUrl: {}", isLookupUrl);
|
||||||
|
|
||||||
|
ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(parser).run(isSparkSessionManaged, isLookup);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run(Boolean isSparkSessionManaged, ISLookUpService isLookUpService)
|
||||||
|
throws ISLookUpException, ClassNotFoundException {
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
String inputPath = parser.get("inputPath");
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
String outputPath = parser.get("outputPath");
|
String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
String isLookupUrl = parser.get("isLookupUrl");
|
|
||||||
log.info("isLookupUrl: {}", isLookupUrl);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
String graphTableClassName = parser.get("graphTableClassName");
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
log.info("graphTableClassName: {}", graphTableClassName);
|
||||||
|
|
||||||
|
String contextId = parser.get("contextId");
|
||||||
|
log.info("contextId: {}", contextId);
|
||||||
|
|
||||||
|
String verifyParam = parser.get("verifyParam");
|
||||||
|
log.info("verifyParam: {}", verifyParam);
|
||||||
|
|
||||||
|
String datasourcePath = parser.get("hostedBy");
|
||||||
|
log.info("datasourcePath: {}", datasourcePath);
|
||||||
|
|
||||||
|
String country = parser.get("country");
|
||||||
|
log.info("country: {}", country);
|
||||||
|
|
||||||
|
String[] verifyCountryParam = Optional
|
||||||
|
.ofNullable(parser.get("verifyCountryParam"))
|
||||||
|
.map(s -> s.split(";"))
|
||||||
|
.orElse(new String[] {});
|
||||||
|
log.info("verifyCountryParam: {}", verifyCountryParam);
|
||||||
|
|
||||||
|
String collectedfrom = parser.get("collectedfrom");
|
||||||
|
log.info("collectedfrom: {}", collectedfrom);
|
||||||
|
|
||||||
|
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
|
||||||
|
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
|
||||||
|
|
||||||
|
Boolean deepClean = Optional
|
||||||
|
.ofNullable(parser.get("deepClean"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.FALSE);
|
||||||
|
log.info("deepClean: {}", deepClean);
|
||||||
|
|
||||||
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
|
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
|
||||||
|
|
||||||
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(CleanGraphSparkJob.class.getSimpleName() + "#" + entityClazz.getSimpleName());
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
||||||
cleanGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
|
cleanGraphTable(
|
||||||
|
spark, vocs, inputPath, entityClazz, outputPath, contextId, verifyParam, datasourcePath, country,
|
||||||
|
verifyCountryParam, collectedfrom, dsMasterDuplicatePath, deepClean);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,21 +137,88 @@ public class CleanGraphSparkJob {
|
||||||
VocabularyGroup vocs,
|
VocabularyGroup vocs,
|
||||||
String inputPath,
|
String inputPath,
|
||||||
Class<T> clazz,
|
Class<T> clazz,
|
||||||
String outputPath) {
|
String outputPath, String contextId, String verifyParam, String datasourcePath, String country,
|
||||||
|
String[] verifyCountryParam, String collectedfrom, String dsMasterDuplicatePath,
|
||||||
|
Boolean deepClean) {
|
||||||
|
|
||||||
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
|
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
|
||||||
|
|
||||||
readTableFromPath(spark, inputPath, clazz)
|
final Dataset<T> cleaned_basic = readTableFromPath(spark, inputPath, clazz)
|
||||||
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
|
||||||
.filter((FilterFunction<T>) GraphCleaningFunctions::filter)
|
.filter((FilterFunction<T>) GraphCleaningFunctions::filter);
|
||||||
|
|
||||||
|
// read the master-duplicate tuples
|
||||||
|
Dataset<MasterDuplicate> md = spark
|
||||||
|
.read()
|
||||||
|
.textFile(dsMasterDuplicatePath)
|
||||||
|
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
|
||||||
|
|
||||||
|
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
|
||||||
|
Dataset<IdCfHbMapping> resolved = spark
|
||||||
|
.read()
|
||||||
|
.textFile(inputPath)
|
||||||
|
.map(as(clazz), Encoders.bean(clazz))
|
||||||
|
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
|
||||||
|
|
||||||
|
if (Boolean.FALSE.equals(deepClean)) {
|
||||||
|
|
||||||
|
if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
|
||||||
|
save(fixCFHB(clazz, cleaned_basic, md, resolved), outputPath);
|
||||||
|
} else {
|
||||||
|
save(cleaned_basic, outputPath);
|
||||||
|
}
|
||||||
|
} else if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
|
||||||
|
|
||||||
|
// load the hostedby mapping
|
||||||
|
Set<String> hostedBy = Sets
|
||||||
|
.newHashSet(
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(datasourcePath)
|
||||||
|
.collectAsList());
|
||||||
|
|
||||||
|
// perform the deep cleaning steps
|
||||||
|
final Dataset<T> cleaned_deep = fixCFHB(clazz, cleaned_basic, md, resolved)
|
||||||
|
.map(
|
||||||
|
(MapFunction<T, T>) value -> GraphCleaningFunctions.cleanContext(value, contextId, verifyParam),
|
||||||
|
Encoders.bean(clazz))
|
||||||
|
.map(
|
||||||
|
(MapFunction<T, T>) value -> GraphCleaningFunctions
|
||||||
|
.cleanCountry(value, verifyCountryParam, hostedBy, collectedfrom, country),
|
||||||
|
Encoders.bean(clazz));
|
||||||
|
|
||||||
|
save(cleaned_deep, outputPath);
|
||||||
|
} else {
|
||||||
|
save(cleaned_basic, outputPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> void save(final Dataset<T> dataset, final String outputPath) {
|
||||||
|
dataset
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> Dataset<T> fixCFHB(Class<T> clazz, Dataset<T> results, Dataset<MasterDuplicate> md,
|
||||||
|
Dataset<IdCfHbMapping> resolved) {
|
||||||
|
|
||||||
|
// set the EMPTY master ID/NAME
|
||||||
|
Dataset<IdCfHbMapping> resolvedDs = resolved
|
||||||
|
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
|
||||||
|
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
|
||||||
|
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()));
|
||||||
|
|
||||||
|
return results
|
||||||
|
.joinWith(resolvedDs, results.col("id").equalTo(resolvedDs.col("resultId")), "left")
|
||||||
|
.groupByKey(
|
||||||
|
(MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> ((Result) t._1()).getId(), Encoders.STRING())
|
||||||
|
.mapGroups(getMapGroupsFunction(), Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
||||||
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
||||||
|
|
||||||
|
@ -103,9 +226,104 @@ public class CleanGraphSparkJob {
|
||||||
return spark
|
return spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(inputEntityPath)
|
.textFile(inputEntityPath)
|
||||||
.map(
|
.map(as(clazz), Encoders.bean(clazz));
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
|
}
|
||||||
Encoders.bean(clazz));
|
|
||||||
|
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
||||||
|
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
|
||||||
|
return r -> Stream
|
||||||
|
.concat(
|
||||||
|
Optional
|
||||||
|
.ofNullable(r.getCollectedfrom())
|
||||||
|
.map(cf -> cf.stream().map(KeyValue::getKey))
|
||||||
|
.orElse(Stream.empty()),
|
||||||
|
Stream
|
||||||
|
.concat(
|
||||||
|
Optional
|
||||||
|
.ofNullable(((Result) r).getInstance())
|
||||||
|
.map(
|
||||||
|
instances -> instances
|
||||||
|
.stream()
|
||||||
|
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
|
||||||
|
.orElse(Stream.empty())
|
||||||
|
.filter(StringUtils::isNotBlank),
|
||||||
|
Optional
|
||||||
|
.ofNullable(((Result) r).getInstance())
|
||||||
|
.map(
|
||||||
|
instances -> instances
|
||||||
|
.stream()
|
||||||
|
.map(
|
||||||
|
i -> Optional
|
||||||
|
.ofNullable(i.getCollectedfrom())
|
||||||
|
.map(KeyValue::getKey)
|
||||||
|
.orElse("")))
|
||||||
|
.orElse(Stream.empty())
|
||||||
|
.filter(StringUtils::isNotBlank)))
|
||||||
|
.distinct()
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.map(cfHb -> asIdCfHbMapping(((Result) r).getId(), cfHb))
|
||||||
|
.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
|
||||||
|
return t -> {
|
||||||
|
final IdCfHbMapping mapping = t._1();
|
||||||
|
Optional
|
||||||
|
.ofNullable(t._2())
|
||||||
|
.ifPresent(t2 -> {
|
||||||
|
mapping.setMasterId(t2.getMasterId());
|
||||||
|
mapping.setMasterName(t2.getMasterName());
|
||||||
|
|
||||||
|
});
|
||||||
|
return mapping;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
|
||||||
|
IdCfHbMapping m = new IdCfHbMapping(resultId);
|
||||||
|
m.setCfhb(cfHb);
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
|
||||||
|
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
|
||||||
|
@Override
|
||||||
|
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
|
||||||
|
final Tuple2<T, IdCfHbMapping> first = values.next();
|
||||||
|
final T res = first._1();
|
||||||
|
|
||||||
|
updateResult(res, first._2());
|
||||||
|
values.forEachRemaining(t -> updateResult(res, t._2()));
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateResult(T res, IdCfHbMapping m) {
|
||||||
|
if (Objects.nonNull(m)) {
|
||||||
|
filter(res.getCollectedfrom()).forEach(kv -> updateKeyValue(kv, m));
|
||||||
|
((Result) res).getInstance().forEach(i -> {
|
||||||
|
updateKeyValue(i.getHostedby(), m);
|
||||||
|
updateKeyValue(i.getCollectedfrom(), m);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Stream<KeyValue> filter(List<KeyValue> kvs) {
|
||||||
|
return kvs
|
||||||
|
.stream()
|
||||||
|
.filter(kv -> StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
|
||||||
|
if (Objects.nonNull(kv) && Objects.nonNull(kv.getKey()) && kv.getKey().equals(a.getCfhb())) {
|
||||||
|
kv.setKey(a.getMasterId());
|
||||||
|
kv.setValue(a.getMasterName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.country;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -21,7 +20,6 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -1,227 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
||||||
import org.apache.spark.sql.*;
|
|
||||||
import org.apache.spark.sql.expressions.Aggregator;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
|
||||||
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
public class CleanCfHbSparkJob {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJob.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CleanCountrySparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_cfhb_parameters.json"));
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
String resolvedPath = parser.get("resolvedPath");
|
|
||||||
log.info("resolvedPath: {}", resolvedPath);
|
|
||||||
|
|
||||||
String outputPath = parser.get("outputPath");
|
|
||||||
log.info("outputPath: {}", outputPath);
|
|
||||||
|
|
||||||
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
|
|
||||||
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
|
||||||
|
|
||||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
|
||||||
HdfsSupport.remove(resolvedPath, spark.sparkContext().hadoopConfiguration());
|
|
||||||
cleanCfHb(
|
|
||||||
spark, inputPath, entityClazz, resolvedPath, dsMasterDuplicatePath, outputPath);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
|
|
||||||
String resolvedPath, String masterDuplicatePath, String outputPath) {
|
|
||||||
|
|
||||||
// read the master-duplicate tuples
|
|
||||||
Dataset<MasterDuplicate> md = spark
|
|
||||||
.read()
|
|
||||||
.textFile(masterDuplicatePath)
|
|
||||||
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
|
|
||||||
|
|
||||||
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
|
|
||||||
Dataset<IdCfHbMapping> resolved = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(as(entityClazz), Encoders.bean(entityClazz))
|
|
||||||
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
|
|
||||||
|
|
||||||
// set the EMPTY master ID/NAME and save it
|
|
||||||
resolved
|
|
||||||
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
|
|
||||||
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
|
|
||||||
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.json(resolvedPath);
|
|
||||||
|
|
||||||
// read again the resolved CF|HB mapping
|
|
||||||
Dataset<IdCfHbMapping> resolvedDS = spark
|
|
||||||
.read()
|
|
||||||
.textFile(resolvedPath)
|
|
||||||
.map(as(IdCfHbMapping.class), Encoders.bean(IdCfHbMapping.class));
|
|
||||||
|
|
||||||
// read the result table
|
|
||||||
Dataset<T> res = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(as(entityClazz), Encoders.bean(entityClazz));
|
|
||||||
|
|
||||||
// Join the results with the resolved CF|HB mapping, apply the mapping and save it
|
|
||||||
res
|
|
||||||
.joinWith(resolvedDS, res.col("id").equalTo(resolvedDS.col("resultId")), "left")
|
|
||||||
.groupByKey((MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> t._1().getId(), Encoders.STRING())
|
|
||||||
.mapGroups(getMapGroupsFunction(), Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(outputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
|
|
||||||
return t -> {
|
|
||||||
final IdCfHbMapping mapping = t._1();
|
|
||||||
Optional
|
|
||||||
.ofNullable(t._2())
|
|
||||||
.ifPresent(t2 -> {
|
|
||||||
mapping.setMasterId(t2.getMasterId());
|
|
||||||
mapping.setMasterName(t2.getMasterName());
|
|
||||||
|
|
||||||
});
|
|
||||||
return mapping;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
|
|
||||||
return r -> Stream
|
|
||||||
.concat(
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getCollectedfrom())
|
|
||||||
.map(cf -> cf.stream().map(KeyValue::getKey))
|
|
||||||
.orElse(Stream.empty()),
|
|
||||||
Stream
|
|
||||||
.concat(
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instances -> instances
|
|
||||||
.stream()
|
|
||||||
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
|
|
||||||
.orElse(Stream.empty())
|
|
||||||
.filter(StringUtils::isNotBlank),
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instances -> instances
|
|
||||||
.stream()
|
|
||||||
.map(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getCollectedfrom())
|
|
||||||
.map(KeyValue::getKey)
|
|
||||||
.orElse("")))
|
|
||||||
.orElse(Stream.empty())
|
|
||||||
.filter(StringUtils::isNotBlank)))
|
|
||||||
.distinct()
|
|
||||||
.filter(StringUtils::isNotBlank)
|
|
||||||
.map(cfHb -> asIdCfHbMapping(r.getId(), cfHb))
|
|
||||||
.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
|
|
||||||
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
|
|
||||||
@Override
|
|
||||||
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
|
|
||||||
final Tuple2<T, IdCfHbMapping> first = values.next();
|
|
||||||
final T res = first._1();
|
|
||||||
|
|
||||||
updateResult(res, first._2());
|
|
||||||
values.forEachRemaining(t -> updateResult(res, t._2()));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateResult(T res, IdCfHbMapping m) {
|
|
||||||
if (Objects.nonNull(m)) {
|
|
||||||
res.getCollectedfrom().forEach(kv -> updateKeyValue(kv, m));
|
|
||||||
res.getInstance().forEach(i -> {
|
|
||||||
updateKeyValue(i.getHostedby(), m);
|
|
||||||
updateKeyValue(i.getCollectedfrom(), m);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
|
|
||||||
if (kv.getKey().equals(a.getCfhb())) {
|
|
||||||
kv.setKey(a.getMasterId());
|
|
||||||
kv.setValue(a.getMasterName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
|
|
||||||
IdCfHbMapping m = new IdCfHbMapping(resultId);
|
|
||||||
m.setCfhb(cfHb);
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
|
||||||
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,211 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.country;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import javax.swing.text.html.Option;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author miriam.baglioni
|
|
||||||
* @Date 20/07/22
|
|
||||||
*/
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
|
||||||
|
|
||||||
public class CleanCountrySparkJob implements Serializable {
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanCountrySparkJob.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CleanCountrySparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json"));
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
String inputPath = parser.get("inputPath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
String workingDir = parser.get("workingDir");
|
|
||||||
log.info("workingDir: {}", workingDir);
|
|
||||||
|
|
||||||
String datasourcePath = parser.get("hostedBy");
|
|
||||||
log.info("datasourcePath: {}", datasourcePath);
|
|
||||||
|
|
||||||
String country = parser.get("country");
|
|
||||||
log.info("country: {}", country);
|
|
||||||
|
|
||||||
String[] verifyParam = parser.get("verifyParam").split(";");
|
|
||||||
log.info("verifyParam: {}", verifyParam);
|
|
||||||
|
|
||||||
String collectedfrom = parser.get("collectedfrom");
|
|
||||||
log.info("collectedfrom: {}", collectedfrom);
|
|
||||||
|
|
||||||
String graphTableClassName = parser.get("graphTableClassName");
|
|
||||||
log.info("graphTableClassName: {}", graphTableClassName);
|
|
||||||
|
|
||||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
|
|
||||||
cleanCountry(
|
|
||||||
spark, country, verifyParam, inputPath, entityClazz, workingDir, collectedfrom, datasourcePath);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
|
|
||||||
String inputPath, Class<T> entityClazz, String workingDir, String collectedfrom, String datasourcePath) {
|
|
||||||
|
|
||||||
List<String> hostedBy = spark
|
|
||||||
.read()
|
|
||||||
.textFile(datasourcePath)
|
|
||||||
.collectAsList();
|
|
||||||
|
|
||||||
Dataset<T> res = spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz));
|
|
||||||
|
|
||||||
res.map((MapFunction<T, T>) r -> {
|
|
||||||
if (r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
|
|
||||||
!r.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<StructuredProperty> ids = getPidsAndAltIds(r).collect(Collectors.toList());
|
|
||||||
if (ids
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
p -> p
|
|
||||||
.getQualifier()
|
|
||||||
.getClassid()
|
|
||||||
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
|
|
||||||
r
|
|
||||||
.setCountry(
|
|
||||||
r
|
|
||||||
.getCountry()
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
c -> toTakeCountry(c, country))
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return r;
|
|
||||||
}, Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir);
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(workingDir)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
|
||||||
Encoders.bean(entityClazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(inputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
|
|
||||||
final Stream<StructuredProperty> resultPids = Optional
|
|
||||||
.ofNullable(r.getPid())
|
|
||||||
.map(Collection::stream)
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
final Stream<StructuredProperty> instancePids = Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instance -> instance
|
|
||||||
.stream()
|
|
||||||
.flatMap(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getPid())
|
|
||||||
.map(Collection::stream)
|
|
||||||
.orElse(Stream.empty())))
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
final Stream<StructuredProperty> instanceAltIds = Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instance -> instance
|
|
||||||
.stream()
|
|
||||||
.flatMap(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getAlternateIdentifier())
|
|
||||||
.map(Collection::stream)
|
|
||||||
.orElse(Stream.empty())))
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
return Stream
|
|
||||||
.concat(
|
|
||||||
Stream.concat(resultPids, instancePids),
|
|
||||||
instanceAltIds);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean pidInParam(String value, String[] verifyParam) {
|
|
||||||
for (String s : verifyParam)
|
|
||||||
if (value.startsWith(s))
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean toTakeCountry(Country c, String country) {
|
|
||||||
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
|
|
||||||
// inserted via propagation
|
|
||||||
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
|
|
||||||
return true;
|
|
||||||
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
|
|
||||||
return true;
|
|
||||||
return !(c
|
|
||||||
.getClassid()
|
|
||||||
.equalsIgnoreCase(country) &&
|
|
||||||
c.getDataInfo().getInferenceprovenance().equals("propagation"));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -83,12 +83,57 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="fork_clean_graph"/>
|
<start to="prepare_info"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
|
<!-- Entry fork of the workflow: starts the two preparation actions (datasource selection by country and master-duplicate extraction) as parallel paths -->
<fork name="prepare_info">
|
||||||
|
<path start="select_datasourceId_from_country"/>
|
||||||
|
<path start="get_ds_master_duplicate"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<!-- Spark action (yarn, cluster mode) running GetDatasourceFromCountry on ${graphInputPath} for ${country}; its workingDir (${workingDir}/working/hostedby) is the path later passed to the cleaning actions via the hostedBy argument. Goes to wait_prepare on success, to Kill on error -->
<action name="select_datasourceId_from_country">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Select datasource ID from country</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.clean.GetDatasourceFromCountry</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=10000
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${graphInputPath}</arg>
|
||||||
|
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="wait_prepare"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Java action running MasterDuplicateAction with the PostgreSQL connection parameters; its hdfsPath (${workingDir}/masterduplicate) is the path later passed to the cleaning actions via the masterDuplicatePath argument. Goes to wait_prepare on success, to Kill on error -->
<action name="get_ds_master_duplicate">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
|
||||||
|
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
||||||
|
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||||
|
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||||
|
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="wait_prepare"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Joins the two preparation paths; the graph cleaning fork starts only after both have completed -->
<join name="wait_prepare" to="fork_clean_graph"/>
|
||||||
|
|
||||||
<fork name="fork_clean_graph">
|
<fork name="fork_clean_graph">
|
||||||
<path start="clean_publication"/>
|
<path start="clean_publication"/>
|
||||||
<path start="clean_dataset"/>
|
<path start="clean_dataset"/>
|
||||||
|
@ -115,12 +160,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -141,12 +194,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -167,12 +228,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -193,12 +262,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -219,12 +296,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=1000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -245,12 +330,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=1000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -271,12 +364,20 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -297,486 +398,26 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=20000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||||
|
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||||
|
<arg>--country</arg><arg>${country}</arg>
|
||||||
|
<arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
|
||||||
|
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||||
|
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||||
|
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||||
|
<arg>--deepClean</arg><arg>${shouldClean}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_clean"/>
|
<ok to="wait_clean"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait_clean" to="clean_context"/>
|
<join name="wait_clean" to="End"/>
|
||||||
|
|
||||||
<decision name="clean_context">
|
|
||||||
<switch>
|
|
||||||
<case to="fork_clean_context">${wf:conf('shouldClean') eq true}</case>
|
|
||||||
<default to="End"/>
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<fork name="fork_clean_context">
|
|
||||||
<path start="clean_publication_context"/>
|
|
||||||
<path start="clean_dataset_context"/>
|
|
||||||
<path start="clean_otherresearchproduct_context"/>
|
|
||||||
<path start="clean_software_context"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="clean_publication_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean publications context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_dataset_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean datasets Context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_otherresearchproduct_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean otherresearchproducts context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_software_context">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean softwares context</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
|
|
||||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_context"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait_clean_context" to="select_datasourceId_from_country"/>
|
|
||||||
|
|
||||||
<action name="select_datasourceId_from_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Select datasource ID from country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="fork_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<fork name="fork_clean_country">
|
|
||||||
<path start="clean_publication_country"/>
|
|
||||||
<path start="clean_dataset_country"/>
|
|
||||||
<path start="clean_otherresearchproduct_country"/>
|
|
||||||
<path start="clean_software_country"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="clean_publication_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean publication country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_dataset_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean dataset country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_otherresearchproduct_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean otherresearchproduct country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="clean_software_country">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Clean software country</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
|
|
||||||
<arg>--country</arg><arg>${country}</arg>
|
|
||||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
|
||||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
|
||||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_country"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait_clean_country" to="should_patch_datasource_ids"/>
|
|
||||||
|
|
||||||
<decision name="should_patch_datasource_ids">
|
|
||||||
<switch>
|
|
||||||
<case to="get_ds_master_duplicate">${wf:conf('shouldClean') eq true}</case>
|
|
||||||
<default to="End"/>
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="get_ds_master_duplicate">
|
|
||||||
<java>
|
|
||||||
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
|
|
||||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
|
||||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
|
||||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="fork_patch_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<fork name="fork_patch_cfhb">
|
|
||||||
<path start="patch_publication_cfhb"/>
|
|
||||||
<path start="patch_dataset_cfhb"/>
|
|
||||||
<path start="patch_otherresearchproduct_cfhb"/>
|
|
||||||
<path start="patch_software_cfhb"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="patch_publication_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch publication cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/publication</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/publication</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="patch_dataset_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch dataset cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/dataset</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/dataset</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="patch_otherresearchproduct_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch otherresearchproduct cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/otherresearchproduct</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="patch_software_cfhb">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>patch software cfhb</name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
|
||||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/software</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/software</arg>
|
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait_clean_cfhb"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait_clean_cfhb" to="fork_copy_cfhb_patched_results"/>
|
|
||||||
|
|
||||||
<fork name="fork_copy_cfhb_patched_results">
|
|
||||||
<path start="copy_cfhb_patched_publication"/>
|
|
||||||
<path start="copy_cfhb_patched_dataset"/>
|
|
||||||
<path start="copy_cfhb_patched_otherresearchproduct"/>
|
|
||||||
<path start="copy_cfhb_patched_software"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_publication">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/publication"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/publication</arg>
|
|
||||||
<arg>${graphOutputPath}/publication</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_dataset">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/dataset"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/dataset</arg>
|
|
||||||
<arg>${graphOutputPath}/dataset</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_otherresearchproduct">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/otherresearchproduct"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
|
|
||||||
<arg>${graphOutputPath}/otherresearchproduct</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_cfhb_patched_software">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<prepare>
|
|
||||||
<delete path="${graphOutputPath}/software"/>
|
|
||||||
</prepare>
|
|
||||||
<arg>${workingDir}/cfHbPatched/software</arg>
|
|
||||||
<arg>${graphOutputPath}/software</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="copy_wait" to="End"/>
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
|
|
|
@ -28,5 +28,53 @@
|
||||||
"paramLongName": "graphTableClassName",
|
"paramLongName": "graphTableClassName",
|
||||||
"paramDescription": "class name moelling the graph table",
|
"paramDescription": "class name moelling the graph table",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "ci",
|
||||||
|
"paramLongName": "contextId",
|
||||||
|
"paramDescription": "the id of the context to be removed",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "vf",
|
||||||
|
"paramLongName": "verifyParam",
|
||||||
|
"paramDescription": "the parameter to be verified to remove the context",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "c",
|
||||||
|
"paramLongName": "country",
|
||||||
|
"paramDescription": "the id of the context to be removed",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "vfc",
|
||||||
|
"paramLongName": "verifyCountryParam",
|
||||||
|
"paramDescription": "the parameter to be verified to remove the country",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "cf",
|
||||||
|
"paramLongName": "collectedfrom",
|
||||||
|
"paramDescription": "the collectedfrom value for which we should apply the cleaning",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "hb",
|
||||||
|
"paramLongName": "hostedBy",
|
||||||
|
"paramDescription": "the set of datasources having the specified country in the graph searched for in the hostedby of the results",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "md",
|
||||||
|
"paramLongName": "masterDuplicatePath",
|
||||||
|
"paramDescription": "path to the file on HDFS holding the datasource id tuples [master, duplicate]",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "dc",
|
||||||
|
"paramLongName": "deepClean",
|
||||||
|
"paramDescription": "flag to activate further cleaning steps",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,289 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.AfterAll;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class CleanContextTest {
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path workingDir;
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException {
|
|
||||||
workingDir = Files.createTempDirectory(CleanContextTest.class.getSimpleName());
|
|
||||||
log.info("using work dir {}", workingDir);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(CleanContextTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
|
||||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(CleanContextTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testResultClean() throws Exception {
|
|
||||||
final String sourcePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json")
|
|
||||||
.getPath();
|
|
||||||
final String prefix = "gcube ";
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(sourcePath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
|
|
||||||
Encoders.bean(Publication.class))
|
|
||||||
.write()
|
|
||||||
.json(workingDir.toString() + "/publication");
|
|
||||||
|
|
||||||
CleanContextSparkJob.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", workingDir.toString() + "/publication",
|
|
||||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
|
||||||
"--workingDir", workingDir.toString() + "/working",
|
|
||||||
"--contextId", "sobigdata",
|
|
||||||
"--verifyParam", "gCube "
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
JavaRDD<Publication> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/publication")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(7, tmp.count());
|
|
||||||
|
|
||||||
// original result with sobigdata context and gcube as starting string in the main title for the publication
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
0,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with sobigdata context without gcube as starting string in the main title for the publication
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"sobigdata::projects::2",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
|
|
||||||
// original result with sobigdata context with gcube as starting string in the subtitle
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"sobigdata::projects::2",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
List<StructuredProperty> titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(1, titles.size());
|
|
||||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
|
||||||
Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
|
|
||||||
|
|
||||||
// original result with sobigdata context with gcube not as starting string in the main title
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"sobigdata::projects::1",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(1, titles.size());
|
|
||||||
Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
|
||||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
|
|
||||||
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
|
||||||
|
|
||||||
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
|
|
||||||
// title
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"dh-ch",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(1, titles.size());
|
|
||||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
|
||||||
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
|
||||||
|
|
||||||
// original result with multiple main title one of which whith gcube as starting string and with 2 contextes
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"dh-ch",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(2, titles.size());
|
|
||||||
Assertions
|
|
||||||
.assertTrue(
|
|
||||||
titles
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
t -> t.getQualifier().getClassid().equals("main title")
|
|
||||||
&& t.getValue().toLowerCase().startsWith(prefix)));
|
|
||||||
|
|
||||||
// original result without sobigdata in context with gcube as starting string for the main title
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.size());
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"dh-ch",
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getContext()
|
|
||||||
.get(0)
|
|
||||||
.getId());
|
|
||||||
titles = tmp
|
|
||||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getTitle();
|
|
||||||
Assertions.assertEquals(2, titles.size());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertTrue(
|
|
||||||
titles
|
|
||||||
.stream()
|
|
||||||
.anyMatch(
|
|
||||||
t -> t.getQualifier().getClassid().equals("main title")
|
|
||||||
&& t.getValue().toLowerCase().startsWith(prefix)));
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,190 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.AfterAll;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author miriam.baglioni
|
|
||||||
* @Date 20/07/22
|
|
||||||
*/
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
|
|
||||||
public class CleanCountryTest {
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path workingDir;
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException {
|
|
||||||
workingDir = Files.createTempDirectory(CleanCountryTest.class.getSimpleName());
|
|
||||||
log.info("using work dir {}", workingDir);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(CleanCountryTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
|
||||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(CleanCountryTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testResultClean() throws Exception {
|
|
||||||
final String sourcePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_country.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(sourcePath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
|
|
||||||
Encoders.bean(Publication.class))
|
|
||||||
.write()
|
|
||||||
.json(workingDir.toString() + "/publication");
|
|
||||||
|
|
||||||
CleanCountrySparkJob.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", workingDir.toString() + "/publication",
|
|
||||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
|
||||||
"--workingDir", workingDir.toString() + "/working",
|
|
||||||
"--country", "NL",
|
|
||||||
"--verifyParam", "10.17632",
|
|
||||||
"--collectedfrom", "NARCIS",
|
|
||||||
"--hostedBy", getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
|
||||||
.getPath()
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
JavaRDD<Publication> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/publication")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(8, tmp.count());
|
|
||||||
|
|
||||||
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with NL country and pid not starting with Mendely prefix
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
|
|
||||||
// inserted with propagation
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
|
|
||||||
// propagation
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
0,
|
|
||||||
tmp
|
|
||||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6ag"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getCountry()
|
|
||||||
.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testDatasetClean() throws Exception {
|
|
||||||
final String sourcePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.textFile(sourcePath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<String, Dataset>) r -> OBJECT_MAPPER.readValue(r, Dataset.class),
|
|
||||||
Encoders.bean(Dataset.class))
|
|
||||||
.write()
|
|
||||||
.json(workingDir.toString() + "/dataset");
|
|
||||||
|
|
||||||
CleanCountrySparkJob.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", workingDir.toString() + "/dataset",
|
|
||||||
"-graphTableClassName", Dataset.class.getCanonicalName(),
|
|
||||||
"-workingDir", workingDir.toString() + "/working",
|
|
||||||
"-country", "NL",
|
|
||||||
"-verifyParam", "10.17632",
|
|
||||||
"-collectedfrom", "NARCIS",
|
|
||||||
"-hostedBy", getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
|
||||||
.getPath()
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
JavaRDD<Dataset> tmp = sc
|
|
||||||
.textFile(workingDir.toString() + "/dataset")
|
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
|
||||||
|
|
||||||
Assertions.assertEquals(1, tmp.count());
|
|
||||||
|
|
||||||
Assertions.assertEquals(0, tmp.first().getCountry().size());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,924 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.io.filefilter.FalseFileFilter;
|
||||||
|
import org.apache.commons.io.filefilter.TrueFileFilter;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
public class CleanGraphSparkJobTest {
|
||||||
|
|
||||||
|
// Logger of this test class.
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJobTest.class);
|
||||||
|
|
||||||
|
// JSON mapper configured not to fail on properties that are unknown to the target class.
public static final ObjectMapper MAPPER = new ObjectMapper()
|
||||||
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
|
// Mockito mock of the lookup service; its answers are stubbed in setUp().
@Mock
|
||||||
|
private ISLookUpService isLookUpService;
|
||||||
|
|
||||||
|
// Vocabularies loaded from the mocked lookup service before each test.
private VocabularyGroup vocabularies;
|
||||||
|
|
||||||
|
// Cleaning rules created from the vocabularies before each test.
private CleaningRuleMap mapping;
|
||||||
|
|
||||||
|
// Spark session shared by all tests; created in beforeAll() and stopped in afterAll().
private static SparkSession spark;
|
||||||
|
|
||||||
|
// Temporary directory holding all test data; created in beforeAll() and deleted in afterAll().
private static Path testBaseTmpPath;
|
||||||
|
|
||||||
|
// <testBaseTmpPath>/input/graph: copy of the graph fixtures, one sub-directory per type.
private static String graphInputPath;
|
||||||
|
|
||||||
|
// <testBaseTmpPath>/output/graph: passed to the cleaning job as output location.
private static String graphOutputPath;
|
||||||
|
|
||||||
|
// <testBaseTmpPath>/workingDir/masterduplicate: directory holding the masterduplicate.json fixture.
private static String dsMasterDuplicatePath;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void beforeAll() throws IOException, URISyntaxException {
|
||||||
|
testBaseTmpPath = Files.createTempDirectory(CleanGraphSparkJobTest.class.getSimpleName());
|
||||||
|
log.info("using test base path {}", testBaseTmpPath);
|
||||||
|
|
||||||
|
File basePath = Paths
|
||||||
|
.get(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
CleanGraphSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/graph"))
|
||||||
|
.toURI())
|
||||||
|
.toFile();
|
||||||
|
|
||||||
|
List<File> paths = FileUtils
|
||||||
|
.listFilesAndDirs(basePath, FalseFileFilter.FALSE, TrueFileFilter.TRUE)
|
||||||
|
.stream()
|
||||||
|
.filter(f -> !f.getAbsolutePath().endsWith("/graph"))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
for (File path : paths) {
|
||||||
|
String type = StringUtils.substringAfterLast(path.getAbsolutePath(), "/");
|
||||||
|
FileUtils
|
||||||
|
.copyDirectory(
|
||||||
|
path,
|
||||||
|
testBaseTmpPath.resolve("input").resolve("graph").resolve(type).toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
FileUtils
|
||||||
|
.copyFileToDirectory(
|
||||||
|
Paths
|
||||||
|
.get(
|
||||||
|
CleanGraphSparkJobTest.class
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
|
||||||
|
.toURI())
|
||||||
|
.toFile(),
|
||||||
|
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
|
||||||
|
|
||||||
|
graphInputPath = testBaseTmpPath.resolve("input").resolve("graph").toString();
|
||||||
|
graphOutputPath = testBaseTmpPath.resolve("output").resolve("graph").toString();
|
||||||
|
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(CleanGraphSparkJobTest.class.getSimpleName());
|
||||||
|
|
||||||
|
conf.setMaster("local[*]");
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
conf.set("hive.metastore.local", "true");
|
||||||
|
conf.set("spark.ui.enabled", "false");
|
||||||
|
conf.set("spark.sql.warehouse.dir", testBaseTmpPath.toString());
|
||||||
|
conf.set("hive.metastore.warehouse.dir", testBaseTmpPath.resolve("warehouse").toString());
|
||||||
|
|
||||||
|
spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws ISLookUpException, IOException {
|
||||||
|
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||||
|
.thenReturn(synonyms());
|
||||||
|
|
||||||
|
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
|
mapping = CleaningRuleMap.create(vocabularies);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void afterAll() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanRelations() throws Exception {
|
||||||
|
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(graphInputPath.toString() + "/relation")
|
||||||
|
.map(as(Relation.class), Encoders.bean(Relation.class))
|
||||||
|
.collectAsList()
|
||||||
|
.forEach(
|
||||||
|
r -> assertFalse(
|
||||||
|
vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass())));
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/relation",
|
||||||
|
"--outputPath", graphOutputPath + "/relation",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Relation.class.getCanonicalName(),
|
||||||
|
"--deepClean", "false",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(graphOutputPath.toString() + "/relation")
|
||||||
|
.map(as(Relation.class), Encoders.bean(Relation.class))
|
||||||
|
.collectAsList()
|
||||||
|
.forEach(r -> {
|
||||||
|
|
||||||
|
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass()));
|
||||||
|
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r.getSubRelType()));
|
||||||
|
|
||||||
|
assertEquals("iis", r.getDataInfo().getProvenanceaction().getClassid());
|
||||||
|
assertEquals("Inferred by OpenAIRE", r.getDataInfo().getProvenanceaction().getClassname());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFilter_invisible_true() throws Exception {
|
||||||
|
|
||||||
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
|
String json = IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json")));
|
||||||
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFilter_true_nothing_to_filter() throws Exception {
|
||||||
|
|
||||||
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
|
String json = IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")));
|
||||||
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFilter_missing_invisible() throws Exception {
|
||||||
|
|
||||||
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
|
String json = IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json")));
|
||||||
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning_publication() throws Exception {
|
||||||
|
|
||||||
|
final String id = "50|CSC_________::2250a70c903c6ac6e4c01438259e9375";
|
||||||
|
|
||||||
|
Publication p_in = read(spark, graphInputPath + "/publication", Publication.class)
|
||||||
|
.filter(String.format("id = '%s'", id))
|
||||||
|
.first();
|
||||||
|
|
||||||
|
assertNull(p_in.getBestaccessright());
|
||||||
|
assertTrue(p_in instanceof Result);
|
||||||
|
assertTrue(p_in instanceof Publication);
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "false",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
Publication p = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter(String.format("id = '%s'", id))
|
||||||
|
.first();
|
||||||
|
|
||||||
|
assertNull(p.getPublisher());
|
||||||
|
|
||||||
|
assertEquals("und", p.getLanguage().getClassid());
|
||||||
|
assertEquals("Undetermined", p.getLanguage().getClassname());
|
||||||
|
|
||||||
|
assertEquals("DE", p.getCountry().get(0).getClassid());
|
||||||
|
assertEquals("Germany", p.getCountry().get(0).getClassname());
|
||||||
|
|
||||||
|
assertEquals("0018", p.getInstance().get(0).getInstancetype().getClassid());
|
||||||
|
assertEquals("Annotation", p.getInstance().get(0).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("0027", p.getInstance().get(1).getInstancetype().getClassid());
|
||||||
|
assertEquals("Model", p.getInstance().get(1).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
|
||||||
|
assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("CLOSED", p.getInstance().get(0).getAccessright().getClassid());
|
||||||
|
assertEquals("Closed Access", p.getInstance().get(0).getAccessright().getClassname());
|
||||||
|
|
||||||
|
Set<String> pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES);
|
||||||
|
assertTrue(
|
||||||
|
p
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getQualifier)
|
||||||
|
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||||
|
|
||||||
|
List<Instance> poi = p.getInstance();
|
||||||
|
assertNotNull(poi);
|
||||||
|
assertEquals(3, poi.size());
|
||||||
|
|
||||||
|
final Instance poii = poi.get(0);
|
||||||
|
assertNotNull(poii);
|
||||||
|
assertNotNull(poii.getPid());
|
||||||
|
|
||||||
|
assertEquals(2, poii.getPid().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||||
|
assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
||||||
|
|
||||||
|
assertNotNull(poii.getAlternateIdentifier());
|
||||||
|
assertEquals(1, poii.getAlternateIdentifier().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
poii
|
||||||
|
.getAlternateIdentifier()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||||
|
|
||||||
|
assertEquals(3, p.getTitle().size());
|
||||||
|
|
||||||
|
List<String> titles = p
|
||||||
|
.getTitle()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getValue)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertTrue(titles.contains("omic"));
|
||||||
|
assertTrue(
|
||||||
|
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
|
||||||
|
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
|
||||||
|
|
||||||
|
assertEquals("CLOSED", p.getBestaccessright().getClassid());
|
||||||
|
assertNull(p.getPublisher());
|
||||||
|
|
||||||
|
assertEquals("1970-10-07", p.getDateofacceptance().getValue());
|
||||||
|
|
||||||
|
assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
|
||||||
|
assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
final List<Instance> pci = p.getInstance();
|
||||||
|
assertNotNull(pci);
|
||||||
|
assertEquals(3, pci.size());
|
||||||
|
|
||||||
|
final Instance pcii = pci.get(0);
|
||||||
|
assertNotNull(pcii);
|
||||||
|
assertNotNull(pcii.getPid());
|
||||||
|
|
||||||
|
assertEquals(2, pcii.getPid().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||||
|
assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
||||||
|
|
||||||
|
assertNotNull(pcii.getAlternateIdentifier());
|
||||||
|
assertEquals(1, pcii.getAlternateIdentifier().size());
|
||||||
|
assertTrue(
|
||||||
|
pcii
|
||||||
|
.getAlternateIdentifier()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||||
|
|
||||||
|
assertNotNull(p.getSubject());
|
||||||
|
|
||||||
|
List<Subject> fos_subjects = p
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
assertNotNull(fos_subjects);
|
||||||
|
assertEquals(2, fos_subjects.size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
fos_subjects
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> "0101 mathematics".equals(s.getValue()) &
|
||||||
|
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||||
|
"sysimport:crosswalk:datasetarchive"
|
||||||
|
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
fos_subjects
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||||
|
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||||
|
|
||||||
|
verify_keyword(p, "In Situ Hybridization");
|
||||||
|
verify_keyword(p, "Avicennia");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanDoiBoost() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
|
||||||
|
verifyFiltering(1, "50|doi_________::b0baa0eb88a5788f0b8815560d2a32f2");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanDoiBoost2() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
|
||||||
|
verifyFiltering(1, "50|doi_________::4972b0ca81b96b225aed8038bb965656");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyFiltering(int expectedCount, String id)
|
||||||
|
throws ISLookUpException, ClassNotFoundException, IOException, ParseException {
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "false",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
Dataset<Publication> p = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter(String.format("id = '%s'", id));
|
||||||
|
|
||||||
|
assertEquals(expectedCount, p.count());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanContext() throws Exception {
|
||||||
|
final String prefix = "gcube ";
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "true",
|
||||||
|
"--contextId", "sobigdata",
|
||||||
|
"--verifyParam", "gCube ",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
"--country", "NL",
|
||||||
|
"--verifyCountryParam", "10.17632",
|
||||||
|
"--collectedfrom", "NARCIS",
|
||||||
|
"--hostedBy", Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
|
||||||
|
.getPath()
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
Dataset<Publication> pubs = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter((FilterFunction<Publication>) p1 -> StringUtils.endsWith(p1.getId(), "_ctx"));
|
||||||
|
|
||||||
|
assertEquals(7, pubs.count());
|
||||||
|
|
||||||
|
// original result with sobigdata context and gcube as starting string in the main title for the publication
|
||||||
|
assertEquals(
|
||||||
|
0,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::0224aae28af558f21768dbc6439a_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with sobigdata context without gcube as starting string in the main title for the publication
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"sobigdata::projects::2",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
|
||||||
|
// original result with sobigdata context with gcube as starting string in the subtitle
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"sobigdata::projects::2",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
|
||||||
|
List<StructuredProperty> titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(1, titles.size());
|
||||||
|
assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||||
|
assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
|
||||||
|
|
||||||
|
// original result with sobigdata context with gcube not as starting string in the main title
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"sobigdata::projects::1",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(1, titles.size());
|
||||||
|
assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||||
|
assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
|
||||||
|
assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
||||||
|
|
||||||
|
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
|
||||||
|
// title
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"dh-ch",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4669a378a73661417182c208e6fd_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(1, titles.size());
|
||||||
|
assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||||
|
assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
||||||
|
|
||||||
|
// original result with multiple main title one of which whith gcube as starting string and with 2 contextes
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"dh-ch",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(2, titles.size());
|
||||||
|
assertTrue(
|
||||||
|
titles
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
t -> t.getQualifier().getClassid().equals("main title")
|
||||||
|
&& t.getValue().toLowerCase().startsWith(prefix)));
|
||||||
|
|
||||||
|
// original result without sobigdata in context with gcube as starting string for the main title
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.size());
|
||||||
|
assertEquals(
|
||||||
|
"dh-ch",
|
||||||
|
pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
|
||||||
|
.first()
|
||||||
|
.getContext()
|
||||||
|
.get(0)
|
||||||
|
.getId());
|
||||||
|
titles = pubs
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx"))
|
||||||
|
.first()
|
||||||
|
.getTitle();
|
||||||
|
|
||||||
|
assertEquals(2, titles.size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
titles
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
t -> t.getQualifier().getClassid().equals("main title")
|
||||||
|
&& t.getValue().toLowerCase().startsWith(prefix)));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanCfHbSparkJob() throws Exception {
|
||||||
|
|
||||||
|
final Dataset<Publication> pubs_in = read(spark, graphInputPath + "/publication", Publication.class);
|
||||||
|
final Publication p1_in = pubs_in
|
||||||
|
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", p1_in.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("Bacterial Protein Interaction Database - DUP", p1_in.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc",
|
||||||
|
p1_in.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"Bacterial Protein Interaction Database - DUP", p1_in.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
|
||||||
|
final Publication p2_in = pubs_in
|
||||||
|
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", p2_in.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("FILUR DATA - DUP", p2_in.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35",
|
||||||
|
p2_in.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals("FILUR DATA - DUP", p2_in.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", p2_in.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals("depositar - DUP", p2_in.getInstance().get(0).getHostedby().getValue());
|
||||||
|
|
||||||
|
final Publication p3_in = pubs_in
|
||||||
|
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_in.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("DANS (Data Archiving and Networked Services)", p3_in.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f",
|
||||||
|
p3_in.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_in.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_in.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_in.getInstance().get(0).getHostedby().getValue());
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "true",
|
||||||
|
"--contextId", "sobigdata",
|
||||||
|
"--verifyParam", "gCube ",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
"--country", "NL",
|
||||||
|
"--verifyCountryParam", "10.17632",
|
||||||
|
"--collectedfrom", "NARCIS",
|
||||||
|
"--hostedBy", Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
|
||||||
|
.getPath()
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
assertTrue(Files.exists(Paths.get(graphOutputPath, "publication")));
|
||||||
|
|
||||||
|
final Dataset<Publication> pubs_out = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_cfhb"));
|
||||||
|
|
||||||
|
assertEquals(3, pubs_out.count());
|
||||||
|
|
||||||
|
final Publication p1_out = pubs_out
|
||||||
|
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", p1_out.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("Bacterial Protein Interaction Database", p1_out.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce",
|
||||||
|
p1_out.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"Bacterial Protein Interaction Database", p1_out.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
|
||||||
|
final Publication p2_out = pubs_out
|
||||||
|
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", p2_out.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("FULIR Data", p2_out.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|re3data_____::fc1db64b3964826913b1e9eafe830490",
|
||||||
|
p2_out.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals("FULIR Data", p2_out.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", p2_out.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals("depositar", p2_out.getInstance().get(0).getHostedby().getValue());
|
||||||
|
|
||||||
|
final Publication p3_out = pubs_out
|
||||||
|
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'")
|
||||||
|
.first();
|
||||||
|
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_out.getCollectedfrom().get(0).getKey());
|
||||||
|
assertEquals("DANS (Data Archiving and Networked Services)", p3_out.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f",
|
||||||
|
p3_out.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_out.getInstance().get(0).getCollectedfrom().getValue());
|
||||||
|
assertEquals(
|
||||||
|
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", p3_out.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"DANS (Data Archiving and Networked Services)", p3_out.getInstance().get(0).getHostedby().getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleanCountry() throws Exception {
|
||||||
|
|
||||||
|
new CleanGraphSparkJob(
|
||||||
|
args(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json",
|
||||||
|
new String[] {
|
||||||
|
"--inputPath", graphInputPath + "/publication",
|
||||||
|
"--outputPath", graphOutputPath + "/publication",
|
||||||
|
"--isLookupUrl", "lookupurl",
|
||||||
|
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||||
|
"--deepClean", "true",
|
||||||
|
"--contextId", "sobigdata",
|
||||||
|
"--verifyParam", "gCube ",
|
||||||
|
"--masterDuplicatePath", dsMasterDuplicatePath,
|
||||||
|
"--country", "NL",
|
||||||
|
"--verifyCountryParam", "10.17632",
|
||||||
|
"--collectedfrom", "NARCIS",
|
||||||
|
"--hostedBy", Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
|
||||||
|
.getPath()
|
||||||
|
})).run(false, isLookUpService);
|
||||||
|
|
||||||
|
final Dataset<Publication> pubs_out = read(spark, graphOutputPath + "/publication", Publication.class)
|
||||||
|
.filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_country"));
|
||||||
|
|
||||||
|
assertEquals(8, pubs_out.count());
|
||||||
|
|
||||||
|
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::0224aae28af558f21768dbc6_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with NL country and pid not starting with Mendely prefix
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
|
||||||
|
// inserted with propagation
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817e_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
|
||||||
|
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
|
||||||
|
// propagation
|
||||||
|
assertEquals(
|
||||||
|
0,
|
||||||
|
pubs_out
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Publication>) p -> p
|
||||||
|
.getId()
|
||||||
|
.equals("50|DansKnawCris::3c81248c335f0aa07e06817d_country"))
|
||||||
|
.first()
|
||||||
|
.getCountry()
|
||||||
|
.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> vocs() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> synonyms() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
|
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
|
||||||
|
return spark
|
||||||
|
.read()
|
||||||
|
.textFile(path)
|
||||||
|
.map(as(clazz), Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
||||||
|
return s -> MAPPER.readValue(s, clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String classPathResourceAsString(String path) throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
|
CleanGraphSparkJobTest.class.getResourceAsStream(path)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
|
||||||
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
return parser;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void verify_keyword(Publication p_cleaned, String subject) {
|
||||||
|
Optional<Subject> s1 = p_cleaned
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> s.getValue().equals(subject))
|
||||||
|
.findFirst();
|
||||||
|
|
||||||
|
assertTrue(s1.isPresent());
|
||||||
|
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassid());
|
||||||
|
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassname());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -13,7 +13,6 @@ import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
@ -59,7 +58,7 @@ public class GraphCleaningFunctionsTest {
|
||||||
void testCleanRelations() throws Exception {
|
void testCleanRelations() throws Exception {
|
||||||
|
|
||||||
List<String> lines = IOUtils
|
List<String> lines = IOUtils
|
||||||
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json"));
|
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/graph/relation/relation.json"));
|
||||||
for (String json : lines) {
|
for (String json : lines) {
|
||||||
Relation r_in = MAPPER.readValue(json, Relation.class);
|
Relation r_in = MAPPER.readValue(json, Relation.class);
|
||||||
assertNotNull(r_in);
|
assertNotNull(r_in);
|
||||||
|
|
|
@ -1,213 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.*;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
|
|
||||||
public class CleanCfHbSparkJobTest {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJobTest.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path testBaseTmpPath;
|
|
||||||
|
|
||||||
private static String resolvedPath;
|
|
||||||
|
|
||||||
private static String graphInputPath;
|
|
||||||
|
|
||||||
private static String graphOutputPath;
|
|
||||||
|
|
||||||
private static String dsMasterDuplicatePath;
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException, URISyntaxException {
|
|
||||||
|
|
||||||
testBaseTmpPath = Files.createTempDirectory(CleanCfHbSparkJobTest.class.getSimpleName());
|
|
||||||
log.info("using test base path {}", testBaseTmpPath);
|
|
||||||
|
|
||||||
final File entitiesSources = Paths
|
|
||||||
.get(CleanCfHbSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities").toURI())
|
|
||||||
.toFile();
|
|
||||||
|
|
||||||
FileUtils
|
|
||||||
.copyDirectory(
|
|
||||||
entitiesSources,
|
|
||||||
testBaseTmpPath.resolve("input").resolve("entities").toFile());
|
|
||||||
|
|
||||||
FileUtils
|
|
||||||
.copyFileToDirectory(
|
|
||||||
Paths
|
|
||||||
.get(
|
|
||||||
CleanCfHbSparkJobTest.class
|
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
|
|
||||||
.toURI())
|
|
||||||
.toFile(),
|
|
||||||
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
|
|
||||||
|
|
||||||
graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
|
|
||||||
resolvedPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbResolved").toString();
|
|
||||||
graphOutputPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbPatched").toString();
|
|
||||||
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(CleanCfHbSparkJobTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(CleanCfHbSparkJobTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testCleanCfHbSparkJob() throws Exception {
|
|
||||||
final String outputPath = graphOutputPath + "/dataset";
|
|
||||||
final String inputPath = graphInputPath + "/dataset";
|
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Dataset> records = read(spark, inputPath, Dataset.class);
|
|
||||||
Dataset d = records
|
|
||||||
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("Bacterial Protein Interaction Database - DUP", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"Bacterial Protein Interaction Database - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("FILUR DATA - DUP", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals("FILUR DATA - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("depositar - DUP", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
CleanCfHbSparkJob
|
|
||||||
.main(
|
|
||||||
new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--inputPath", inputPath,
|
|
||||||
"--outputPath", outputPath,
|
|
||||||
"--resolvedPath", resolvedPath + "/dataset",
|
|
||||||
"--graphTableClassName", Dataset.class.getCanonicalName(),
|
|
||||||
"--masterDuplicatePath", dsMasterDuplicatePath
|
|
||||||
});
|
|
||||||
|
|
||||||
assertTrue(Files.exists(Paths.get(graphOutputPath, "dataset")));
|
|
||||||
|
|
||||||
records = read(spark, outputPath, Dataset.class);
|
|
||||||
|
|
||||||
assertEquals(3, records.count());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("Bacterial Protein Interaction Database", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals("Bacterial Protein Interaction Database", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("FULIR Data", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals("FULIR Data", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("depositar", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
|
|
||||||
d = records
|
|
||||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
|
||||||
.first();
|
|
||||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
|
||||||
assertEquals(
|
|
||||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
|
||||||
assertEquals(
|
|
||||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
|
||||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
|
|
||||||
return spark
|
|
||||||
.read()
|
|
||||||
.textFile(path)
|
|
||||||
.map(as(clazz), Encoders.bean(clazz));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
|
||||||
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
|
||||||
}
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,11 @@
|
||||||
|
<configuration>
|
||||||
|
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
||||||
|
<encoder>
|
||||||
|
<pattern>%d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n</pattern>
|
||||||
|
</encoder>
|
||||||
|
</appender>
|
||||||
|
|
||||||
|
<root level="info">
|
||||||
|
<appender-ref ref="STDOUT" />
|
||||||
|
</root>
|
||||||
|
</configuration>
|
Loading…
Reference in New Issue