forked from D-Net/dnet-hadoop
[Clean Context] Changed the logic a bit: added a check that the result is not hosted by a datasource of type institutional repository from NL, and a check that the country is removed only if it was added to the result via propagation.
This commit is contained in:
parent 390013a4b2
commit 62d2138806
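
Condensed, the rule this commit implements is: a country code is dropped from a result only if the result is not hosted by one of the listed datasources (the NL institutional repositories), it was collected from the configured source, one of its DOIs starts with a verifyParam prefix, and the country itself was added via propagation. A minimal, self-contained sketch of that predicate follows; the class, method and parameter names of the sketch are illustrative and not part of the commit, which operates on eu.dnetlib.dhp.schema.oaf.Result inside CleanCountrySparkJob below:

import java.util.List;

/**
 * Illustrative condensation of the rule introduced by this commit; names are
 * hypothetical, the real job works on eu.dnetlib.dhp.schema.oaf.Result.
 */
public class CleanCountryRuleSketch {

	/**
	 * Returns true when the country code must be removed from a result:
	 *  - the result is NOT hosted by any datasource in hostedByIds
	 *    (the NL institutional repositories collected by GetDatasourceFromCountry),
	 *  - the result IS collected from the configured source (e.g. NARCIS),
	 *  - one of its DOIs starts with one of the verifyParam prefixes (e.g. 10.17632),
	 *  - the country matches the configured code and was added by propagation
	 *    (dataInfo.inferenceprovenance equals "propagation").
	 */
	public static boolean removeCountry(
		List<String> resultHostedBy, List<String> hostedByIds,
		List<String> resultCollectedFrom, String collectedfrom,
		List<String> resultDois, String[] verifyParam,
		String countryClassid, String countryInferenceProvenance, String targetCountry) {

		// first guard of cleanCountry: hosted by a listed datasource, or not collected
		// from the configured source -> the result is left untouched
		if (resultHostedBy.stream().anyMatch(hostedByIds::contains)
			|| !resultCollectedFrom.contains(collectedfrom)) {
			return false;
		}

		boolean doiMatches = resultDois.stream().anyMatch(doi -> startsWithAny(doi, verifyParam));

		// mirrors toTakeCountry: only countries inserted via propagation are dropped
		boolean addedByPropagation = "propagation".equals(countryInferenceProvenance);

		return doiMatches && countryClassid.equalsIgnoreCase(targetCountry) && addedByPropagation;
	}

	private static boolean startsWithAny(String value, String[] prefixes) {
		for (String p : prefixes)
			if (value.startsWith(p))
				return true;
		return false;
	}
}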

CleanCountrySparkJob.java (package eu.dnetlib.dhp.oa.graph.clean.country) — changed regions as they read after this commit:

@@ -1,15 +1,15 @@
package eu.dnetlib.dhp.oa.graph.clean.country;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import javax.swing.text.html.Option;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;

@@ -21,128 +21,147 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author miriam.baglioni
 * @Date 20/07/22
 */
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

public class CleanCountrySparkJob implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(CleanCountrySparkJob.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				CleanContextSparkJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json"));
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("inputPath");
		log.info("inputPath: {}", inputPath);

		String workingPath = parser.get("workingPath");
		log.info("workingPath: {}", workingPath);

		String datasourcePath = parser.get("hostedBy");
		log.info("datasourcePath: {}", datasourcePath);

		String country = parser.get("country");
		log.info("country: {}", country);

		String[] verifyParam = parser.get("verifyParam").split(";");
		log.info("verifyParam: {}", verifyParam);

		String collectedfrom = parser.get("collectedfrom");
		log.info("collectedfrom: {}", collectedfrom);

		String graphTableClassName = parser.get("graphTableClassName");
		log.info("graphTableClassName: {}", graphTableClassName);

		Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {

				cleanCountry(
					spark, country, verifyParam, inputPath, entityClazz, workingPath, collectedfrom, datasourcePath);
			});
	}

	private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
		String inputPath, Class<T> entityClazz, String workingPath, String collectedfrom, String datasourcePath) {

		List<String> hostedBy = spark
			.read()
			.textFile(datasourcePath)
			// .filter((FilterFunction<String>) ds -> !ds.equals(collectedfrom))
			.collectAsList();

		Dataset<T> res = spark
			.read()
			.textFile(inputPath)
			.map(
				(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
				Encoders.bean(entityClazz));

		res.map((MapFunction<T, T>) r -> {
			if (r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
				!r.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
				return r;
			}

			if (r
				.getPid()
				.stream()
				.anyMatch(p -> p.getQualifier().getClassid().equals("doi") && pidInParam(p.getValue(), verifyParam))) {
				r
					.setCountry(
						r
							.getCountry()
							.stream()
							.filter(
								c -> toTakeCountry(c, country))
							.collect(Collectors.toList()));

			}

			return r;
		}, Encoders.bean(entityClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingPath);

		spark
			.read()
			.textFile(workingPath)
			.map(
				(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
				Encoders.bean(entityClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(inputPath);
	}

	private static boolean pidInParam(String value, String[] verifyParam) {
		for (String s : verifyParam)
			if (value.startsWith(s))
				return true;
		return false;
	}

	private static boolean toTakeCountry(Country c, String country) {
		// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
		// inserted via propagation
		if (!Optional.ofNullable(c.getDataInfo()).isPresent())
			return true;
		if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
			return true;
		return !(c
			.getClassid()
			.equalsIgnoreCase(country) &&
			c.getDataInfo().getInferenceprovenance().equals("propagation"));
	}

}

GetDatasourceFromCountry.java (package eu.dnetlib.dhp.oa.graph.clean.country) — changed regions as they read after this commit:

@@ -1,10 +1,12 @@
package eu.dnetlib.dhp.oa.graph.clean.country;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;

@@ -15,85 +17,92 @@ import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 22/07/22
 */
public class GetDatasourceFromCountry implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(GetDatasourceFromCountry.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				GetDatasourceFromCountry.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/input_datasource_country_parameters.json"));
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("inputPath");
		log.info("inputPath: {}", inputPath);

		String workingPath = parser.get("workingPath");
		log.info("workingPath: {}", workingPath);

		String country = parser.get("country");
		log.info("country: {}", country);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {

				getDatasourceFromCountry(spark, country, inputPath, workingPath);
			});
	}

	private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath,
		String workingPath) {

		Dataset<Organization> organization = spark
			.read()
			.textFile(inputPath + "/organization")
			.map(
				(MapFunction<String, Organization>) value -> OBJECT_MAPPER.readValue(value, Organization.class),
				Encoders.bean(Organization.class))
			.filter(
				(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
					o.getCountry().getClassid().length() > 0 &&
					o.getCountry().getClassid().equals(country));

		// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
		Dataset<Relation> relation = spark
			.read()
			.textFile(inputPath + "/relation")
			.map(
				(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
				Encoders.bean(Relation.class))
			.filter(
				(FilterFunction<Relation>) rel -> rel.getRelClass().equalsIgnoreCase(ModelConstants.IS_PROVIDED_BY) &&
					!rel.getDataInfo().getDeletedbyinference());

		organization
			.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
			.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getSource(), Encoders.STRING())
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingPath);

	}
}
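
The id list that GetDatasourceFromCountry writes to workingPath is the input CleanCountrySparkJob reads through its hostedBy parameter. A minimal local-mode sketch of chaining the two jobs follows; the paths are placeholders, the CleanCountrySparkJob flags are the ones used by the test below, and the flag spellings for GetDatasourceFromCountry are assumed to mirror its parser keys:

import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry;
import eu.dnetlib.dhp.schema.oaf.Publication;

/** Hypothetical local-mode run chaining the two jobs; paths are placeholders. */
public class CleanCountryLocalRun {
	public static void main(String[] args) throws Exception {
		// 1) collect the ids of the datasources provided by NL organizations
		//    (flag names assumed from the parser.get(...) keys above)
		GetDatasourceFromCountry.main(new String[] {
			"--isSparkSessionManaged", Boolean.FALSE.toString(),
			"--inputPath", "/tmp/graph",          // must contain /organization and /relation
			"--workingPath", "/tmp/hostedby_nl",  // output: datasource ids
			"--country", "NL"
		});

		// 2) remove the propagated NL country from publications, skipping results
		//    hosted by any datasource listed in /tmp/hostedby_nl
		CleanCountrySparkJob.main(new String[] {
			"--isSparkSessionManaged", Boolean.FALSE.toString(),
			"--inputPath", "/tmp/graph/publication",
			"-graphTableClassName", Publication.class.getCanonicalName(),
			"-workingPath", "/tmp/working",
			"-country", "NL",
			"-verifyParam", "10.17632",
			"-collectedfrom", "NARCIS",
			"-hostedBy", "/tmp/hostedby_nl"
		});
	}
}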

CleanCountryTest.java (package eu.dnetlib.dhp.oa.graph.clean) — changed regions as they read after this commit:

@@ -1,13 +1,10 @@
package eu.dnetlib.dhp.oa.graph.clean;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -22,113 +19,132 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author miriam.baglioni
 * @Date 20/07/22
 */
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.schema.oaf.Publication;

public class CleanCountryTest {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(CleanCountryTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(CleanCountryTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(CleanCountryTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void testResultClean() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_country.json")
			.getPath();

		spark
			.read()
			.textFile(sourcePath)
			.map(
				(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
				Encoders.bean(Publication.class))
			.write()
			.json(workingDir.toString() + "/publication");

		CleanCountrySparkJob.main(new String[] {
			"--isSparkSessionManaged", Boolean.FALSE.toString(),
			"--inputPath", workingDir.toString() + "/publication",
			"-graphTableClassName", Publication.class.getCanonicalName(),
			"-workingPath", workingDir.toString() + "/working",
			"-country", "NL",
			"-verifyParam", "10.17632",
			"-collectedfrom", "NARCIS",
			"-hostedBy", getClass()
				.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
				.getPath()
		});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
		JavaRDD<Publication> tmp = sc
			.textFile(workingDir.toString() + "/publication")
			.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));

		Assertions.assertEquals(8, tmp.count());

		// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
		Assertions
			.assertEquals(
				1,
				tmp
					.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
					.collect()
					.get(0)
					.getCountry()
					.size());

		// original result with NL country and pid not starting with Mendely prefix
		Assertions
			.assertEquals(
				1,
				tmp
					.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
					.collect()
					.get(0)
					.getCountry()
					.size());

		// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
		// inserted with propagation
		Assertions
			.assertEquals(
				1,
				tmp
					.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
					.collect()
					.get(0)
					.getCountry()
					.size());

		// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
		// propagation
		Assertions
			.assertEquals(
				0,
				tmp
					.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6ag"))
					.collect()
					.get(0)
					.getCountry()
					.size());
	}

}
File diff suppressed because one or more lines are too long