master #59
|
@ -1,15 +1,15 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean.country;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 20/07/22
|
||||
*/
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.swing.text.html.Option;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
|
@ -21,13 +21,17 @@ import org.apache.spark.sql.SparkSession;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 20/07/22
|
||||
*/
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class CleanCountrySparkJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanCountrySparkJob.class);
|
||||
|
@ -56,7 +60,7 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
String datasourcePath = parser.get("datasourcePath");
|
||||
String datasourcePath = parser.get("hostedBy");
|
||||
log.info("datasourcePath: {}", datasourcePath);
|
||||
|
||||
String country = parser.get("country");
|
||||
|
@ -79,14 +83,17 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
||||
cleanCountry(spark, country, verifyParam, inputPath, entityClazz, workingPath,collectedfrom, datasourcePath);
|
||||
cleanCountry(
|
||||
spark, country, verifyParam, inputPath, entityClazz, workingPath, collectedfrom, datasourcePath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
|
||||
String inputPath, Class<T> entityClazz, String workingPath, String collectedfrom, String datasourcePath) {
|
||||
|
||||
List<String> hostedBy = spark.read().textFile(datasourcePath)
|
||||
List<String> hostedBy = spark
|
||||
.read()
|
||||
.textFile(datasourcePath)
|
||||
// .filter((FilterFunction<String>) ds -> !ds.equals(collectedfrom))
|
||||
.collectAsList();
|
||||
|
||||
|
@ -95,8 +102,7 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
.textFile(inputPath)
|
||||
.map(
|
||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
||||
Encoders.bean(entityClazz))
|
||||
;
|
||||
Encoders.bean(entityClazz));
|
||||
|
||||
res.map((MapFunction<T, T>) r -> {
|
||||
if (r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
|
||||
|
@ -104,16 +110,17 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
return r;
|
||||
}
|
||||
|
||||
if(r.getPid().stream().anyMatch(p -> p.getQualifier().getClassid().equals("doi") && pidInParam(p.getValue(), verifyParam))
|
||||
&& r.getCountry().stream().anyMatch(c -> c.getClassid().equals(country) && c.getDataInfo().getInferenceprovenance().equals("propagation")))
|
||||
{ r
|
||||
if (r
|
||||
.getPid()
|
||||
.stream()
|
||||
.anyMatch(p -> p.getQualifier().getClassid().equals("doi") && pidInParam(p.getValue(), verifyParam))) {
|
||||
r
|
||||
.setCountry(
|
||||
r
|
||||
.getCountry()
|
||||
.stream()
|
||||
.filter(
|
||||
c -> !c.getClassid()
|
||||
.equalsIgnoreCase(country))
|
||||
c -> toTakeCountry(c, country))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
}
|
||||
|
@ -144,5 +151,17 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
return false;
|
||||
}
|
||||
|
||||
private static boolean toTakeCountry(Country c, String country) {
|
||||
// If dataInfo is not set, or dataInfo.inferenceprovenance is not set or not present then it cannot be
|
||||
// inserted via propagation
|
||||
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
|
||||
return true;
|
||||
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
|
||||
return true;
|
||||
return !(c
|
||||
.getClassid()
|
||||
.equalsIgnoreCase(country) &&
|
||||
c.getDataInfo().getInferenceprovenance().equals("propagation"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean.country;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
|
@ -15,14 +17,15 @@ import org.apache.spark.sql.SaveMode;
|
|||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 22/07/22
|
||||
|
@ -67,20 +70,25 @@ public class GetDatasourceFromCountry implements Serializable {
|
|||
});
|
||||
}
|
||||
|
||||
private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath, String workingPath) {
|
||||
private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath,
|
||||
String workingPath) {
|
||||
|
||||
Dataset<Organization> organization = spark.read().textFile(inputPath + "/organization")
|
||||
Dataset<Organization> organization = spark
|
||||
.read()
|
||||
.textFile(inputPath + "/organization")
|
||||
.map(
|
||||
(MapFunction<String, Organization>) value -> OBJECT_MAPPER.readValue(value, Organization.class),
|
||||
Encoders.bean(Organization.class))
|
||||
.filter(
|
||||
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
|
||||
o.getCountry().getClassid().length() > 0 &&
|
||||
o.getCountry().getClassid().equals(country));;
|
||||
|
||||
o.getCountry().getClassid().equals(country));
|
||||
;
|
||||
|
||||
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
|
||||
Dataset<Relation> relation = spark.read().textFile( inputPath + "/relation")
|
||||
Dataset<Relation> relation = spark
|
||||
.read()
|
||||
.textFile(inputPath + "/relation")
|
||||
.map(
|
||||
(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
|
||||
Encoders.bean(Relation.class))
|
||||
|
@ -88,7 +96,8 @@ public class GetDatasourceFromCountry implements Serializable {
|
|||
(FilterFunction<Relation>) rel -> rel.getRelClass().equalsIgnoreCase(ModelConstants.IS_PROVIDED_BY) &&
|
||||
!rel.getDataInfo().getDeletedbyinference());
|
||||
|
||||
organization.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
|
||||
organization
|
||||
.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
|
||||
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getSource(), Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
|
|
@ -1,13 +1,10 @@
|
|||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 20/07/22
|
||||
*/
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.DumpJobTest;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
@ -22,10 +19,14 @@ import org.junit.jupiter.api.Test;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 20/07/22
|
||||
*/
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
||||
public class CleanCountryTest {
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
@ -38,11 +39,11 @@ public class CleanCountryTest {
|
|||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(DumpJobTest.class.getSimpleName());
|
||||
workingDir = Files.createTempDirectory(CleanCountryTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(DumpJobTest.class.getSimpleName());
|
||||
conf.setAppName(CleanCountryTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
|
@ -53,7 +54,7 @@ public class CleanCountryTest {
|
|||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(DumpJobTest.class.getSimpleName())
|
||||
.appName(CleanCountryTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
@ -69,7 +70,6 @@ public class CleanCountryTest {
|
|||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_country.json")
|
||||
.getPath();
|
||||
final String prefix = "gcube ";
|
||||
|
||||
spark
|
||||
.read()
|
||||
|
@ -87,7 +87,10 @@ public class CleanCountryTest {
|
|||
"-workingPath", workingDir.toString() + "/working",
|
||||
"-country", "NL",
|
||||
"-verifyParam", "10.17632",
|
||||
"-collectedfrom", "NARCIS"
|
||||
"-collectedfrom", "NARCIS",
|
||||
"-hostedBy", getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
||||
.getPath()
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
@ -95,7 +98,7 @@ public class CleanCountryTest {
|
|||
.textFile(workingDir.toString() + "/publication")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
|
||||
|
||||
Assertions.assertEquals(7, tmp.count());
|
||||
Assertions.assertEquals(8, tmp.count());
|
||||
|
||||
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
|
||||
Assertions
|
||||
|
@ -119,12 +122,25 @@ public class CleanCountryTest {
|
|||
.getCountry()
|
||||
.size());
|
||||
|
||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS
|
||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
|
||||
// inserted with propagation
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getCountry()
|
||||
.size());
|
||||
|
||||
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
|
||||
// propagation
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6ag"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getCountry()
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue