[Clean Context] Slightly changed the logic. Added a check so that results hosted by a datasource of type institutional repository from NL are left untouched. Also added a check that the country must have been added to the result via propagation in order for it to be removed.

This commit is contained in:
Miriam Baglioni 2022-08-08 14:10:47 +02:00
parent 390013a4b2
commit 62d2138806
5 changed files with 315 additions and 270 deletions

View File

@ -1,15 +1,15 @@
package eu.dnetlib.dhp.oa.graph.clean.country;
/**
* @author miriam.baglioni
* @Date 20/07/22
*/
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import javax.swing.text.html.Option;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -21,13 +21,17 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
/**
* @author miriam.baglioni
* @Date 20/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleanCountrySparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CleanCountrySparkJob.class);
@ -56,7 +60,7 @@ public class CleanCountrySparkJob implements Serializable {
String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
String datasourcePath = parser.get("datasourcePath");
String datasourcePath = parser.get("hostedBy");
log.info("datasourcePath: {}", datasourcePath);
String country = parser.get("country");
@ -79,14 +83,17 @@ public class CleanCountrySparkJob implements Serializable {
isSparkSessionManaged,
spark -> {
cleanCountry(spark, country, verifyParam, inputPath, entityClazz, workingPath,collectedfrom, datasourcePath);
cleanCountry(
spark, country, verifyParam, inputPath, entityClazz, workingPath, collectedfrom, datasourcePath);
});
}
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
String inputPath, Class<T> entityClazz, String workingPath, String collectedfrom, String datasourcePath) {
List<String> hostedBy = spark.read().textFile(datasourcePath)
List<String> hostedBy = spark
.read()
.textFile(datasourcePath)
// .filter((FilterFunction<String>) ds -> !ds.equals(collectedfrom))
.collectAsList();
@ -95,25 +102,25 @@ public class CleanCountrySparkJob implements Serializable {
.textFile(inputPath)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz))
;
Encoders.bean(entityClazz));
res.map((MapFunction<T, T>) r -> {
if(r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
!r.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))){
if (r.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
!r.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
return r;
}
if(r.getPid().stream().anyMatch(p -> p.getQualifier().getClassid().equals("doi") && pidInParam(p.getValue(), verifyParam))
&& r.getCountry().stream().anyMatch(c -> c.getClassid().equals(country) && c.getDataInfo().getInferenceprovenance().equals("propagation")))
{ r
if (r
.getPid()
.stream()
.anyMatch(p -> p.getQualifier().getClassid().equals("doi") && pidInParam(p.getValue(), verifyParam))) {
r
.setCountry(
r
.getCountry()
.stream()
.filter(
c -> !c.getClassid()
.equalsIgnoreCase(country))
c -> toTakeCountry(c, country))
.collect(Collectors.toList()));
}
@ -138,11 +145,23 @@ public class CleanCountrySparkJob implements Serializable {
}
/**
 * Tells whether a pid value starts with at least one of the given prefixes.
 *
 * @param value the pid value to test (for instance a DOI)
 * @param verifyParam the prefixes to test the value against
 * @return {@code true} if {@code value} starts with any element of {@code verifyParam}; {@code false}
 *         otherwise, including when {@code value} or {@code verifyParam} is {@code null}
 */
private static boolean pidInParam(String value, String[] verifyParam) {
	// A missing value or a missing prefix list can never produce a match.
	if (value == null || verifyParam == null) {
		return false;
	}
	for (String prefix : verifyParam) {
		if (value.startsWith(prefix)) {
			return true;
		}
	}
	return false;
}
}
/**
 * Decides whether a country entry has to be kept on the result.
 * An entry is dropped only when its classid equals (ignoring case) the given country AND its
 * dataInfo.inferenceprovenance is exactly "propagation"; every other entry is kept.
 *
 * @param c the country entry to evaluate
 * @param country the classid of the country to be removed
 * @return {@code false} if the entry matches {@code country} and was marked with inference provenance
 *         "propagation", {@code true} otherwise
 */
private static boolean toTakeCountry(Country c, String country) {
	// If dataInfo or dataInfo.inferenceprovenance is not set, the entry cannot be attributed to
	// propagation, hence it is kept.
	if (c.getDataInfo() == null || c.getDataInfo().getInferenceprovenance() == null) {
		return true;
	}
	// An entry without classid can never match the country to remove: keep it rather than fail with a
	// NullPointerException.
	if (c.getClassid() == null) {
		return true;
	}
	return !(c.getClassid().equalsIgnoreCase(country) &&
		c.getDataInfo().getInferenceprovenance().equals("propagation"));
}
}

View File

@ -1,10 +1,12 @@
package eu.dnetlib.dhp.oa.graph.clean.country;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -15,14 +17,15 @@ import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
/**
* @author miriam.baglioni
* @Date 22/07/22
@ -67,20 +70,25 @@ public class GetDatasourceFromCountry implements Serializable {
});
}
private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath, String workingPath) {
private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath,
String workingPath) {
Dataset<Organization> organization = spark.read().textFile(inputPath + "/organization")
Dataset<Organization> organization = spark
.read()
.textFile(inputPath + "/organization")
.map(
(MapFunction<String, Organization>) value -> OBJECT_MAPPER.readValue(value, Organization.class),
Encoders.bean(Organization.class))
.filter(
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
o.getCountry().getClassid().length() > 0 &&
o.getCountry().getClassid().equals(country));;
o.getCountry().getClassid().equals(country));
;
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
Dataset<Relation> relation = spark.read().textFile( inputPath + "/relation")
Dataset<Relation> relation = spark
.read()
.textFile(inputPath + "/relation")
.map(
(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
Encoders.bean(Relation.class))
@ -88,11 +96,12 @@ public class GetDatasourceFromCountry implements Serializable {
(FilterFunction<Relation>) rel -> rel.getRelClass().equalsIgnoreCase(ModelConstants.IS_PROVIDED_BY) &&
!rel.getDataInfo().getDeletedbyinference());
organization.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getSource(), Encoders.STRING() )
organization
.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getSource(), Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(workingPath);
}

View File

@ -1,13 +1,10 @@
package eu.dnetlib.dhp.oa.graph.clean;
/**
* @author miriam.baglioni
* @Date 20/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.oa.graph.dump.DumpJobTest;
import eu.dnetlib.dhp.schema.oaf.Publication;
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -22,10 +19,14 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* @author miriam.baglioni
* @Date 20/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class CleanCountryTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -38,11 +39,11 @@ public class CleanCountryTest {
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpJobTest.class.getSimpleName());
workingDir = Files.createTempDirectory(CleanCountryTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpJobTest.class.getSimpleName());
conf.setAppName(CleanCountryTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
@ -53,7 +54,7 @@ public class CleanCountryTest {
spark = SparkSession
.builder()
.appName(DumpJobTest.class.getSimpleName())
.appName(CleanCountryTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@ -69,7 +70,6 @@ public class CleanCountryTest {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_country.json")
.getPath();
final String prefix = "gcube ";
spark
.read()
@ -87,7 +87,10 @@ public class CleanCountryTest {
"-workingPath", workingDir.toString() + "/working",
"-country", "NL",
"-verifyParam", "10.17632",
"-collectedfrom", "NARCIS"
"-collectedfrom", "NARCIS",
"-hostedBy", getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
.getPath()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -95,7 +98,7 @@ public class CleanCountryTest {
.textFile(workingDir.toString() + "/publication")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
Assertions.assertEquals(7, tmp.count());
Assertions.assertEquals(8, tmp.count());
// original result with NL country and doi starting with Mendely prefix, but not collectedfrom NARCIS
Assertions
@ -119,12 +122,25 @@ public class CleanCountryTest {
.getCountry()
.size());
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS but not
// inserted with propagation
Assertions
.assertEquals(
1,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
.collect()
.get(0)
.getCountry()
.size());
// original result with NL country and doi starting with Mendely prefix and collectedfrom NARCIS inserted with
// propagation
Assertions
.assertEquals(
0,
tmp
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6ag"))
.collect()
.get(0)
.getCountry()