[SKG-IFv1.1] relaxed selection constraints to include sh and edp datasources; added additional constraints on properties for datasources different from the two above

Miriam Baglioni 2024-09-24 16:38:25 +02:00
parent c7cca75345
commit 2da78f8a79
3 changed files with 108 additions and 3 deletions
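The hunks below only show fragments of the new selection logic. As a rough, self-contained sketch of the rule the commit message describes (all names here are hypothetical and do not appear in the codebase): results hosted by the sh or edp datasources are selected unconditionally, while results from any other datasource must also satisfy the additional property constraints.

import java.util.Set;
import java.util.function.Predicate;

public class SelectionRuleSketch<R> {
	private final Set<String> relaxedDatasourceIds; // ids of the sh and edp datasources (assumed to be supplied by the caller)
	private final Predicate<R> extraPropertyChecks; // the additional property constraints mentioned in the commit message

	public SelectionRuleSketch(Set<String> relaxedDatasourceIds, Predicate<R> extraPropertyChecks) {
		this.relaxedDatasourceIds = relaxedDatasourceIds;
		this.extraPropertyChecks = extraPropertyChecks;
	}

	public boolean select(R result, String hostingDatasourceId) {
		// sh/edp results are kept as-is; anything else must also pass the extra checks
		return relaxedDatasourceIds.contains(hostingDatasourceId) || extraPropertyChecks.test(result);
	}
}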

FilterEntities.java

@@ -35,7 +35,7 @@ import scala.Tuple2;
*/
public class FilterEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(EmitFromEntities.class);
-public static final String DATE_FORMAT = "YYYY/mm/DD";
+public static final String DATE_FORMAT = "yyyy-MM-dd";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
@@ -99,7 +99,7 @@ public class FilterEntities implements Serializable {
});
}
-private static <R extends eu.dnetlib.dhp.schema.oaf.Result> boolean verifyLot1Constraints(R r) {
+public static <R extends eu.dnetlib.dhp.schema.oaf.Result> boolean verifyLot1Constraints(R r) {
if (r
.getInstance()
.stream()
@@ -159,4 +159,5 @@ public class FilterEntities implements Serializable {
}
}
}
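For context on the DATE_FORMAT change above: in java.text.SimpleDateFormat, "YYYY" is the week year, "mm" means minutes and "DD" the day of the year, so the old pattern could never format or parse a calendar date correctly. A small standalone check (not part of this commit) that shows the difference:

import java.text.SimpleDateFormat;
import java.util.Date;

public class DateFormatCheck {
	public static void main(String[] args) {
		Date now = new Date();
		// old pattern: week year / current minute / day of year, not a calendar date
		System.out.println(new SimpleDateFormat("YYYY/mm/DD").format(now));
		// new pattern: calendar year - month - day of month, e.g. 2024-09-24
		System.out.println(new SimpleDateFormat("yyyy-MM-dd").format(now));
	}
}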

SelectLOT1Entities.java

@@ -131,7 +131,7 @@ public class SelectLOT1Entities implements Serializable {
"array_contains(pid.qualifier.classid , 'ena') OR " +
"array_contains(pid.qualifier.classid , 'uniprot') OR " +
"array_contains(instance.hostedby.key ,'10|re3data_____::c4b2081b224be6b3e79d0e5e5556f631') OR " +
"array_contains(instance.hostedby.key, '10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4') )") // filter//
"array_contains(instance.hostedby.key, '10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4')") // filter//
// type
.selectExpr("id", "explode(instance) as instance")
.withColumn(
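The change above drops a stray closing parenthesis from the SQL filter string. As a sketch (not part of the commit), the portion of the condition visible in this hunk could also be built with Spark's Column API, where unbalanced parentheses cannot occur:

import static org.apache.spark.sql.functions.array_contains;
import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Column;

public class Lot1FilterSketch {
	// only the conditions visible in the hunk above; the full filter in
	// SelectLOT1Entities contains further clauses that are not shown here
	public static Column visibleLot1Conditions() {
		return array_contains(col("pid.qualifier.classid"), "ena")
			.or(array_contains(col("pid.qualifier.classid"), "uniprot"))
			.or(array_contains(col("instance.hostedby.key"), "10|re3data_____::c4b2081b224be6b3e79d0e5e5556f631"))
			.or(array_contains(col("instance.hostedby.key"), "10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4"));
	}
}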

filterEntitiesTest.java

@@ -0,0 +1,104 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.filterentities.FilterEntities;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class filterEntitiesTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(filterEntitiesTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(filterEntitiesTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(filterEntitiesTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpResultTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testEmitFromResult() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
final String workingDir = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/")
.getPath();
Dataset<Publication> result = Utils
.readPath(spark, sourcePath + "publication", Publication.class);
Dataset<Publication> filter = result
.filter((FilterFunction<Publication>) r -> FilterEntities.verifyLot1Constraints(r));
filter.foreach((ForeachFunction<Publication>) r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
}
public static boolean isValidFormat(String value) {
DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
df.setLenient(false);
try {
System.out.println(value);
Date parseDate = df.parse(value);
df.format(parseDate);
return true;
} catch (ParseException e) {
return false;
}
}
@Test
public void isValidDate() {
System.out.println(isValidFormat("2019-03-03"));
System.out.println(isValidFormat("2019/03/03"));
}
}
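The isValidDate test only prints the two results. A sketch (not part of the commit) of the same check written with the JUnit 5 assertions already available on this test's classpath: the strict yyyy-MM-dd parser accepts "2019-03-03" and rejects the slash-separated "2019/03/03".

@Test
public void isValidDateAsserted() {
	// setLenient(false) plus the literal '-' in the pattern makes the slash variant fail to parse
	org.junit.jupiter.api.Assertions.assertTrue(filterEntitiesTest.isValidFormat("2019-03-03"));
	org.junit.jupiter.api.Assertions.assertFalse(filterEntitiesTest.isValidFormat("2019/03/03"));
}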