[SKG-IFv1.1] Relaxed the selection constraints to include the sh and edp datasources; added additional constraints on properties for datasources different from the two above.
This commit is contained in:
parent c7cca75345
commit 2da78f8a79
@@ -35,7 +35,7 @@ import scala.Tuple2;
  */
 public class FilterEntities implements Serializable {
     private static final Logger log = LoggerFactory.getLogger(EmitFromEntities.class);
-    public static final String DATE_FORMAT = "YYYY/mm/DD";
+    public static final String DATE_FORMAT = "yyyy-MM-dd";

     public static void main(String[] args) throws Exception {
         String jsonConfiguration = IOUtils
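A note on the pattern fix above: in java.text.SimpleDateFormat, "YYYY" denotes the week-based year, "mm" minutes, and "DD" the day of the year, so the old constant "YYYY/mm/DD" would format and parse dates incorrectly without ever throwing. A minimal standalone sketch of the difference, not part of the commit:

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class DateFormatSketch {
    public static void main(String[] args) throws ParseException {
        Date d = new SimpleDateFormat("yyyy-MM-dd").parse("2019-12-31");
        // week-based year / minutes / day-of-year: prints 2020/00/365 in most locales
        System.out.println(new SimpleDateFormat("YYYY/mm/DD").format(d));
        // calendar year / month / day-of-month: prints 2019-12-31
        System.out.println(new SimpleDateFormat("yyyy-MM-dd").format(d));
    }
}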
@@ -99,7 +99,7 @@ public class FilterEntities implements Serializable {
             });
     }

-    private static <R extends eu.dnetlib.dhp.schema.oaf.Result> boolean verifyLot1Constraints(R r) {
+    public static <R extends eu.dnetlib.dhp.schema.oaf.Result> boolean verifyLot1Constraints(R r) {
         if (r
             .getInstance()
             .stream()
@@ -159,4 +159,5 @@ public class FilterEntities implements Serializable {
         }

     }
+
 }
@@ -131,7 +131,7 @@ public class SelectLOT1Entities implements Serializable {
                 "array_contains(pid.qualifier.classid , 'ena') OR " +
                 "array_contains(pid.qualifier.classid , 'uniprot') OR " +
                 "array_contains(instance.hostedby.key ,'10|re3data_____::c4b2081b224be6b3e79d0e5e5556f631') OR " +
-                "array_contains(instance.hostedby.key, '10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4') )") // filter//
+                "array_contains(instance.hostedby.key, '10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4')") // filter//
             // type
             .selectExpr("id", "explode(instance) as instance")
             .withColumn(
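For readers unfamiliar with the predicate edited above: array_contains is a Spark SQL function that tests membership in an array column, and the OR chain whitelists specific pid class ids and hosting datasources. A self-contained sketch of the same filtering pattern; the inline table and column names here are illustrative, not from the repository:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ArrayContainsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
            .builder()
            .master("local[*]")
            .appName(ArrayContainsSketch.class.getSimpleName())
            .getOrCreate();

        // toy rows: an id plus an array of pid class ids
        Dataset<Row> df = spark
            .sql("SELECT * FROM VALUES ('r1', array('ena', 'doi')), ('r2', array('doi')) AS t(id, classids)");

        // keep rows whose array contains one of the whitelisted values,
        // mirroring the OR chain in SelectLOT1Entities; only r1 survives
        df
            .filter("array_contains(classids, 'ena') OR array_contains(classids, 'uniprot')")
            .show();

        spark.stop();
    }
}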
@@ -0,0 +1,104 @@
+
+package eu.dnetlib.dhp.oa.graph.dump.skgif;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.ForeachFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.oa.graph.dump.filterentities.FilterEntities;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+
+public class filterEntitiesTest {
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    private static SparkSession spark;
+
+    private static Path workingDir;
+
+    private static final Logger log = LoggerFactory.getLogger(filterEntitiesTest.class);
+
+    @BeforeAll
+    public static void beforeAll() throws IOException {
+        workingDir = Files.createTempDirectory(filterEntitiesTest.class.getSimpleName());
+        log.info("using work dir {}", workingDir);
+
+        SparkConf conf = new SparkConf();
+        conf.setAppName(filterEntitiesTest.class.getSimpleName());
+
+        conf.setMaster("local[*]");
+        conf.set("spark.driver.host", "localhost");
+        conf.set("hive.metastore.local", "true");
+        conf.set("spark.ui.enabled", "false");
+        conf.set("spark.sql.warehouse.dir", workingDir.toString());
+        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+        spark = SparkSession
+            .builder()
+            .appName(filterEntitiesTest.class.getSimpleName())
+            .config(conf)
+            .getOrCreate();
+    }
+
+    @AfterAll
+    public static void afterAll() throws IOException {
+        FileUtils.deleteDirectory(workingDir.toFile());
+        spark.stop();
+    }
+
+    @Test
+    public void testEmitFromResult() throws Exception {
+        final String sourcePath = getClass()
+            .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
+            .getPath();
+
+        final String workingDir = getClass()
+            .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/")
+            .getPath();
+
+        Dataset<Publication> result = Utils
+            .readPath(spark, sourcePath + "publication", Publication.class);
+
+        Dataset<Publication> filter = result
+            .filter((FilterFunction<Publication>) r -> FilterEntities.verifyLot1Constraints(r));
+
+        filter.foreach((ForeachFunction<Publication>) r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
+    }
+
+    public static boolean isValidFormat(String value) {
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
+        df.setLenient(false);
+        try {
+            System.out.println(value);
+            Date parseDate = df.parse(value);
+            df.format(parseDate);
+            return true;
+        } catch (ParseException e) {
+            return false;
+        }
+
+    }
+
+    @Test
+    public void isValidDate() {
+        System.out.println(isValidFormat("2019-03-03"));
+        System.out.println(isValidFormat("2019/03/03"));
+    }
+}
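On the isValidFormat helper above: with the pattern "yyyy-MM-dd", "2019/03/03" fails regardless of leniency, because the literal '-' never matches '/'. What setLenient(false) actually adds is rejection of impossible calendar values, which a lenient SimpleDateFormat silently rolls over. A small standalone sketch, not part of the commit:

import java.text.ParseException;
import java.text.SimpleDateFormat;

public class LenientSketch {
    public static void main(String[] args) throws ParseException {
        SimpleDateFormat lenient = new SimpleDateFormat("yyyy-MM-dd");
        SimpleDateFormat strict = new SimpleDateFormat("yyyy-MM-dd");
        strict.setLenient(false);

        // wrong separators fail in both modes: lenient.parse("2019/03/03") -> ParseException
        // impossible dates only fail in strict mode:
        System.out.println(lenient.format(lenient.parse("2019-02-30"))); // 2019-03-02 (silent roll-over)
        try {
            strict.parse("2019-02-30");
        } catch (ParseException e) {
            System.out.println("strict parse rejected 2019-02-30"); // reached
        }
    }
}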