dump #50

Merged
claudio.atzori merged 98 commits from miriam.baglioni/dnet-hadoop:dump into master 2020-11-04 18:07:01 +01:00
18 changed files with 0 additions and 1063 deletions
Showing only changes of commit 56150d7e5e - Show all commits

View File

@ -1,62 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.ttl;
import java.io.Serializable;
import java.util.List;
public class OrganizationInfo implements Serializable {
private String name;
private String shortName;
private String country;
private String websiteUrl;
private List<Pids> pidsList;
private String id;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getShortName() {
return shortName;
}
public void setShortName(String shortName) {
this.shortName = shortName;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
public String getWebsiteUrl() {
return websiteUrl;
}
public void setWebsiteUrl(String webciteUrl) {
this.websiteUrl = webciteUrl;
}
public List<Pids> getPidsList() {
return pidsList;
}
public void setPidsList(List<Pids> pidsList) {
this.pidsList = pidsList;
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.ttl;
import java.io.Serializable;
public class Pids implements Serializable {
private String type;
private String value;
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -1,88 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.ttl;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.oaf.Organization;
import scala.Tuple2;
public class SparkPrepareOrganizationInfo implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkPrepareOrganizationInfo.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkPrepareOrganizationInfo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_organization_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
prepareOrganization(spark, inputPath, outputPath);
});
}
private static void prepareOrganization(SparkSession spark, String inputPath, String outputPath) {
Dataset<Organization> orgs = Utils.readPath(spark, inputPath, Organization.class);
orgs.createOrReplaceTempView("organization");
String query = "select country.classname country, legalname.value name, legalshortname.value shortName, websiteurl.value websiteUrl, "
+
"collect_set(named_struct(\"pidtype\", pIde.qualifier.classid, \"pid\", pIde.value)) as pidsList, id " +
"from organization " +
"lateral view explode( pid) p as pIde " +
"group by country.classname, legalname.value, legalshortname.value, websiteurl.value, id";
spark
.sql(query)
.as(Encoders.bean(OrganizationInfo.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}

View File

@ -1,96 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.pid;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;
public class DumpAuthorTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpAuthorTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpAuthorTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpAuthorTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpAuthorTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpAuthorPids() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/pid/result")
.getPath();
SparkDumpPidAuthor.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString(),
"-sourcePath", sourcePath,
"-allowedAuthorPids", "[\"orcid\", \"mag\"]"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Entity> tmp = sc
.textFile(workingDir.toString() + "/author")
.map(item -> OBJECT_MAPPER.readValue(item, Entity.class));
org.apache.spark.sql.Dataset<Entity> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Entity.class));
verificationDataset.show(false);
Assertions.assertEquals(3, verificationDataset.count());
Assertions.assertEquals(1, verificationDataset.filter("id = 'mag:fakemag'").count());
Assertions.assertEquals(2, verificationDataset.filter("substr(id,1,5) = 'orcid'").count());
}
}

View File

@ -1,161 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.pid;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;
public class DumpResultRelationTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpResultRelationTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpResultRelationTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpResultRelationTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpAuthorTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpResultRelationPids() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/pid/preparedInfoDR")
.getPath();
SparkDumpResultRelation.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString(),
"-preparedInfoPath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
org.apache.spark.sql.Dataset<Relation> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
verificationDataset.show(100, false);
Assertions.assertEquals(32, verificationDataset.count());
Assertions
.assertEquals(
1, verificationDataset
.filter(
"source.id = 'orcid:0000-0001-9317-9350' and " +
"relType.name = 'sameAs' and target.id = 'mag:fakeMag'")
.count());
Assertions
.assertEquals(
4, verificationDataset
.filter(
"source.id = 'orcid:0000-0001-9317-9350' and " +
"relType.name = 'isAuthorOf'")
.count());
Assertions
.assertEquals(
4, verificationDataset
.filter(
"target.id = 'orcid:0000-0001-9317-9350' and " +
"relType.name = 'hasAuthor'")
.count());
Assertions
.assertEquals(
1, verificationDataset
.filter(
"source.id = 'orcid:0000-0001-9317-9350' and " +
"relType.name = 'hasCoAuthor'")
.count());
Assertions
.assertEquals(
1, verificationDataset
.filter(
"source.id = 'orcid:0000-0001-9317-9350' and " +
"relType.name = 'hasCoAuthor' and target.id = 'orcid:0000-0002-1114-4216'")
.count());
Assertions
.assertEquals(
2, verificationDataset
.filter(
"source.id = 'orcid:0000-0002-1114-4216' and " +
"relType.name = 'hasCoAuthor'")
.count());
Assertions
.assertEquals(
1, verificationDataset
.filter(
"target.id = 'orcid:0000-0001-9317-9350' and " +
"relType.name = 'hasCoAuthor' and source.id = 'orcid:0000-0002-1114-4216'")
.count());
Assertions
.assertEquals(
1, verificationDataset
.filter(
"target.id = 'mag:fakeMag' and " +
"relType.name = 'hasCoAuthor' and source.id = 'orcid:0000-0002-1114-4216'")
.count());
Assertions.assertEquals(4, verificationDataset.filter("relType.name = 'hasOtherManifestation'").count());
//
// Assertions.assertEquals(1, verificationDataset.filter("id = 'mag:fakemag'").count());
//
// Assertions.assertEquals(2, verificationDataset.filter("substr(id,1,5) = 'orcid'").count());
}
}

View File

@ -1,96 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.pid;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;
public class DumpResultTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpResultTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpResultTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpAuthorTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpResultTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpResultPids() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/pid/preparedInfo")
.getPath();
SparkDumpPidResult.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString(),
"-preparedInfoPath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Entity> tmp = sc
.textFile(workingDir.toString() + "/result")
.map(item -> OBJECT_MAPPER.readValue(item, Entity.class));
org.apache.spark.sql.Dataset<Entity> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Entity.class));
Assertions.assertEquals(35, verificationDataset.count());
Assertions.assertEquals(32, verificationDataset.filter("substr(id,1,3) = 'doi'").count());
Assertions.assertEquals(1, verificationDataset.filter("substr(id,1,3) = 'pdb'").count());
Assertions.assertEquals(1, verificationDataset.filter("substr(id,1,4) = 'pmid'").count());
Assertions.assertEquals(1, verificationDataset.filter("substr(id,1,5) = 'arXiv'").count());
}
}

View File

@ -1,168 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.pid;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PreparedInfoCollectTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(PreparedInfoCollectTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PreparedInfoCollectTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PreparedInfoCollectTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(PreparedInfoCollectTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testCollectAndSave() throws Exception {
//software and otherresearchproduct associated preparedInfo files intended to be empty
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/pid/preparedInfoSplit")
.getPath();
SparkCollectPreparedInfo.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-preparedInfoPath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultPidsList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo")
.map(item -> OBJECT_MAPPER.readValue(item, ResultPidsList.class));
org.apache.spark.sql.Dataset<ResultPidsList> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(ResultPidsList.class));
Assertions.assertEquals(34, verificationDataset.count());
verificationDataset.createOrReplaceTempView("check");
Assertions
.assertEquals(
33, spark
.sql(
"Select resultId " +
"from check " +
"lateral view explode (resultAllowedPids) r as resallowed " +
" where resallowed.key = 'doi'")
.count());
Assertions
.assertEquals(
4, spark
.sql(
"SELECT pids.value " +
"FROM check " +
"LATERAL VIEW EXPLODE (authorAllowedPids) a as authallowed " +
"LATERAL VIEW EXPLODE (authallowed) a as pids " +
"WHERE pids.key = 'orcid'")
.count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT pids.value " +
"FROM check " +
"LATERAL VIEW EXPLODE (authorAllowedPids) a as authallowed " +
"LATERAL VIEW EXPLODE (authallowed) a as pids " +
"WHERE pids.key = 'mag'")
.count());
Dataset<Row> check = spark
.sql(
"Select resultId, pids.value orcid, resallowed.key pid_type, resallowed.value pid " +
"from check" +
" lateral view explode(authorAllowedPids) a as authallowed " +
" lateral view explode(authallowed) p as pids " +
" lateral view explode (resultAllowedPids) r as resallowed " +
"where pids.key = 'orcid'");
Assertions.assertEquals(6, check.count());
Assertions
.assertEquals(1, check.filter("resultId = '50|base_oa_____::1a700385222228181f20835bae60a71e'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::1a700385222228181f20835bae60a71e' " +
"and orcid = '0000-0002-1114-4216'")
.count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::1a700385222228181f20835bae60a71e' " +
"and pid = '10.1016/j.jrmge.2016.11.005'")
.count());
Assertions
.assertEquals(1, check.filter("resultId = '50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f' " +
"and orcid = '0000-0001-9317-9350'")
.count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f' " +
"and pid = '10.1016/j.jotr.2014.10.003'")
.count());
}
}

View File

@ -1,260 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.pid;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PreparedInfoPidTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(PreparedInfoPidTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PreparedInfoPidTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PreparedInfoPidTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(PreparedInfoPidTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testPublication() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/pid/result/publication")
.getPath();
SparkPrepareResultPids.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-sourcePath", sourcePath,
"-allowedResultPids", "[\"doi\",\"arxiv\",\"pmc\",\"pmid\",\"pdb\"]",
"-allowedAuthorPids", "[\"orcid\"]",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultPidsList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultPidsList.class));
org.apache.spark.sql.Dataset<ResultPidsList> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(ResultPidsList.class));
Assertions.assertEquals(31, verificationDataset.count());
verificationDataset.createOrReplaceTempView("check");
Assertions
.assertEquals(
31, spark
.sql(
"Select resultId " +
"from check " +
"lateral view explode (resultAllowedPids) r as resallowed " +
" where resallowed.key = 'doi'")
.count());
Dataset<Row> check = spark
.sql(
"Select resultId, pids.value orcid, resallowed.key pid_type, resallowed.value pid " +
"from check" +
" lateral view explode(authorAllowedPids) a as authallowed " +
" lateral view explode(authallowed) p as pids " +
" lateral view explode (resultAllowedPids) r as resallowed " +
"where pids.key = 'orcid'");
Assertions.assertEquals(2, check.count());
Assertions
.assertEquals(1, check.filter("resultId = '50|base_oa_____::1a700385222228181f20835bae60a71e'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::1a700385222228181f20835bae60a71e' " +
"and orcid = '0000-0002-1114-4216'")
.count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::1a700385222228181f20835bae60a71e' " +
"and pid = '10.1016/j.jrmge.2016.11.005'")
.count());
Assertions
.assertEquals(1, check.filter("resultId = '50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f' " +
"and orcid = '0000-0001-9317-9350'")
.count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f' " +
"and pid = '10.1016/j.jotr.2014.10.003'")
.count());
}
@Test
public void testDataset() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/pid/result/dataset")
.getPath();
SparkPrepareResultPids.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-sourcePath", sourcePath,
"-allowedResultPids", "[\"doi\",\"arxiv\",\"pmc\",\"pmid\",\"pdb\"]",
"-allowedAuthorPids", "[\"orcid\"]",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultPidsList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, ResultPidsList.class));
org.apache.spark.sql.Dataset<ResultPidsList> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(ResultPidsList.class));
Assertions.assertEquals(3, verificationDataset.count());
verificationDataset.createOrReplaceTempView("check");
Dataset<Row> check = spark
.sql(
"Select resultId, pids.value orcid, resallowed.key pid_type, resallowed.value pid " +
"from check" +
" lateral view explode(authorAllowedPids) a as authallowed " +
" lateral view explode(authallowed) p as pids " +
" lateral view explode (resultAllowedPids) r as resallowed ");
Assertions.assertEquals(4, check.count());
Assertions
.assertEquals(4, check.filter("resultId = '50|_____OmicsDI::00899d9cb1163754943a3277365adc02'").count());
Assertions
.assertEquals(
2, check
.filter(
"resultId = '50|_____OmicsDI::00899d9cb1163754943a3277365adc02' " +
"and (orcid = '0000-0001-9317-9350' or orcid = '0000-0002-1114-4216') and " +
"pid = '10.1016/fake' and pid_type = 'doi' ")
.count());
Assertions
.assertEquals(
2, check
.filter(
"resultId = '50|_____OmicsDI::00899d9cb1163754943a3277365adc02' " +
"and (orcid = '0000-0001-9317-9350' or orcid = '0000-0002-1114-4216') and " +
"pid = '10443fake' and pid_type = 'pmid' ")
.count());
check = spark
.sql(
"Select resultId, authorAllowedPids, resallowed.key pid_type, resallowed.value pid " +
"from check" +
" lateral view explode (resultAllowedPids) r as resallowed ");
Assertions.assertEquals(5, check.count());
Assertions
.assertEquals(2, check.filter("resultId = '50|_____OmicsDI::00899d9cb1163754943a3277365adc02'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|_____OmicsDI::00899d9cb1163754943a3277365adc02' " +
" and pid = '10.1016/fake' and pid_type = 'doi' ")
.count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|_____OmicsDI::00899d9cb1163754943a3277365adc02' " +
" and pid = '10443fake' and pid_type = 'pmid' ")
.count());
Assertions
.assertEquals(1, check.filter("resultId = '50|_____OmicsDI::023fd1fcbb64f0f7df0671798a62f379'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|_____OmicsDI::023fd1fcbb64f0f7df0671798a62f379' " +
" and pid = 'fakepdb' and pid_type = 'pdb' ")
.count());
Assertions
.assertEquals(2, check.filter("resultId = '50|_____OmicsDI::036d65211a6ac14237c6e2d7cc223386'").count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|_____OmicsDI::036d65211a6ac14237c6e2d7cc223386' " +
" and pid = '10.1016/j.tcs.2012.06.029' and pid_type = 'doi' ")
.count());
Assertions
.assertEquals(
1, check
.filter(
"resultId = '50|_____OmicsDI::036d65211a6ac14237c6e2d7cc223386' " +
" and pid = 'fake_arxiv' and pid_type = 'arXiv' ")
.count());
}
}

View File

@ -1,34 +0,0 @@
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1145/2069087.2069102"}],"resultId":"50|acm_________::72f7ad968fa42cfbf0d3d7b245e43477"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1145/1805974.1805977"}],"resultId":"50|acm_________::cbd6814ee33b6357c7cea7c008a72b80"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":""}],"resultId":"50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.tcs.2012.06.029"}],"resultId":"50|base_oa_____::020ca6ee0ae16e1e9c1405207087a671"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jacc.2014.06.769"}],"resultId":"50|base_oa_____::02715dd9fc09b87966f3bad613aec7f9"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0042-6822(02)00051-x"}],"resultId":"50|base_oa_____::035f21f0844d85933ada45818ab13f5d"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0022-247x(68)90176-5"}],"resultId":"50|base_oa_____::06505050e503a2e7f9167ba704fbeaac"}
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0001-9317-9350"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jotr.2014.10.003"}],"resultId":"50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0377-0427(02)00443-0"}],"resultId":"50|base_oa_____::0a4c1442ae16ccfe614b1230a1700ce5"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.phpro.2012.03.506"}],"resultId":"50|base_oa_____::0cd5cf2933ecbdee82c39799961c8b18"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0168-0072(84)90037-x"}],"resultId":"50|base_oa_____::0f7eb47f3d0e591efedcaacc0ad312aa"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0213-9111(03)71806-8"}],"resultId":"50|base_oa_____::10f021134185727435d60679a30a9705"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.joca.2015.02.572"}],"resultId":"50|base_oa_____::111ab11c5644629db0779ba0041f8e0b"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0024-3795(94)00213-w"}],"resultId":"50|base_oa_____::11366a19965f0d8ecadef4df4a7e9ccc"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.ijid.2014.03.993"}],"resultId":"50|base_oa_____::1382fc03144c27085955624f9727e699"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.virol.2006.03.025"}],"resultId":"50|base_oa_____::1757e43719aee1289faee6b2c9538d1d"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0075-9511(01)80047-5"}],"resultId":"50|base_oa_____::17da7b56aa049a6bd67d6f3b2399a8ee"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.visres.2008.12.014"}],"resultId":"50|base_oa_____::1825ff5d845c3c25cecd779583829b6b"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0120-3347(10)81002-0"}],"resultId":"50|base_oa_____::1854b6c6e615c64bb886b20e3c51f099"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s1995-7645(12)60117-5"}],"resultId":"50|base_oa_____::187f25dc2d40ec7ed34eb488236eef17"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.arabjc.2014.01.016"}],"resultId":"50|base_oa_____::1a11ffdca7bff691d9be29074694ed28"}
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0002-1114-4216"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jrmge.2016.11.005"}],"resultId":"50|base_oa_____::1a700385222228181f20835bae60a71e"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jval.2015.09.2195"}],"resultId":"50|base_oa_____::1b63c2f6ce1dc7ba9598fe6ec08affdc"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0022-0000(90)90015-d"}],"resultId":"50|base_oa_____::1be5adb74414a30ec46c74dfa20d7562"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.artint.2010.07.002"}],"resultId":"50|base_oa_____::1c62e28a5d3faf1da7717a81e05f6cf4"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jmaa.2004.08.018"}],"resultId":"50|base_oa_____::1d561e2e20c1bb6eb50b2e309ed5658f"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0166-8641(01)00188-2"}],"resultId":"50|base_oa_____::1e805d44f8076ae1c11f0093cf9679ca"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.bbmt.2007.01.017"}],"resultId":"50|base_oa_____::1e922204b8bfd042c5b9fbeb29848b53"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0741-5214(03)00428-2"}],"resultId":"50|base_oa_____::1fe2ad6be2235cd0e494baad6184efaa"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.riai.2013.09.004"}],"resultId":"50|base_oa_____::2122d7837eac59f792a6f11f85bec960"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0092-8674(00)81616-5"}],"resultId":"50|base_oa_____::23375ea29b261a29a7891b6f890460ae"}
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0001-9317-9350"},{"key":"mag","value":"fakeMag"}],[{"key":"orcid","value":"0000-0002-1114-4216"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/fake"},{"key":"pmid","value":"10443fake"}],"resultId":"50|_____OmicsDI::00899d9cb1163754943a3277365adc02"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"pdb","value":"fakepdb"}],"resultId":"50|_____OmicsDI::023fd1fcbb64f0f7df0671798a62f379"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.tcs.2012.06.029"},{"key":"arXiv","value":"fake_arxiv"}],"resultId":"50|_____OmicsDI::036d65211a6ac14237c6e2d7cc223386"}

View File

@ -1,34 +0,0 @@
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0001-9317-9350"},{"key":"mag","value":"fakeMag"}],[{"key":"orcid","value":"0000-0002-1114-4216"}]],"resultAllowedPids":[{"key":"doi","value":"10.1145/2069087.2069102"}],"resultId":"50|acm_________::72f7ad968fa42cfbf0d3d7b245e43477"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1145/1805974.1805977"}],"resultId":"50|acm_________::cbd6814ee33b6357c7cea7c008a72b80"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":""}],"resultId":"50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.tcs.2012.06.029"}],"resultId":"50|base_oa_____::020ca6ee0ae16e1e9c1405207087a671"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jacc.2014.06.769"}],"resultId":"50|base_oa_____::02715dd9fc09b87966f3bad613aec7f9"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0042-6822(02)00051-x"}],"resultId":"50|base_oa_____::035f21f0844d85933ada45818ab13f5d"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0022-247x(68)90176-5"}],"resultId":"50|base_oa_____::06505050e503a2e7f9167ba704fbeaac"}
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0001-9317-9350"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jotr.2014.10.003"}],"resultId":"50|base_oa_____::06546a1ad6b1c71e5e366ef15b9ade1f"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0377-0427(02)00443-0"}],"resultId":"50|base_oa_____::0a4c1442ae16ccfe614b1230a1700ce5"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.phpro.2012.03.506"}],"resultId":"50|base_oa_____::0cd5cf2933ecbdee82c39799961c8b18"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0168-0072(84)90037-x"}],"resultId":"50|base_oa_____::0f7eb47f3d0e591efedcaacc0ad312aa"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0213-9111(03)71806-8"}],"resultId":"50|base_oa_____::10f021134185727435d60679a30a9705"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.joca.2015.02.572"}],"resultId":"50|base_oa_____::111ab11c5644629db0779ba0041f8e0b"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0024-3795(94)00213-w"}],"resultId":"50|base_oa_____::11366a19965f0d8ecadef4df4a7e9ccc"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.ijid.2014.03.993"}],"resultId":"50|base_oa_____::1382fc03144c27085955624f9727e699"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.virol.2006.03.025"}],"resultId":"50|base_oa_____::1757e43719aee1289faee6b2c9538d1d"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0075-9511(01)80047-5"}],"resultId":"50|base_oa_____::17da7b56aa049a6bd67d6f3b2399a8ee"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.visres.2008.12.014"}],"resultId":"50|base_oa_____::1825ff5d845c3c25cecd779583829b6b"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0120-3347(10)81002-0"}],"resultId":"50|base_oa_____::1854b6c6e615c64bb886b20e3c51f099"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s1995-7645(12)60117-5"}],"resultId":"50|base_oa_____::187f25dc2d40ec7ed34eb488236eef17"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.arabjc.2014.01.016"}],"resultId":"50|base_oa_____::1a11ffdca7bff691d9be29074694ed28"}
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0002-1114-4216"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jrmge.2016.11.005"}],"resultId":"50|base_oa_____::1a700385222228181f20835bae60a71e"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jval.2015.09.2195"}],"resultId":"50|base_oa_____::1b63c2f6ce1dc7ba9598fe6ec08affdc"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/0022-0000(90)90015-d"}],"resultId":"50|base_oa_____::1be5adb74414a30ec46c74dfa20d7562"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.artint.2010.07.002"}],"resultId":"50|base_oa_____::1c62e28a5d3faf1da7717a81e05f6cf4"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.jmaa.2004.08.018"}],"resultId":"50|base_oa_____::1d561e2e20c1bb6eb50b2e309ed5658f"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0166-8641(01)00188-2"}],"resultId":"50|base_oa_____::1e805d44f8076ae1c11f0093cf9679ca"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.bbmt.2007.01.017"}],"resultId":"50|base_oa_____::1e922204b8bfd042c5b9fbeb29848b53"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0741-5214(03)00428-2"}],"resultId":"50|base_oa_____::1fe2ad6be2235cd0e494baad6184efaa"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.riai.2013.09.004"}],"resultId":"50|base_oa_____::2122d7837eac59f792a6f11f85bec960"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/s0092-8674(00)81616-5"}],"resultId":"50|base_oa_____::23375ea29b261a29a7891b6f890460ae"}
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0001-9317-9350"},{"key":"mag","value":"fakeMag"}],[{"key":"orcid","value":"0000-0002-1114-4216"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/fake"},{"key":"pmid","value":"10443fake"}],"resultId":"50|_____OmicsDI::00899d9cb1163754943a3277365adc02"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"pdb","value":"fakepdb"}],"resultId":"50|_____OmicsDI::023fd1fcbb64f0f7df0671798a62f379"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.tcs.2012.06.029"},{"key":"arXiv","value":"fake_arxiv"}],"resultId":"50|_____OmicsDI::036d65211a6ac14237c6e2d7cc223386"}

View File

@ -1,3 +0,0 @@
{"authorAllowedPids":[[{"key":"orcid","value":"0000-0001-9317-9350"},{"key":"mag","value":"fakeMag"}],[{"key":"orcid","value":"0000-0002-1114-4216"}]],"resultAllowedPids":[{"key":"doi","value":"10.1016/fake"},{"key":"pmid","value":"10443fake"}],"resultId":"50|_____OmicsDI::00899d9cb1163754943a3277365adc02"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"pdb","value":"fakepdb"}],"resultId":"50|_____OmicsDI::023fd1fcbb64f0f7df0671798a62f379"}
{"authorAllowedPids":[],"resultAllowedPids":[{"key":"doi","value":"10.1016/j.tcs.2012.06.029"},{"key":"arXiv","value":"fake_arxiv"}],"resultId":"50|_____OmicsDI::036d65211a6ac14237c6e2d7cc223386"}