fixed NPE, moved class to generate tar
This commit is contained in:
parent
e9aca6b702
commit
24be522e7c
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.common;
|
package eu.dnetlib.dhp.oa.common;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
|
@ -15,7 +15,7 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.MakeTarArchive;
|
import eu.dnetlib.dhp.oa.common.MakeTarArchive;
|
||||||
|
|
||||||
public class MakeTar implements Serializable {
|
public class MakeTar implements Serializable {
|
||||||
|
|
||||||
|
|
|
@ -515,11 +515,17 @@ public class ResultMapper implements Serializable {
|
||||||
|
|
||||||
setCommonValue(i, instance);
|
setCommonValue(i, instance);
|
||||||
|
|
||||||
|
if (Optional.ofNullable(i.getCollectedfrom()).isPresent() &&
|
||||||
|
Optional.ofNullable(i.getCollectedfrom().getKey()).isPresent() &&
|
||||||
|
StringUtils.isNotBlank(i.getCollectedfrom().getKey()))
|
||||||
instance
|
instance
|
||||||
.setCollectedfrom(
|
.setCollectedfrom(
|
||||||
CfHbKeyValue
|
CfHbKeyValue
|
||||||
.newInstance(i.getCollectedfrom().getKey().substring(3), i.getCollectedfrom().getValue()));
|
.newInstance(i.getCollectedfrom().getKey().substring(3), i.getCollectedfrom().getValue()));
|
||||||
|
|
||||||
|
if (Optional.ofNullable(i.getHostedby()).isPresent() &&
|
||||||
|
Optional.ofNullable(i.getHostedby().getKey()).isPresent() &&
|
||||||
|
StringUtils.isNotBlank(i.getHostedby().getKey()))
|
||||||
instance
|
instance
|
||||||
.setHostedby(
|
.setHostedby(
|
||||||
CfHbKeyValue.newInstance(i.getHostedby().getKey().substring(3), i.getHostedby().getValue()));
|
CfHbKeyValue.newInstance(i.getHostedby().getKey().substring(3), i.getHostedby().getValue()));
|
||||||
|
|
|
@ -219,9 +219,9 @@ public class QueryInformationSystem {
|
||||||
}
|
}
|
||||||
if (funding.toLowerCase().contains("h2020")) {
|
if (funding.toLowerCase().contains("h2020")) {
|
||||||
nsp = "corda__h2020::";
|
nsp = "corda__h2020::";
|
||||||
} else if (funding.toLowerCase().contains("he")){
|
} else if (funding.toLowerCase().contains("he")) {
|
||||||
nsp = "corda_____he::";
|
nsp = "corda_____he::";
|
||||||
}else{
|
} else {
|
||||||
nsp = "corda_______::";
|
nsp = "corda_______::";
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -9,12 +9,9 @@ import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
|
|
||||||
import org.apache.spark.sql.types.*;
|
import org.apache.spark.sql.types.*;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -69,22 +66,35 @@ public class SparkSelectValidRelationsJob implements Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void selectValidRelation2(SparkSession spark, String inputPath, String outputPath){
|
private static void selectValidRelation2(SparkSession spark, String inputPath, String outputPath) {
|
||||||
final StructType structureSchema = new StructType().fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
|
final StructType structureSchema = new StructType()
|
||||||
|
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Row> df =spark.createDataFrame(new ArrayList<Row>(), structureSchema);
|
org.apache.spark.sql.Dataset<Row> df = spark.createDataFrame(new ArrayList<Row>(), structureSchema);
|
||||||
List<String> entities = Arrays.asList("publication", "dataset","otherresearchproduct","software","organization","project","datasource");
|
List<String> entities = Arrays
|
||||||
for(String e : entities)
|
.asList(
|
||||||
df = df.union(spark.read().schema(structureSchema).json(inputPath + "/" + e).filter("dataInfo.deletedbyinference != true"));
|
"publication", "dataset", "otherresearchproduct", "software", "organization", "project", "datasource");
|
||||||
|
for (String e : entities)
|
||||||
|
df = df
|
||||||
|
.union(
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.schema(structureSchema)
|
||||||
|
.json(inputPath + "/" + e)
|
||||||
|
.filter("dataInfo.deletedbyinference != true and dataInfo.invisible != true"));
|
||||||
|
|
||||||
org.apache.spark.sql.Dataset<Row> relations = spark.read().schema(Encoders.bean(Relation.class).schema()).json(inputPath + "/relation")
|
org.apache.spark.sql.Dataset<Row> relations = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Relation.class).schema())
|
||||||
|
.json(inputPath + "/relation")
|
||||||
.filter("dataInfo.deletedbyinference == false");
|
.filter("dataInfo.deletedbyinference == false");
|
||||||
|
|
||||||
relations.join(df, relations.col("source").equalTo(df.col("id")), "leftsemi")
|
relations
|
||||||
|
.join(df, relations.col("source").equalTo(df.col("id")), "leftsemi")
|
||||||
.join(df, relations.col("target").equalTo(df.col("id")), "leftsemi")
|
.join(df, relations.col("target").equalTo(df.col("id")), "leftsemi")
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression","gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -58,7 +58,8 @@ public class ProjectsSubsetSparkJob implements Serializable {
|
||||||
String projectListPath) {
|
String projectListPath) {
|
||||||
Dataset<String> projectList = spark.read().textFile(projectListPath);
|
Dataset<String> projectList = spark.read().textFile(projectListPath);
|
||||||
Dataset<Project> projects;
|
Dataset<Project> projects;
|
||||||
projects = Utils.readPath(spark, inputPath, Project.class)
|
projects = Utils
|
||||||
|
.readPath(spark, inputPath, Project.class)
|
||||||
.map((MapFunction<Project, Project>) p -> {
|
.map((MapFunction<Project, Project>) p -> {
|
||||||
p.setId("40|" + p.getId());
|
p.setId("40|" + p.getId());
|
||||||
return p;
|
return p;
|
||||||
|
|
Loading…
Reference in New Issue