First attempt at dumping the pids graph
This commit is contained in:
parent
85203c16e3
commit
bef79d3bdf
|
@ -0,0 +1,28 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import java.io.Serializable;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

/**
 * Associates a result identifier with the pids of the organizations related to that result.
 * Note: the accessor pair is named getOrgPid/setOrgPid (singular) while the backing field is
 * orgPids, so bean-introspecting frameworks see the property as "orgPid".
 */
public class ResultOrganization implements Serializable {

	private String resultId;
	private List<StructuredProperty> orgPids;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String id) {
		this.resultId = id;
	}

	public List<StructuredProperty> getOrgPid() {
		return orgPids;
	}

	public void setOrgPid(List<StructuredProperty> values) {
		this.orgPids = values;
	}
}
|
|
@ -0,0 +1,43 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import java.io.Serializable;
import java.util.List;

import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;

/**
 * Carries, for one result, the pids used to build relations in the pid graph.
 * resultAllowedPids produces "hasOtherMaterialization" relations (both directions) between every
 * pair of pids in the list. authorAllowedPids produces "hasAuthor"/"isAuthorOf" relation pairs
 * between each result pid and each author pid (one inner list per author).
 */
public class ResultPidsList implements Serializable {

	private String resultId;
	private List<KeyValue> resultAllowedPids;
	private List<List<KeyValue>> authorAllowedPids;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String id) {
		this.resultId = id;
	}

	public List<KeyValue> getResultAllowedPids() {
		return resultAllowedPids;
	}

	public void setResultAllowedPids(List<KeyValue> pids) {
		this.resultAllowedPids = pids;
	}

	public List<List<KeyValue>> getAuthorAllowedPids() {
		return authorAllowedPids;
	}

	public void setAuthorAllowedPids(List<List<KeyValue>> pids) {
		this.authorAllowedPids = pids;
	}
}
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import java.io.Serializable;
import java.util.List;

/**
 * Associates a result identifier with one project: the project code and the list of funding
 * tree values of the funders supporting the project.
 */
public class ResultProject implements Serializable {

	private String resultId;
	private String code;
	private List<String> fundings;

	public String getResultId() {
		return resultId;
	}

	public void setResultId(String id) {
		this.resultId = id;
	}

	public String getCode() {
		return code;
	}

	public void setCode(String projectCode) {
		this.code = projectCode;
	}

	public List<String> getFundings() {
		return fundings;
	}

	public void setFundings(List<String> fundingTrees) {
		this.fundings = fundingTrees;
	}
}
|
|
@ -0,0 +1,80 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;

/**
 * Collects the {@link ResultPidsList} prepared for each result type (publication, dataset,
 * software, otherresearchproduct) under preparedInfoPath, unions them and saves them as a single
 * gzip-compressed JSON dataset under outputPath.
 *
 * Fixed: removed the stray {@code import javax.rmi.CORBA.Util;} (CORBA was removed from the JDK
 * in Java 11, so the import would not even compile there) and several unused imports.
 */
public class SparkCollectPreparedInfo implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkCollectPreparedInfo.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkCollectPreparedInfo.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_collectandsave.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		// when false the caller manages the SparkSession lifecycle (e.g. in tests)
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("preparedInfoPath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				collectAndSave(spark, inputPath, outputPath);
			});
	}

	/** Unions the per-result-type prepared pid lists and writes them as one dataset. */
	private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
		Utils
			.readPath(spark, inputPath + "/publication", ResultPidsList.class)
			.union(Utils.readPath(spark, inputPath + "/dataset", ResultPidsList.class))
			.union(Utils.readPath(spark, inputPath + "/software", ResultPidsList.class))
			.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", ResultPidsList.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}
}
|
|
@ -0,0 +1,87 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;
import eu.dnetlib.dhp.schema.oaf.Organization;

/**
 * Dumps the pid-graph Entity nodes for organizations: every organization pid whose classid
 * (lower-cased) is in the configured allowedOrganizationPids list becomes an Entity with id
 * "classid:value", written gzip-compressed under outputPath/organization.
 */
public class SparkDumpOrganization implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkDumpOrganization.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpOrganization.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_organization.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		// classids (expected lower-case) of the organization pids to dump
		final List<String> allowedOrgPid = new Gson().fromJson(parser.get("allowedOrganizationPids"), List.class);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				dumpPidOrgs(spark, allowedOrgPid, inputPath, outputPath);
			});
	}

	/** Emits one Entity ("classid:value") per allowed organization pid. */
	private static void dumpPidOrgs(SparkSession spark, List<String> allowedOrgPid, String inputPath,
		String outputPath) {
		Dataset<Organization> organizations = Utils.readPath(spark, inputPath, Organization.class);

		organizations.flatMap((FlatMapFunction<Organization, Entity>) o -> {
			List<Entity> ret = new ArrayList<>();
			// guard: organizations without pids would otherwise NPE here
			if (o.getPid() != null) {
				o.getPid().forEach(pid -> {
					if (allowedOrgPid.contains(pid.getQualifier().getClassid().toLowerCase())) {
						ret.add(Entity.newInstance(pid.getQualifier().getClassid() + ":" + pid.getValue()));
					}
				});
			}
			return ret.iterator();
		}, Encoders.bean(Entity.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/organization");
	}
}
|
|
@ -0,0 +1,142 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;

/**
 * Dumps the pid-graph Entity nodes for authors: collects the distinct author pids
 * (classid/value) over all result types, keeps those whose classid is in the configured
 * allowedAuthorPids list and writes one Entity ("classid:value") per pid, gzip-compressed,
 * under outputPath/author.
 *
 * Changes: the author-pid query, previously duplicated four times, is built once per result
 * table by {@link #selectAuthorPids}; the large block of commented-out code was removed.
 */
public class SparkDumpPidAuthor implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkDumpPidAuthor.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpPidAuthor.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_author.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		// author pid classids to retain (e.g. orcid)
		final List<String> allowedAuthorPids = new Gson().fromJson(parser.get("allowedAuthorPids"), List.class);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				dumpPidAuthor(spark, inputPath, outputPath, allowedAuthorPids);
			});
	}

	/**
	 * Explodes the author pids of the given temp view into (value, key=classid) rows.
	 * The same selection applies to every result table.
	 */
	private static Dataset<KeyValue> selectAuthorPids(SparkSession spark, String table) {
		return spark
			.sql(
				"SELECT DISTINCT apid.value value , apid.qualifier.classid key " +
					"FROM " + table + " " +
					"LATERAL VIEW EXPLODE (author) a as auth " +
					"LATERAL VIEW EXPLODE (auth.pid) p as apid ")
			.as(Encoders.bean(KeyValue.class));
	}

	private static void dumpPidAuthor(SparkSession spark, String inputPath, String outputPath, List<String> aap) {
		// register one temp view per result type
		Utils
			.readPath(spark, inputPath + "/publication", Publication.class)
			.createOrReplaceTempView("publication");
		Utils
			.readPath(spark, inputPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
			.createOrReplaceTempView("dataset");
		Utils
			.readPath(spark, inputPath + "/software", Software.class)
			.createOrReplaceTempView("software");
		Utils
			.readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
			.createOrReplaceTempView("other");

		Dataset<KeyValue> pids = selectAuthorPids(spark, "publication")
			.union(selectAuthorPids(spark, "dataset"))
			.union(selectAuthorPids(spark, "software"))
			.union(selectAuthorPids(spark, "other"));

		pids.createOrReplaceTempView("pids");

		spark
			.sql(
				"Select distinct key, value " +
					"FROM pids")
			.as(Encoders.bean(KeyValue.class))
			.filter((FilterFunction<KeyValue>) p -> aap.contains(p.getKey()))
			.map(
				(MapFunction<KeyValue, Entity>) pid -> Entity.newInstance(pid.getKey() + ":" + pid.getValue()),
				Encoders.bean(Entity.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/author");
	}
}
|
|
@ -0,0 +1,82 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;

/**
 * Dumps the pid-graph Entity nodes for results: reads the prepared {@link ResultPidsList}
 * records and writes one Entity ("key:value") for each result pid having both key and value
 * non-empty, gzip-compressed under outputPath/result.
 *
 * Fixed: removed unused imports (Gson, Result).
 */
public class SparkDumpPidResult implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkDumpPidResult.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpPidResult.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_result.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("preparedInfoPath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				dumpPidEntities(spark, inputPath, outputPath);
			});
	}

	/** Emits one Entity ("key:value") per result pid with non-empty key and value. */
	private static void dumpPidEntities(SparkSession spark, String inputPath, String outputPath) {
		Dataset<ResultPidsList> resultPids = Utils.readPath(spark, inputPath, ResultPidsList.class);

		resultPids.flatMap((FlatMapFunction<ResultPidsList, Entity>) r -> {
			List<Entity> ret = new ArrayList<>();
			r.getResultAllowedPids().forEach(pid -> {
				// isNoneEmpty: skip entries where either part is null or ""
				if (StringUtils.isNoneEmpty(pid.getKey(), pid.getValue()))
					ret.add(Entity.newInstance(pid.getKey() + ":" + pid.getValue()));
			});
			return ret.iterator();
		}, Encoders.bean(Entity.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/result");
	}
}
|
|
@ -0,0 +1,90 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.pidgraph.Entity;
import eu.dnetlib.dhp.schema.oaf.Project;

/**
 * Dumps the pid-graph Entity nodes for projects: for every project, one Entity per funding tree
 * (built by Utils.getEntity from the funding tree XML and the project code), written
 * gzip-compressed under outputPath/project.
 *
 * Fixed: the logger and the configuration resource were loaded via SparkDumpOrganization.class
 * (copy-paste leftover) — now they correctly reference SparkDumpProject; the bare
 * e.printStackTrace() was replaced with SLF4J logging; unused imports removed.
 */
public class SparkDumpProject implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkDumpProject.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpProject.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_project.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				dumpProjects(spark, inputPath, outputPath);
			});
	}

	/** Emits one Entity per (project, funding tree) pair; malformed funding trees are logged and skipped. */
	private static void dumpProjects(SparkSession spark, String inputPath, String outputPath) {
		Dataset<Project> projectDataset = Utils.readPath(spark, inputPath, Project.class);

		projectDataset.flatMap((FlatMapFunction<Project, Entity>) project -> {
			List<Entity> projs = new ArrayList<>();
			project.getFundingtree().forEach(fund -> {
				try {
					projs.add(Utils.getEntity(fund.getValue(), project.getCode().getValue()));
				} catch (DocumentException e) {
					// best effort, as before: report the broken funding tree and keep going
					log.error("Cannot parse funding tree for project {}", project.getCode().getValue(), e);
				}
			});
			return projs.iterator();
		}, Encoders.bean(Entity.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/project");
	}
}
|
|
@ -0,0 +1,132 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

/**
 * Dumps the result-organization relations of the pid graph: joins the hasAuthorInstitution
 * relations with the organizations, then with the prepared result pid lists, and emits a
 * hasAuthorInstitution / isAuthorInstitutionOf relation pair for every (result pid,
 * allowed organization pid) couple, gzip-compressed under outputPath/relationOrganization.
 *
 * Fixed: the SQL string concatenation was missing a space ("... pid orgPids" + "FROM ..."
 * produced the invalid token "orgPidsFROM"); the column alias now is "orgPid", matching the
 * bean property derived from ResultOrganization.getOrgPid/setOrgPid. Removed the unused
 * ModelSupport import.
 */
public class SparkDumpResultOrganizationRelation implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkDumpResultOrganizationRelation.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpResultOrganizationRelation.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_organizationrels.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String resultPidListPath = parser.get("preparedInfoPath");

		// organization pid classids to retain
		final List<String> allowedPids = new Gson().fromJson(parser.get("allowedOrganizationPids"), List.class);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				dumpResultOrganziationRelations(spark, inputPath, resultPidListPath, allowedPids, outputPath);
			});
	}

	private static void dumpResultOrganziationRelations(SparkSession spark, String inputPath, String preparedInfoPath,
		List<String> allowedPids, String outputPath) {
		Dataset<Relation> relations = Utils.readPath(spark, inputPath + "/relation", Relation.class);
		Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "/organization", Organization.class);
		Dataset<ResultPidsList> resultPid = Utils.readPath(spark, preparedInfoPath, ResultPidsList.class);

		relations.createOrReplaceTempView("relation");
		organizations.createOrReplaceTempView("organization");

		// alias "orgPid" matches the bean property of ResultOrganization (getOrgPid/setOrgPid)
		Dataset<ResultOrganization> resultOrg = spark
			.sql(
				"SELECT source resultId, pid orgPid " +
					"FROM relation r " +
					"JOIN organization o " +
					"ON r.target = o.id " +
					"WHERE r.datainfo.deletedbyinference = false " +
					"AND o.datainfo.deletedbyinference = false " +
					"AND lower(relclass) = '" + ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase() + "'")
			.as(Encoders.bean(ResultOrganization.class));

		resultOrg
			.joinWith(resultPid, resultOrg.col("resultId").equalTo(resultPid.col("resultId")), "left")
			.flatMap(
				(FlatMapFunction<Tuple2<ResultOrganization, ResultPidsList>, eu.dnetlib.dhp.schema.dump.oaf.graph.Relation>) value -> {
					List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList = new ArrayList<>();
					Optional<ResultPidsList> orel = Optional.ofNullable(value._2());
					if (orel.isPresent()) {
						// keep only the allowed organization pids, serialized as "classid:value"
						List<String> orgList = value
							._1()
							.getOrgPid()
							.stream()
							.filter(p -> allowedPids.contains(p.getQualifier().getClassid()))
							.map(pid -> pid.getQualifier().getClassid() + ":" + pid.getValue())
							.collect(Collectors.toList());
						if (!orgList.isEmpty()) {
							// pair every result pid with every organization pid (both directions)
							List<KeyValue> resList = orel.get().getResultAllowedPids();
							for (int i = 0; i < resList.size(); i++) {
								String pid = resList.get(i).getKey() + ":" + resList.get(i).getValue();
								for (int j = 0; j < orgList.size(); j++) {
									relList
										.addAll(
											Utils
												.getRelationPair(
													pid, orgList.get(j), Constants.RESULT, Constants.ORGANIZATION,
													ModelConstants.AFFILIATION, ModelConstants.HAS_AUTHOR_INSTITUTION,
													ModelConstants.IS_AUTHOR_INSTITUTION_OF));
								}
							}
						}
					}
					return relList.iterator();
				}, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/relationOrganization");
	}
}
|
|
@ -0,0 +1,129 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

/**
 * Dumps the result-project relations of the pid graph: joins the isProducedBy relations with
 * the projects, then with the prepared result pid lists, and emits an isProducedBy / produces
 * relation pair for every (result pid, project funding entity) couple, gzip-compressed under
 * outputPath/relationProject.
 *
 * Fixed: the SQL string concatenation was missing a space ("... fundings" + "FROM ..."
 * produced the invalid token "fundingsFROM"). Removed unused imports (Collectors,
 * DocumentException).
 */
public class SparkDumpResultProjectRelation implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkDumpResultProjectRelation.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpResultProjectRelation.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_projectrels.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String resultPidListPath = parser.get("preparedInfoPath");

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				dumpResultProjectRelations(spark, inputPath, resultPidListPath, outputPath);
			});
	}

	private static void dumpResultProjectRelations(SparkSession spark, String inputPath, String preparedInfoPath,
		String outputPath) {
		Dataset<Relation> relations = Utils.readPath(spark, inputPath + "/relation", Relation.class);
		Dataset<Project> projects = Utils.readPath(spark, inputPath + "/project", Project.class);
		Dataset<ResultPidsList> resultPid = Utils.readPath(spark, preparedInfoPath, ResultPidsList.class);

		relations.createOrReplaceTempView("relation");
		projects.createOrReplaceTempView("project");

		// NOTE(review): unlike the organization dump, p.datainfo.deletedbyinference is not
		// checked here — confirm this is intended
		Dataset<ResultProject> resultProj = spark
			.sql(
				"SELECT source resultId, code, fundingtree.value fundings " +
					"FROM relation r " +
					"JOIN project p " +
					"ON r.target = p.id " +
					"WHERE r.datainfo.deletedbyinference = false " +
					"AND lower(relclass) = '" + ModelConstants.IS_PRODUCED_BY.toLowerCase() + "'")
			.as(Encoders.bean(ResultProject.class));

		resultProj
			.joinWith(resultPid, resultProj.col("resultId").equalTo(resultPid.col("resultId")), "left")
			.flatMap(
				(FlatMapFunction<Tuple2<ResultProject, ResultPidsList>, eu.dnetlib.dhp.schema.dump.oaf.graph.Relation>) value -> {
					List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList = new ArrayList<>();
					Optional<ResultPidsList> orel = Optional.ofNullable(value._2());
					if (orel.isPresent()) {
						// one project entity id per funding tree of the joined project
						List<String> projList = new ArrayList<>();
						String code = value._1().getCode();
						for (String fund : value._1().getFundings()) {
							projList.add(Utils.getEntity(fund, code).getId());
						}

						// pair every result pid with every project entity (both directions)
						List<KeyValue> resList = orel.get().getResultAllowedPids();
						for (int i = 0; i < resList.size(); i++) {
							String pid = resList.get(i).getKey() + ":" + resList.get(i).getValue();
							for (int j = 0; j < projList.size(); j++) {
								relList
									.addAll(
										Utils
											.getRelationPair(
												pid, projList.get(j), Constants.RESULT, Constants.PROJECT,
												ModelConstants.OUTCOME, ModelConstants.IS_PRODUCED_BY,
												ModelConstants.PRODUCES));
							}
						}
					}
					return relList.iterator();
				}, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/relationProject");
	}
}
|
|
@ -0,0 +1,175 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.ResultProject;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
|
||||
public class SparkDumpResultRelation implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkDumpResultRelation.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkDumpResultRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump_pid/input_dump_result.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("preparedInfoPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
dumpPidRelations(spark, inputPath, outputPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static Dataset<Relation> distinctRelations(Dataset<Relation> rels) {
|
||||
return rels
|
||||
.filter(getRelationFilterFunction())
|
||||
.groupByKey(
|
||||
(MapFunction<Relation, String>) r -> String
|
||||
.join(
|
||||
r.getSource().getId(), r.getTarget().getId(), r.getReltype().getName(),
|
||||
r.getReltype().getType()),
|
||||
Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, Relation, Relation>) (key, relationIterator) -> relationIterator.next(),
|
||||
Encoders.bean(Relation.class));
|
||||
}
|
||||
|
||||
private static FilterFunction<Relation> getRelationFilterFunction() {
|
||||
return (FilterFunction<Relation>) r -> StringUtils.isNotBlank(r.getSource().getId()) ||
|
||||
StringUtils.isNotBlank(r.getTarget().getId()) ||
|
||||
StringUtils.isNotBlank(r.getReltype().getName()) ||
|
||||
StringUtils.isNotBlank(r.getReltype().getType());
|
||||
}
|
||||
|
||||
private static void dumpPidRelations(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<ResultPidsList> resultPids = Utils.readPath(spark, inputPath, ResultPidsList.class);
|
||||
|
||||
distinctRelations(resultPids.flatMap((FlatMapFunction<ResultPidsList, Relation>) r -> {
|
||||
List<Relation> ret = new ArrayList<>();
|
||||
List<KeyValue> resPids = r.getResultAllowedPids();
|
||||
List<List<KeyValue>> authPids = r.getAuthorAllowedPids();
|
||||
|
||||
for (int i = 0; i < resPids.size() - 1; i++) {
|
||||
String pid = resPids.get(i).getKey() + ":" + resPids.get(i).getValue();
|
||||
for (int j = i + 1; j < resPids.size(); j++) {
|
||||
ret
|
||||
.addAll(
|
||||
Utils
|
||||
.getRelationPair(
|
||||
pid, resPids.get(j).getKey() + ":" + resPids.get(j).getValue(),
|
||||
Constants.RESULT, Constants.RESULT, Constants.SIMILARITY,
|
||||
Constants.RESPID_RESPID_RELATION, Constants.RESPID_RESPID_RELATION));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < authPids.size() - 1; i++) {
|
||||
for (int j = i + 1; j < authPids.size(); j++) {
|
||||
ret.addAll(getAuthRelations(authPids.get(i), authPids.get(j)));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < resPids.size(); i++) {
|
||||
String pid = resPids.get(i).getKey() + ":" + resPids.get(i).getValue();
|
||||
for (int j = 0; j < authPids.size(); j++) {
|
||||
for (int k = 0; k < authPids.get(j).size(); k++) {
|
||||
ret
|
||||
.addAll(
|
||||
Utils
|
||||
.getRelationPair(
|
||||
pid,
|
||||
authPids.get(j).get(k).getKey() + ":" + authPids.get(j).get(k).getValue(),
|
||||
Constants.RESULT, Constants.AUTHOR, Constants.AUTHORSHIP,
|
||||
Constants.RES_AUTHOR_RELATION, Constants.AUTHOR_RES_RELATION));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return ret.iterator();
|
||||
}, Encoders.bean(Relation.class)))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
}
|
||||
|
||||
private static List<Relation> getAuthRelations(List<KeyValue> a1, List<KeyValue> a2) {
|
||||
List<Relation> ret = new ArrayList<>();
|
||||
if (a1.size() > 1) {
|
||||
ret.addAll(getSameAs(a1));
|
||||
}
|
||||
if (a2.size() > 1) {
|
||||
ret.addAll(getSameAs(a2));
|
||||
}
|
||||
for (int i = 0; i < a1.size(); i++) {
|
||||
String pid = a1.get(i).getKey() + ":" + a1.get(i).getValue();
|
||||
for (int j = 0; j < a2.size(); j++) {
|
||||
ret
|
||||
.addAll(
|
||||
Utils
|
||||
.getRelationPair(
|
||||
pid, a2.get(j).getKey() + ":" + a2.get(j).getValue(),
|
||||
Constants.AUTHOR, Constants.AUTHOR, Constants.AUTHORSHIP,
|
||||
Constants.AUTHOR_AUTHOR_RELATION, Constants.AUTHOR_AUTHOR_RELATION));
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static List<Relation> getSameAs(List<KeyValue> a1) {
|
||||
List<Relation> ret = new ArrayList<>();
|
||||
for (int i = 0; i < a1.size() - 1; i++) {
|
||||
ret
|
||||
.addAll(
|
||||
Utils
|
||||
.getRelationPair(
|
||||
a1.get(i).getKey() + ":" + a1.get(i).getValue(),
|
||||
a1.get(i + 1).getKey() + ":" + a1.get(i + 1).getValue(),
|
||||
Constants.AUTHOR, Constants.AUTHOR, Constants.SIMILARITY,
|
||||
Constants.SAME_AS, Constants.SAME_AS));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,127 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.pid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class SparkPrepareResultPids implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultPids.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkPrepareResultPids.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump_pid/input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
final List<String> allowedResultPid = new Gson().fromJson(parser.get("allowedResultPids"), List.class);
|
||||
final List<String> allowedAuthorPid = new Gson().fromJson(parser.get("allowedAuthorPids"), List.class);
|
||||
|
||||
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
|
||||
log.info("resultType: {}", resultType);
|
||||
|
||||
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
preparePidEntities(
|
||||
spark, inputPath, outputPath + "/" + resultType, inputClazz, allowedResultPid, allowedAuthorPid);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void preparePidEntities(SparkSession spark, String inputPath, String outputPath,
|
||||
Class<R> inputClazz, List<String> allowedResultPid,
|
||||
List<String> allowedAuthorPid) {
|
||||
|
||||
Dataset<R> result = Utils.readPath(spark, inputPath, inputClazz);
|
||||
|
||||
result.map((MapFunction<R, ResultPidsList>) res -> {
|
||||
ResultPidsList ret = new ResultPidsList();
|
||||
ret.setResultId(res.getId());
|
||||
List<KeyValue> pidList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(res.getPid())
|
||||
.ifPresent(pids -> pids.forEach(pid -> {
|
||||
if (allowedResultPid.contains(pid.getQualifier().getClassid().toLowerCase())) {
|
||||
pidList.add(KeyValue.newInstance(pid.getQualifier().getClassid(), pid.getValue()));
|
||||
}
|
||||
}));
|
||||
ret.setResultAllowedPids(pidList);
|
||||
List<List<KeyValue>> authorPidList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(res.getAuthor())
|
||||
.ifPresent(authors -> authors.forEach(a -> {
|
||||
Optional
|
||||
.ofNullable(a.getPid())
|
||||
.ifPresent(pids -> pids.forEach(p -> {
|
||||
List<KeyValue> authorPids = new ArrayList<>();
|
||||
if (allowedAuthorPid.contains(p.getQualifier().getClassid().toLowerCase())) {
|
||||
authorPids.add(KeyValue.newInstance(p.getQualifier().getClassid(), p.getValue()));
|
||||
}
|
||||
if (authorPids.size() > 0) {
|
||||
authorPidList.add(authorPids);
|
||||
}
|
||||
}));
|
||||
}));
|
||||
ret.setAuthorAllowedPids(authorPidList);
|
||||
|
||||
if (authorPidList.size() == 0 && pidList.size() == 0) {
|
||||
return null;
|
||||
}
|
||||
return ret;
|
||||
}, Encoders.bean(ResultPidsList.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue