new implementation for country propagation

This commit is contained in:
Miriam Baglioni 2020-04-08 10:49:09 +02:00
parent beebbcf66b
commit 2afe971816
1 changed files with 36 additions and 41 deletions

View File

@ -53,14 +53,18 @@ public class SparkCountryPropagationJob {
List<String> whitelist = Arrays.asList(parser.get("whitelist").split(";")); List<String> whitelist = Arrays.asList(parser.get("whitelist").split(";"));
List<String> allowedtypes = Arrays.asList(parser.get("allowedtypes").split(";")); List<String> allowedtypes = Arrays.asList(parser.get("allowedtypes").split(";"));
boolean writeUpdates = TRUE.equals(parser.get("writeUpdate"));
boolean saveGraph = TRUE.equals(parser.get("saveGraph"));
// datasource(spark, whitelist, outputPath, inputPath, "true".equals(parser.get("writeUpdate")),
// "true".equals(parser.get("saveGraph")), allowedtypes);
//
// }
//
//
// private static void datasource(SparkSession spark, List<String> whitelist, String outputPath, String inputPath,
// boolean writeUpdates, boolean saveGraph, List<String> allowedtypes){
datasource(spark, whitelist, outputPath, inputPath);
}
private static void datasource(SparkSession spark, List<String> whitelist, String outputPath, String inputPath){
String whitelisted = ""; String whitelisted = "";
for (String i : whitelist){ for (String i : whitelist){
whitelisted += " OR id = '" + i + "'"; whitelisted += " OR id = '" + i + "'";
@ -78,7 +82,7 @@ public class SparkCountryPropagationJob {
.map(item -> new ObjectMapper().readValue(item, Organization.class)).rdd(), Encoders.bean(Organization.class)); .map(item -> new ObjectMapper().readValue(item, Organization.class)).rdd(), Encoders.bean(Organization.class));
Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> dataset = spark.createDataset(sc.textFile(inputPath + "/dataset") Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> dataset = spark.createDataset(sc.textFile(inputPath + "/dataset")
.map(item -> new ObjectMapper().readValue(item, eu.dnetlib.dhp.schema.oaf.Dataset.class)).rdd(), .map(item -> new ObjectMapper().readValue(item, eu.dnetlib.dhp.schema.oaf.Dataset.class)).rdd(),
Encoders.bean(eu.dnetlib.dhp.schema.oaf.Dataset.class)); Encoders.bean(eu.dnetlib.dhp.schema.oaf.Dataset.class));
Dataset<OtherResearchProduct> other = spark.createDataset(sc.textFile(inputPath + "/otherresearchproduct") Dataset<OtherResearchProduct> other = spark.createDataset(sc.textFile(inputPath + "/otherresearchproduct")
@ -96,26 +100,14 @@ public class SparkCountryPropagationJob {
datasource.createOrReplaceTempView("datasource"); datasource.createOrReplaceTempView("datasource");
relation.createOrReplaceTempView("relation"); relation.createOrReplaceTempView("relation");
organization.createOrReplaceTempView("organization"); organization.createOrReplaceTempView("organization");
// String query = "SELECT source ds, target org, country.classid country " +
// "FROM ( SELECT id " +
// "FROM openaire.datasource " +
// "WHERE datasourcetype.classid = 'pubsrepository::institutional' " +
// "AND (datainfo.deletedbyinference = false " + whitelisted + ") ) d " +
// "JOIN ( SELECT source, target " +
// "FROM openaire.relation " +
// "WHERE relclass = 'provides' " +
// "AND datainfo.deletedbyinference = false ) rel " +
// "ON d.id = rel.source " +
// "JOIN (SELECT id, country " +
// "FROM openaire.organization " +
// "WHERE datainfo.deletedbyinference = false ) o " +
// "ON o.id = rel.target";
String query = "SELECT source ds, target org, country.classid country " + String query = "SELECT source ds, target org, country.classid country " +
"FROM ( SELECT id " + "FROM ( SELECT id " +
"FROM datasource " + "FROM datasource " +
"WHERE datasourcetype.classid = 'pubsrepository::institutional' " + "WHERE (datainfo.deletedbyinference = false " + whitelisted + ") " +
"AND (datainfo.deletedbyinference = false " + whitelisted + ") ) d " + getConstraintList("datasourcetype.classid = '", allowedtypes) +
// "datasourcetype.classid = 'pubsrepository::institutional' " +
// "AND (datainfo.deletedbyinference = false " + whitelisted + ") ) d " +
"JOIN ( SELECT source, target " + "JOIN ( SELECT source, target " +
"FROM relation " + "FROM relation " +
"WHERE relclass = 'provides' " + "WHERE relclass = 'provides' " +
@ -141,23 +133,27 @@ public class SparkCountryPropagationJob {
publication.createOrReplaceTempView("publication"); publication.createOrReplaceTempView("publication");
final JavaRDD<Row> toupdateresultpublication = propagateOnResult(spark, "publication"); final JavaRDD<Row> toupdateresultpublication = propagateOnResult(spark, "publication");
writeUpdates(toupdateresultsoftware, toupdateresultdataset, toupdateresultother, toupdateresultpublication, outputPath); if(writeUpdates){
writeUpdates(toupdateresultsoftware, toupdateresultdataset, toupdateresultother, toupdateresultpublication, outputPath);
}
createUpdateForSoftwareDataset(toupdateresultsoftware, inputPath, spark) if(saveGraph){
.map(s -> new ObjectMapper().writeValueAsString(s)) createUpdateForSoftwareDataset(toupdateresultsoftware, inputPath, spark)
.saveAsTextFile(outputPath + "/software"); .map(s -> new ObjectMapper().writeValueAsString(s))
.saveAsTextFile(outputPath + "/software");
createUpdateForDatasetDataset(toupdateresultdataset,inputPath,spark) createUpdateForDatasetDataset(toupdateresultdataset,inputPath,spark)
.map(d -> new ObjectMapper().writeValueAsString(d)) .map(d -> new ObjectMapper().writeValueAsString(d))
.saveAsTextFile(outputPath + "/dataset"); .saveAsTextFile(outputPath + "/dataset");
createUpdateForOtherDataset(toupdateresultother, inputPath, spark) createUpdateForOtherDataset(toupdateresultother, inputPath, spark)
.map(o -> new ObjectMapper().writeValueAsString(o)) .map(o -> new ObjectMapper().writeValueAsString(o))
.saveAsTextFile(outputPath + "/otherresearchproduct"); .saveAsTextFile(outputPath + "/otherresearchproduct");
createUpdateForPublicationDataset(toupdateresultpublication, inputPath, spark) createUpdateForPublicationDataset(toupdateresultpublication, inputPath, spark)
.map(p -> new ObjectMapper().writeValueAsString(p)) .map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath + "/publication"); .saveAsTextFile(outputPath + "/publication");
}
} }
@ -276,9 +272,9 @@ public class SparkCountryPropagationJob {
String query; String query;
query = "SELECT id, inst.collectedfrom.key cf , inst.hostedby.key hb " + query = "SELECT id, inst.collectedfrom.key cf , inst.hostedby.key hb " +
"FROM ( SELECT id, instance " + "FROM ( SELECT id, instance " +
"FROM " + table + "FROM " + table +
" WHERE datainfo.deletedbyinference = false) ds " + " WHERE datainfo.deletedbyinference = false) ds " +
"LATERAL VIEW EXPLODE(instance) i AS inst"; "LATERAL VIEW EXPLODE(instance) i AS inst";
Dataset<Row> cfhb = spark.sql(query); Dataset<Row> cfhb = spark.sql(query);
cfhb.createOrReplaceTempView("cfhb"); cfhb.createOrReplaceTempView("cfhb");
@ -333,5 +329,4 @@ public class SparkCountryPropagationJob {
} }