new implementation for country propagation

This commit is contained in:
Miriam Baglioni 2020-04-08 10:49:09 +02:00
parent beebbcf66b
commit 2afe971816
1 changed file with 36 additions and 41 deletions

View File

@ -53,14 +53,18 @@ public class SparkCountryPropagationJob {
List<String> whitelist = Arrays.asList(parser.get("whitelist").split(";"));
List<String> allowedtypes = Arrays.asList(parser.get("allowedtypes").split(";"));
boolean writeUpdates = TRUE.equals(parser.get("writeUpdate"));
boolean saveGraph = TRUE.equals(parser.get("saveGraph"));
// datasource(spark, whitelist, outputPath, inputPath, "true".equals(parser.get("writeUpdate")),
// "true".equals(parser.get("saveGraph")), allowedtypes);
//
// }
//
//
// private static void datasource(SparkSession spark, List<String> whitelist, String outputPath, String inputPath,
// boolean writeUpdates, boolean saveGraph, List<String> allowedtypes){
datasource(spark, whitelist, outputPath, inputPath);
}
private static void datasource(SparkSession spark, List<String> whitelist, String outputPath, String inputPath){
String whitelisted = "";
for (String i : whitelist){
whitelisted += " OR id = '" + i + "'";
@ -96,26 +100,14 @@ public class SparkCountryPropagationJob {
datasource.createOrReplaceTempView("datasource");
relation.createOrReplaceTempView("relation");
organization.createOrReplaceTempView("organization");
// String query = "SELECT source ds, target org, country.classid country " +
// "FROM ( SELECT id " +
// "FROM openaire.datasource " +
// "WHERE datasourcetype.classid = 'pubsrepository::institutional' " +
// "AND (datainfo.deletedbyinference = false " + whitelisted + ") ) d " +
// "JOIN ( SELECT source, target " +
// "FROM openaire.relation " +
// "WHERE relclass = 'provides' " +
// "AND datainfo.deletedbyinference = false ) rel " +
// "ON d.id = rel.source " +
// "JOIN (SELECT id, country " +
// "FROM openaire.organization " +
// "WHERE datainfo.deletedbyinference = false ) o " +
// "ON o.id = rel.target";
String query = "SELECT source ds, target org, country.classid country " +
"FROM ( SELECT id " +
"FROM datasource " +
"WHERE datasourcetype.classid = 'pubsrepository::institutional' " +
"AND (datainfo.deletedbyinference = false " + whitelisted + ") ) d " +
"WHERE (datainfo.deletedbyinference = false " + whitelisted + ") " +
getConstraintList("datasourcetype.classid = '", allowedtypes) +
// "datasourcetype.classid = 'pubsrepository::institutional' " +
// "AND (datainfo.deletedbyinference = false " + whitelisted + ") ) d " +
"JOIN ( SELECT source, target " +
"FROM relation " +
"WHERE relclass = 'provides' " +
@ -141,8 +133,11 @@ public class SparkCountryPropagationJob {
publication.createOrReplaceTempView("publication");
final JavaRDD<Row> toupdateresultpublication = propagateOnResult(spark, "publication");
if(writeUpdates){
writeUpdates(toupdateresultsoftware, toupdateresultdataset, toupdateresultother, toupdateresultpublication, outputPath);
}
if(saveGraph){
createUpdateForSoftwareDataset(toupdateresultsoftware, inputPath, spark)
.map(s -> new ObjectMapper().writeValueAsString(s))
.saveAsTextFile(outputPath + "/software");
@ -158,6 +153,7 @@ public class SparkCountryPropagationJob {
createUpdateForPublicationDataset(toupdateresultpublication, inputPath, spark)
.map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath + "/publication");
}
}
@ -334,4 +330,3 @@ public class SparkCountryPropagationJob {
}