forked from antonis.lempesis/dnet-hadoop
Merge pull request '8172_impact_indicators_workflow' (#284) from 8172_impact_indicators_workflow into beta
Reviewed-on: D-Net/dnet-hadoop#284
This commit is contained in:
commit
9c8b41475a
|
@ -6,13 +6,14 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
@ -24,8 +25,9 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
|
|
||||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
||||||
|
import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel;
|
||||||
|
import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
|
@ -40,7 +42,6 @@ import scala.Tuple2;
|
||||||
*/
|
*/
|
||||||
public class SparkAtomicActionScoreJob implements Serializable {
|
public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
|
|
||||||
private static final String DOI = "doi";
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
@ -56,18 +57,17 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
|
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String inputPath = parser.get("inputPath");
|
final String resultsInputPath = parser.get("resultsInputPath");
|
||||||
log.info("inputPath {}: ", inputPath);
|
log.info("resultsInputPath: {}", resultsInputPath);
|
||||||
|
|
||||||
|
final String projectsInputPath = parser.get("projectsInputPath");
|
||||||
|
log.info("projectsInputPath: {}", projectsInputPath);
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath {}: ", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
@ -76,17 +76,45 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
prepareResults(spark, inputPath, outputPath);
|
|
||||||
|
JavaPairRDD<Text, Text> resultsRDD = prepareResults(spark, resultsInputPath, outputPath);
|
||||||
|
JavaPairRDD<Text, Text> projectsRDD = prepareProjects(spark, projectsInputPath, outputPath);
|
||||||
|
|
||||||
|
resultsRDD
|
||||||
|
.union(projectsRDD)
|
||||||
|
.saveAsHadoopFile(
|
||||||
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <I extends Result> void prepareResults(SparkSession spark, String bipScorePath, String outputPath) {
|
private static <I extends Project> JavaPairRDD<Text, Text> prepareProjects(SparkSession spark, String inputPath,
|
||||||
|
String outputPath) {
|
||||||
|
|
||||||
|
// read input bip project scores
|
||||||
|
Dataset<BipProjectModel> projectScores = readPath(spark, inputPath, BipProjectModel.class);
|
||||||
|
|
||||||
|
return projectScores.map((MapFunction<BipProjectModel, Project>) bipProjectScores -> {
|
||||||
|
Project project = new Project();
|
||||||
|
project.setId(bipProjectScores.getProjectId());
|
||||||
|
project.setMeasures(bipProjectScores.toMeasures());
|
||||||
|
return project;
|
||||||
|
}, Encoders.bean(Project.class))
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(p -> new AtomicAction(Project.class, p))
|
||||||
|
.mapToPair(
|
||||||
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <I extends Result> JavaPairRDD<Text, Text> prepareResults(SparkSession spark, String bipScorePath,
|
||||||
|
String outputPath) {
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
|
JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
|
||||||
.textFile(bipScorePath)
|
.textFile(bipScorePath)
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
|
.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));
|
||||||
|
|
||||||
Dataset<BipScore> bipScores = spark
|
Dataset<BipScore> bipScores = spark
|
||||||
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
|
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
|
||||||
|
@ -96,9 +124,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
return bs;
|
return bs;
|
||||||
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
|
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
|
||||||
|
|
||||||
bipScores
|
return bipScores.map((MapFunction<BipScore, Result>) bs -> {
|
||||||
|
|
||||||
.map((MapFunction<BipScore, Result>) bs -> {
|
|
||||||
Result ret = new Result();
|
Result ret = new Result();
|
||||||
|
|
||||||
ret.setId(bs.getId());
|
ret.setId(bs.getId());
|
||||||
|
@ -111,9 +137,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
.map(p -> new AtomicAction(Result.class, p))
|
.map(p -> new AtomicAction(Result.class, p))
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<Measure> getMeasure(BipScore value) {
|
private static List<Measure> getMeasure(BipScore value) {
|
||||||
|
@ -159,12 +183,4 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
|
||||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
|
||||||
return spark
|
|
||||||
.read()
|
|
||||||
.textFile(inputPath)
|
|
||||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.opencsv.bean.CsvBindByPosition;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Measure;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
public class BipProjectModel {
|
||||||
|
String projectId;
|
||||||
|
|
||||||
|
String numOfInfluentialResults;
|
||||||
|
|
||||||
|
String numOfPopularResults;
|
||||||
|
|
||||||
|
String totalImpulse;
|
||||||
|
|
||||||
|
String totalCitationCount;
|
||||||
|
|
||||||
|
// each project bip measure has exactly one value, hence one key-value pair
|
||||||
|
private Measure createMeasure(String measureId, String measureValue) {
|
||||||
|
|
||||||
|
KeyValue kv = new KeyValue();
|
||||||
|
kv.setKey("score");
|
||||||
|
kv.setValue(measureValue);
|
||||||
|
kv
|
||||||
|
.setDataInfo(
|
||||||
|
OafMapperUtils
|
||||||
|
.dataInfo(
|
||||||
|
false,
|
||||||
|
UPDATE_DATA_INFO_TYPE,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
UPDATE_MEASURE_BIP_CLASS_ID,
|
||||||
|
UPDATE_CLASS_NAME,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||||
|
""));
|
||||||
|
|
||||||
|
Measure measure = new Measure();
|
||||||
|
measure.setId(measureId);
|
||||||
|
measure.setUnit(Collections.singletonList(kv));
|
||||||
|
return measure;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Measure> toMeasures() {
|
||||||
|
return Arrays
|
||||||
|
.asList(
|
||||||
|
createMeasure("numOfInfluentialResults", numOfInfluentialResults),
|
||||||
|
createMeasure("numOfPopularResults", numOfPopularResults),
|
||||||
|
createMeasure("totalImpulse", totalImpulse),
|
||||||
|
createMeasure("totalCitationCount", totalCitationCount));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,19 +1,21 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.actionmanager.bipmodel.Score;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class that maps the model of the bipFinder! input data.
|
* Class that maps the model of the bipFinder! input data.
|
||||||
* Only needed for deserialization purposes
|
* Only needed for deserialization purposes
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
|
public class BipResultModel extends HashMap<String, List<Score>> implements Serializable {
|
||||||
|
|
||||||
public BipDeserialize() {
|
public BipResultModel() {
|
||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
|
|
||||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
||||||
|
import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
@ -82,9 +82,9 @@ public class PrepareBipFinder implements Serializable {
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
|
JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
|
||||||
.textFile(inputPath)
|
.textFile(inputPath)
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
|
.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));
|
||||||
|
|
||||||
spark
|
spark
|
||||||
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
|
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
|
||||||
|
|
|
@ -6,9 +6,15 @@
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "ip",
|
"paramName": "rip",
|
||||||
"paramLongName": "inputPath",
|
"paramLongName": "resultsInputPath",
|
||||||
"paramDescription": "the URL from where to get the programme file",
|
"paramDescription": "the URL from where to get the input file for results",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "pip",
|
||||||
|
"paramLongName": "projectsInputPath",
|
||||||
|
"paramDescription": "the URL from where to get the input file for projects",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -6,7 +6,8 @@ import static org.junit.jupiter.api.Assertions.*;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
|
||||||
|
import javax.xml.crypto.Data;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
@ -27,7 +28,9 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class SparkAtomicActionScoreJobTest {
|
public class SparkAtomicActionScoreJobTest {
|
||||||
|
@ -37,8 +40,8 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
private static SparkSession spark;
|
private static SparkSession spark;
|
||||||
|
|
||||||
private static Path workingDir;
|
private static Path workingDir;
|
||||||
private static final Logger log = LoggerFactory
|
|
||||||
.getLogger(SparkAtomicActionScoreJobTest.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class);
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void beforeAll() throws IOException {
|
public static void beforeAll() throws IOException {
|
||||||
|
@ -69,47 +72,64 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
private void runJob(String resultsInputPath, String projectsInputPath, String outputPath) throws Exception {
|
||||||
void testMatch() throws Exception {
|
|
||||||
String bipScoresPath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
SparkAtomicActionScoreJob
|
SparkAtomicActionScoreJob
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
"-isSparkSessionManaged",
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
Boolean.FALSE.toString(),
|
"-resultsInputPath", resultsInputPath,
|
||||||
"-inputPath",
|
"-projectsInputPath", projectsInputPath,
|
||||||
|
"-outputPath", outputPath,
|
||||||
bipScoresPath,
|
|
||||||
|
|
||||||
"-outputPath",
|
|
||||||
workingDir.toString() + "/actionSet"
|
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testScores() throws Exception {
|
||||||
|
|
||||||
|
String resultsInputPath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
String projectsInputPath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
String outputPath = workingDir.toString() + "/actionSet";
|
||||||
|
|
||||||
|
// execute the job to generate the action sets for result scores
|
||||||
|
runJob(resultsInputPath, projectsInputPath, outputPath);
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaRDD<Result> tmp = sc
|
JavaRDD<OafEntity> tmp = sc
|
||||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
.sequenceFile(outputPath, Text.class, Text.class)
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
.map(aa -> ((Result) aa.getPayload()));
|
.map(aa -> ((OafEntity) aa.getPayload()));
|
||||||
|
|
||||||
assertEquals(4, tmp.count());
|
assertEquals(8, tmp.count());
|
||||||
|
|
||||||
Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));
|
Dataset<OafEntity> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(OafEntity.class));
|
||||||
verificationDataset.createOrReplaceTempView("result");
|
verificationDataset.createOrReplaceTempView("result");
|
||||||
|
|
||||||
Dataset<Row> execVerification = spark
|
Dataset<Row> testDataset = spark
|
||||||
.sql(
|
.sql(
|
||||||
"Select p.id oaid, mes.id, mUnit.value from result p " +
|
"Select p.id oaid, mes.id, mUnit.value from result p " +
|
||||||
"lateral view explode(measures) m as mes " +
|
"lateral view explode(measures) m as mes " +
|
||||||
"lateral view explode(mes.unit) u as mUnit ");
|
"lateral view explode(mes.unit) u as mUnit ");
|
||||||
|
|
||||||
Assertions.assertEquals(12, execVerification.count());
|
// execVerification.show();
|
||||||
|
|
||||||
|
Assertions.assertEquals(28, testDataset.count());
|
||||||
|
|
||||||
|
assertResultImpactScores(testDataset);
|
||||||
|
assertProjectImpactScores(testDataset);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void assertResultImpactScores(Dataset<Row> testDataset) {
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"6.63451994567e-09", execVerification
|
"6.63451994567e-09", testDataset
|
||||||
.filter(
|
.filter(
|
||||||
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||||
"and id = 'influence'")
|
"and id = 'influence'")
|
||||||
|
@ -119,7 +139,7 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"0.348694533145", execVerification
|
"0.348694533145", testDataset
|
||||||
.filter(
|
.filter(
|
||||||
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||||
"and id = 'popularity_alt'")
|
"and id = 'popularity_alt'")
|
||||||
|
@ -129,7 +149,7 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"2.16094680115e-09", execVerification
|
"2.16094680115e-09", testDataset
|
||||||
.filter(
|
.filter(
|
||||||
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||||
"and id = 'popularity'")
|
"and id = 'popularity'")
|
||||||
|
@ -137,7 +157,49 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getString(0));
|
.getString(0));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void assertProjectImpactScores(Dataset<Row> testDataset) throws Exception {
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"0", testDataset
|
||||||
|
.filter(
|
||||||
|
"oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " +
|
||||||
|
"and id = 'numOfInfluentialResults'")
|
||||||
|
.select("value")
|
||||||
|
.collectAsList()
|
||||||
|
.get(0)
|
||||||
|
.getString(0));
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"1", testDataset
|
||||||
|
.filter(
|
||||||
|
"oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " +
|
||||||
|
"and id = 'numOfPopularResults'")
|
||||||
|
.select("value")
|
||||||
|
.collectAsList()
|
||||||
|
.get(0)
|
||||||
|
.getString(0));
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"25", testDataset
|
||||||
|
.filter(
|
||||||
|
"oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " +
|
||||||
|
"and id = 'totalImpulse'")
|
||||||
|
.select("value")
|
||||||
|
.collectAsList()
|
||||||
|
.get(0)
|
||||||
|
.getString(0));
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
"43", testDataset
|
||||||
|
.filter(
|
||||||
|
"oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " +
|
||||||
|
"and id = 'totalCitationCount'")
|
||||||
|
.select("value")
|
||||||
|
.collectAsList()
|
||||||
|
.get(0)
|
||||||
|
.getString(0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
{"projectId":"40|nsf_________::d93e50d22374a1cf59f6a232413ea027","numOfInfluentialResults":0,"numOfPopularResults":10,"totalImpulse":181,"totalCitationCount":235}
|
||||||
|
{"projectId":"40|nih_________::1c93debc7085e440f245fbe70b2e8b21","numOfInfluentialResults":14,"numOfPopularResults":17,"totalImpulse":1558,"totalCitationCount":4226}
|
||||||
|
{"projectId":"40|nih_________::c02a8233e9b60f05bb418f0c9b714833","numOfInfluentialResults":0,"numOfPopularResults":1,"totalImpulse":25,"totalCitationCount":43}
|
||||||
|
{"projectId":"40|corda_______::d91dcf3a87dd7f72248fab0b8a4ba273","numOfInfluentialResults":2,"numOfPopularResults":3,"totalImpulse":78,"totalCitationCount":178}
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Ranking Workflow for OpenAIRE Publications
|
||||||
|
|
||||||
|
This project contains the files for running a paper-ranking workflow on the OpenAIRE graph using Apache Oozie.
|
||||||
|
All scripts are written in Python, and the project setup follows the typical Oozie workflow structure:
|
||||||
|
|
||||||
|
- a workflow.xml file containing the workflow specification
|
||||||
|
- a job.properties file specifying parameter values for the parameters used by the workflow
|
||||||
|
- a set of python scripts used by the workflow
|
||||||
|
|
||||||
|
**NOTE**: the workflow depends on the external library of ranking scripts called [BiP! Ranker](https://github.com/athenarc/Bip-Ranker).
|
||||||
|
You can check out a specific tag/release of BiP! Ranker using Maven, as described in the following section.
|
||||||
|
|
||||||
|
## Build and deploy
|
||||||
|
|
||||||
|
Use the following command for packaging:
|
||||||
|
|
||||||
|
```
|
||||||
|
mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests
|
||||||
|
```
|
||||||
|
|
||||||
|
Deploy and run:
|
||||||
|
```
|
||||||
|
mvn package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use.
|
||||||
|
|
||||||
|
|
||||||
|
Job info and logs:
|
||||||
|
```
|
||||||
|
export OOZIE_URL=http://iis-cdh5-test-m3:11000/oozie
|
||||||
|
oozie job -info <jobId>
|
||||||
|
oozie job -log <jobId>
|
||||||
|
```
|
||||||
|
|
||||||
|
where `jobId` is the id of the job returned by the `run_workflow.sh` script.
|
|
@ -0,0 +1,62 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<parent>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-workflows</artifactId>
|
||||||
|
<version>1.2.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>dhp-impact-indicators</artifactId>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<maven.compiler.source>8</maven.compiler.source>
|
||||||
|
<maven.compiler.target>8</maven.compiler.target>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
|
||||||
|
<!-- Use this property to fetch a specific tag -->
|
||||||
|
<bip.ranker.tag>v1.0.0</bip.ranker.tag>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<scm>
|
||||||
|
<url>https://github.com/athenarc/Bip-Ranker</url>
|
||||||
|
<connection>scm:git:https://github.com/athenarc/Bip-Ranker.git</connection>
|
||||||
|
</scm>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-scm-plugin</artifactId>
|
||||||
|
<version>1.8.1</version>
|
||||||
|
<configuration>
|
||||||
|
<connectionType>connection</connectionType>
|
||||||
|
<scmVersionType>tag</scmVersionType><!-- 'branch' can also be provided here -->
|
||||||
|
<scmVersion>${bip.ranker.tag}</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
|
||||||
|
<checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/bip-ranker</checkoutDirectory>
|
||||||
|
</configuration>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>checkout-bip-ranker</id>
|
||||||
|
<phase>prepare-package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>checkout</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-aggregation</artifactId>
|
||||||
|
<version>${projectVersion}</version>
|
||||||
|
<scope>compile</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,105 @@
|
||||||
|
# The following set of properties are defined in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters
|
||||||
|
# and concern the parameterization required for running workflows on the @GARR cluster
|
||||||
|
|
||||||
|
# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties ---
|
||||||
|
# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
|
||||||
|
# dhp.hadoop.frontend.user.name=ilias.kanellos
|
||||||
|
# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
|
||||||
|
# dhp.hadoop.frontend.port.ssh=22
|
||||||
|
# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
|
||||||
|
# jobTracker=yarnRM
|
||||||
|
# nameNode=hdfs://nameservice1
|
||||||
|
# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
|
||||||
|
# maven.executable=mvn
|
||||||
|
|
||||||
|
# Some memory and driver settings for more demanding tasks
|
||||||
|
sparkHighDriverMemory=20G
|
||||||
|
sparkNormalDriverMemory=10G
|
||||||
|
|
||||||
|
sparkHighExecutorMemory=20G
|
||||||
|
sparkNormalExecutorMemory=10G
|
||||||
|
|
||||||
|
sparkExecutorCores=4
|
||||||
|
sparkShufflePartitions=7680
|
||||||
|
|
||||||
|
# The above is given differently in an example I found online
|
||||||
|
oozie.action.sharelib.for.spark=spark2
|
||||||
|
oozieActionShareLibForSpark2=spark2
|
||||||
|
spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
|
||||||
|
spark2EventLogDir=/user/spark/spark2ApplicationHistory
|
||||||
|
sparkSqlWarehouseDir=/user/hive/warehouse
|
||||||
|
hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
|
||||||
|
# This MAY avoid the no library used error
|
||||||
|
oozie.use.system.libpath=true
|
||||||
|
# Some stuff copied from openaire's jobs
|
||||||
|
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
|
||||||
|
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE: the two listener properties below repeat, with identical values, the ones already set above (copied from OpenAIRE's jobs)
|
||||||
|
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
|
||||||
|
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------ #
|
||||||
|
# The following properties are custom ones, specific to this workflow
|
||||||
|
|
||||||
|
# Based on the page linked to at the start of the file, if we use yarn as a resource manager, its address is given as follows
|
||||||
|
resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster
|
||||||
|
|
||||||
|
# current year used when creating graph / by some ranking methods
|
||||||
|
currentYear=2023
|
||||||
|
|
||||||
|
# Alpha value for pagerank
|
||||||
|
pageRankAlpha=0.5
|
||||||
|
# AttRank values
|
||||||
|
attrankAlpha=0.2
|
||||||
|
attrankBeta=0.5
|
||||||
|
attrankGamma=0.3
|
||||||
|
attrankRho=-0.16
|
||||||
|
# attrankCurrentYear=2023
|
||||||
|
attrankStartYear=2021
|
||||||
|
|
||||||
|
# Ram values
|
||||||
|
ramGamma=0.6
|
||||||
|
# ramCurrentYear=2023
|
||||||
|
|
||||||
|
# Convergence error for pagerank
|
||||||
|
convergenceError=0.000000000001
|
||||||
|
|
||||||
|
# I think this should be the oozie workflow directory
|
||||||
|
# oozieWorkflowPath=user/ilias.kanellos/workflow_example/
|
||||||
|
|
||||||
|
# Directory where json data containing scores will be output
|
||||||
|
bipScorePath=${workingDir}/openaire_universe_scores/
|
||||||
|
|
||||||
|
# Directory where dataframes are checkpointed
|
||||||
|
checkpointDir=${nameNode}/${workingDir}/check/
|
||||||
|
|
||||||
|
# The directory for the doi-based bip graph
|
||||||
|
# bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph
|
||||||
|
|
||||||
|
# The folder from which synonyms of openaire-ids are read
|
||||||
|
# openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/
|
||||||
|
openaireDataInput=/tmp/prod_provision/graph/18_graph_blacklisted
|
||||||
|
|
||||||
|
# A folder where we will write the openaire to doi mapping
|
||||||
|
synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/
|
||||||
|
|
||||||
|
# This will be where we store the openaire graph input. They told us on GARR to use a directory under /data
|
||||||
|
openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph
|
||||||
|
|
||||||
|
# The workflow application path
|
||||||
|
wfAppPath=${oozieTopWfApplicationPath}
|
||||||
|
|
||||||
|
# The following is needed as a property of a workflow
|
||||||
|
#oozie.wf.application.path=${wfAppPath}
|
||||||
|
oozie.wf.application.path=${oozieTopWfApplicationPath}
|
||||||
|
|
||||||
|
|
||||||
|
# Path where the final output should be?
|
||||||
|
actionSetOutputPath=${workingDir}/bip_actionsets
|
||||||
|
|
||||||
|
# The directory to store project impact indicators
|
||||||
|
projectImpactIndicatorsOutput=${workingDir}/project_indicators
|
||||||
|
|
||||||
|
resume=entry-point-decision
|
|
@ -0,0 +1,255 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
# Create openaire id - openaire id graph from openaire data
|
||||||
|
|
||||||
|
#############################################################################################################
|
||||||
|
# Program proceeds as follows:
|
||||||
|
# 1. We read the input folder provided from hdfs.
|
||||||
|
# This contains subfolders with openaire graph objects and openaire graph relations
|
||||||
|
# 2. We select all openaire graph objects of interest. We filter out based on visibility
|
||||||
|
# and inference criteria. We also filter out based on the availability of publication year
|
||||||
|
# 3. Get reference type dataframes from openaire. Then filter each one of them based on the
|
||||||
|
# existence of citing and cited in the above filtered dataset. Get only citations
|
||||||
|
# produced by publication objects, or otherresearchproducts of types:
|
||||||
|
# [TBD]
|
||||||
|
# 4. Get objects that don't appear in the relations (from those gathered in step 1) and add
|
||||||
|
# them to the graph
|
||||||
|
# 5. Group relations by citing paper and do graph-specific formatting
|
||||||
|
#############################################################################################################
|
||||||
|
# ---------- Imports ------------- #
|
||||||
|
import sys
|
||||||
|
# import pyspark
|
||||||
|
# from pyspark import SparkConf, SparkContext
|
||||||
|
from pyspark.sql import SparkSession
|
||||||
|
# Functions to effectively handle data
|
||||||
|
# manipulation for DataFrames
|
||||||
|
import pyspark.sql.functions as F
|
||||||
|
# Diagnostics
|
||||||
|
from timeit import default_timer as timer
|
||||||
|
# from datetime import timedelta, datetime
|
||||||
|
# -------------------------------- #
|
||||||
|
|
||||||
|
if len(sys.argv) < 5:
|
||||||
|
print ("Usage: ./create_openaire_ranking_graph.py <openaire_graph_data_folder> <current_year> <num_partitions> <output_folder>")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# Inputs will be:
|
||||||
|
|
||||||
|
# 1. Folder where openaire graph is stored
|
||||||
|
graph_folder = sys.argv[1]
|
||||||
|
# 2. Current year (this will be needed for filtering)
|
||||||
|
current_year = int(sys.argv[2])
|
||||||
|
# 3. Number of partitions
|
||||||
|
num_partitions = int(sys.argv[3])
|
||||||
|
# 4. where to write output
|
||||||
|
output_folder = sys.argv[4]
|
||||||
|
|
||||||
|
# Lists of results types we want to include in the citations
|
||||||
|
# valid_result_types = ['publication', 'other']
|
||||||
|
valid_result_types = ['publication']
|
||||||
|
# list of types in otherresearchproduct which are considered valid for citations
|
||||||
|
valid_other = ['']
|
||||||
|
|
||||||
|
# Create the spark session
|
||||||
|
spark = SparkSession.builder.appName('oa ranking graph creation').getOrCreate()
|
||||||
|
# Set context level logging to WARN
|
||||||
|
spark.sparkContext.setLogLevel("WARN")
|
||||||
|
|
||||||
|
############################################################################################################################
|
||||||
|
# 1. Get the research objects and filter based on conditions.
|
||||||
|
# These will also be the unique identifiers we should find in the final graph
|
||||||
|
|
||||||
|
# Initialize an empty dataframe
|
||||||
|
oa_objects_df = None
|
||||||
|
|
||||||
|
# There is a directory structure on hdfs under the provided path.
|
||||||
|
# We need to parse data from the folders: ["publication", "dataset", "software", "otherresearchproduct"]
|
||||||
|
# which are rankable oa result objects.
|
||||||
|
|
||||||
|
# Loop subfolders
|
||||||
|
# Read every rankable result type and accumulate the records that are neither
# deleted by inference nor invisible into a single dataframe.
for sub_folder in ["publication", "dataset", "software", "otherresearchproduct"]:
    # Read the json data of this result type, keeping only the fields used below
    sub_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname', 'datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year'))
    sub_df = sub_df.where('datainfo.deletedbyinference = false').where('datainfo.invisible = false')

    # Explicit identity check: a DataFrame has no meaningful truth value, so
    # "not oa_objects_df" only worked because the object is always truthy.
    if oa_objects_df is None:
        # First sub-folder: it becomes the initial content of the objects dataframe
        oa_objects_df = sub_df.repartition(num_partitions, 'id').cache()
    else:
        # If we already have data, simply add more to it
        sub_df = sub_df.cache()
        oa_objects_df = oa_objects_df.union(sub_df).repartition(num_partitions, 'id').cache()
        # Clear memory
        sub_df.unpersist(True)
|
||||||
|
|
||||||
|
# Remove those records without year
|
||||||
|
oa_objects_df = oa_objects_df.where(F.col('year').isNotNull())
|
||||||
|
|
||||||
|
|
||||||
|
# Now replace years where > (current_year+1) with 0
|
||||||
|
# A year later than (current_year + 1) is replaced by 0; any other year is kept as is.
latest_accepted_year = current_year + 1
year_too_late = F.col('year').cast('int') > latest_accepted_year
clean_year_col = F.when(year_too_late, 0).otherwise(F.col('year'))

# Build the cleaned column next to the raw one, then swap it in under the original name
oa_objects_df = oa_objects_df.withColumn('clean_year', clean_year_col)
oa_objects_df = oa_objects_df.drop('year')
oa_objects_df = oa_objects_df.withColumnRenamed('clean_year', 'year')
oa_objects_df = oa_objects_df.repartition(num_partitions, 'id')
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------- #
|
||||||
|
'''
|
||||||
|
# Some diagnostics
|
||||||
|
print ("Min and max years:" )
|
||||||
|
oa_objects_df.select(F.max('year')).show()
|
||||||
|
oa_objects_df.select(F.min('year')).show()
|
||||||
|
|
||||||
|
# This should be slow due to not repartitioning by year
|
||||||
|
print ("Distinct years:")
|
||||||
|
oa_objects_df.select('year').distinct().sort(F.col('year')).show(5000, False)
|
||||||
|
|
||||||
|
# Show distinct values of deletedbyinference and invisible to ensure we have the correct data
|
||||||
|
print ("Distinct deleted by inference:")
|
||||||
|
oa_objects_df.select('deletedbyinference').distinct().show()
|
||||||
|
print ("Distinct invisible values:")
|
||||||
|
oa_objects_df.select('invisible').distinct().show()
|
||||||
|
|
||||||
|
# Output total count
|
||||||
|
print ("Total num of research objects: " + str(oa_objects_df.count()))
|
||||||
|
'''
|
||||||
|
# -------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
# Keep only required fields - we still keep resulttype.classname to
|
||||||
|
# filter the citation relationships we consider valid
|
||||||
|
oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache()
|
||||||
|
|
||||||
|
'''
|
||||||
|
print ("OA objects Schema:")
|
||||||
|
oa_objects_df.printSchema()
|
||||||
|
sys.exit(0)
|
||||||
|
'''
|
||||||
|
############################################################################################################################
|
||||||
|
# 2. Get the relation objects and filter them based on their existence in the oa_objects_df
|
||||||
|
# NOTE: we are only interested in citations of type "cites"
|
||||||
|
# Further, we
|
||||||
|
|
||||||
|
# Deprecated line
|
||||||
|
# references_df = spark.read.json(graph_folder + "/relation").select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass')\
|
||||||
|
# .where( 'relClass = "References"' ).repartition(num_partitions, 'citing').drop('relClass')
|
||||||
|
# print ("References df has: " + str(references_df.count()) + " entries")
|
||||||
|
|
||||||
|
# Collect only valid citations i.e., invisible = false & deletedbyinference=false
|
||||||
|
# Load the relations, keeping both endpoints plus the fields needed for filtering
relations_df = spark.read.json(graph_folder + "/relation")\
    .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'collectedfrom.value', 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')

# Keep only "Cites" relations that are neither deleted by inference nor invisible
is_cites_relation = F.col('relClass') == "Cites"
not_deleted_by_inference = F.col('dataInfo.deletedbyinference') == "false"
not_invisible = F.col('dataInfo.invisible') == "false"
valid_relations_df = relations_df.where(is_cites_relation & not_deleted_by_inference & not_invisible)

# Remove the filtering columns and distribute the rows by citing id
valid_relations_df = valid_relations_df\
    .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\
    .drop('deletedbyinference').drop('invisible')\
    .repartition(num_partitions, 'citing').drop('relClass')

# Lower-case every entry of the 'value' array (selected from collectedfrom)
# so that the source names can be matched case-insensitively
valid_relations_df = valid_relations_df\
    .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\
    .drop('collectedfrom.value')\
    .drop('value')

# A citation is kept when at least one of these sources appears in 'collected_lower'
from_accepted_source = F.array_contains(F.col('collected_lower'), "opencitations")
from_accepted_source = from_accepted_source | F.array_contains(F.col('collected_lower'), "crossref")
from_accepted_source = from_accepted_source | F.array_contains(F.col('collected_lower'), "microsoft academic graph")

cites_df = valid_relations_df.where(from_accepted_source).drop('collected_lower')
|
||||||
|
# print ("Cited df has: " + str(cites_df.count()) + " entries")
|
||||||
|
|
||||||
|
# DEPRECATED
|
||||||
|
# cited_by_df = spark.read.json(graph_folder + "/relation").select(F.col('target').alias('citing'), F.col('source').alias('cited'), 'relClass')\
|
||||||
|
# .where( 'relClass = "IsCitedBy"' ).repartition(num_partitions, 'citing').drop('relClass')
|
||||||
|
# print ("Cited by df has: " + str(cited_by_df.count()) + " entries")
|
||||||
|
|
||||||
|
# DEPRECATED
|
||||||
|
# Keep only relations where citing and cited are in the oa_objects_df
|
||||||
|
# references_df = references_df.join(oa_objects_df.select('id'), references_df.citing == oa_objects_df.id).drop('id')
|
||||||
|
# references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
|
||||||
|
# print ("References df now has: " + str(references_df.count()) + " entries")
|
||||||
|
|
||||||
|
cites_df = cites_df.join(oa_objects_df.select('id', 'classname'), cites_df.citing == oa_objects_df.id).where( F.col('classname').isin(valid_result_types) ).drop('id').drop('classname')
|
||||||
|
cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).distinct().repartition(num_partitions, 'citing').cache()
|
||||||
|
# TODO: add here a clause filtering out the citations
|
||||||
|
# originating from "other" types of research objects which we consider valid
|
||||||
|
|
||||||
|
# print ("Cites df now has: " + str(cites_df.count()) + " entries")
|
||||||
|
|
||||||
|
# DEPRECATED
|
||||||
|
# cited_by_df = cited_by_df.join(oa_objects_df.select('id'), cited_by_df.citing == oa_objects_df.id).drop('id')
|
||||||
|
# cited_by_df = cited_by_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cited_by_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
|
||||||
|
# print ("Cited BY df now has: " + str(cited_by_df.count()) + " entries")
|
||||||
|
|
||||||
|
# DEPRECATED
|
||||||
|
# Join all the above into a single set
|
||||||
|
# citations_df = references_df.union(cites_df).distinct().repartition(num_partitions, 'citing').cache()
|
||||||
|
# Free space
|
||||||
|
# references_df.unpersist(True)
|
||||||
|
# cites_df.unpersist(True)
|
||||||
|
|
||||||
|
# citations_df = citations_df.union(cited_by_df).distinct().repartition(num_partitions, 'citing').cache()
|
||||||
|
|
||||||
|
# ALL citations we keep are in the cites_df dataframe
|
||||||
|
citations_df = cites_df
|
||||||
|
|
||||||
|
'''
|
||||||
|
# Show schema
|
||||||
|
print ("Citation schema:")
|
||||||
|
citations_df.printSchema()
|
||||||
|
print ("Objects schema:")
|
||||||
|
oa_objects_df.printSchema()
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Free space
|
||||||
|
# cited_by_df.unpersist(True)
|
||||||
|
|
||||||
|
# Show total num of unique citations
|
||||||
|
'''
|
||||||
|
num_unique_citations = citations_df.count()
|
||||||
|
print ("Total unique citations: " + str(num_unique_citations))
|
||||||
|
'''
|
||||||
|
############################################################################################################################
|
||||||
|
# 3. Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references)
|
||||||
|
dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\
|
||||||
|
.select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing')
|
||||||
|
# Count dangling nodes
|
||||||
|
'''
|
||||||
|
dangling_num = dangling_nodes.count()
|
||||||
|
print ("Number of dangling nodes: " + str(dangling_num))
|
||||||
|
'''
|
||||||
|
# print ("Dangling nodes sample:")
|
||||||
|
# dangling_nodes.show(10, False)
|
||||||
|
############################################################################################################################
|
||||||
|
# 4. Group the citation dataframe by citing doi, and create the cited dois list. Add dangling nodes to the result
|
||||||
|
graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited')).repartition(num_partitions, 'citing').cache()
|
||||||
|
# Free space
|
||||||
|
citations_df.unpersist(True)
|
||||||
|
|
||||||
|
'''
|
||||||
|
num_nodes = graph.count()
|
||||||
|
print ("Entries in graph before dangling nodes:" + str(num_nodes))
|
||||||
|
'''
|
||||||
|
# print ("Sample in graph: ")
|
||||||
|
# graph.show(10, False)
|
||||||
|
|
||||||
|
# Add dangling nodes
|
||||||
|
graph = graph.union(dangling_nodes).repartition(num_partitions, 'citing')
|
||||||
|
# Count current number of results
|
||||||
|
num_nodes = graph.count()
|
||||||
|
print ("Num entries after adding dangling nodes: " + str(num_nodes))
|
||||||
|
|
||||||
|
# Add publication year
|
||||||
|
graph = graph.join(oa_objects_df, graph.citing == oa_objects_df.id).select('citing', 'cited', 'year').cache()
|
||||||
|
num_nodes_final = graph.count()
|
||||||
|
print ("After adding year: " + str(num_nodes_final))
|
||||||
|
# print ("Graph sample:")
|
||||||
|
# graph.show(20, False)
|
||||||
|
# Calculate initial score of nodes (1/N)
|
||||||
|
initial_score = float(1)/float(num_nodes_final)
|
||||||
|
############################################################################################################################
|
||||||
|
# 5. Write graph to output file!
|
||||||
|
print("Writing output to: " + output_folder)
|
||||||
|
|
||||||
|
# Each output row has the tab-separated columns: citing, cited, prev_pr, year.
# The 'cited' column packs three '|'-separated values:
#   <cited ids joined by ','> | <number of references> | <initial score>
#
# Dangling nodes were given the placeholder list ["0"], so the placeholder is
# the FIRST element of the array. getItem() is 0-based: the previous getItem(1)
# looked at the second element, which is null for every one-element list, and
# therefore reported 0 references for nodes that cite exactly one object.
first_cited = F.col('cited').getItem(0)
num_references = F.when(first_cited != "0", F.size('cited')).otherwise(F.lit("0"))
cited_field = F.concat_ws("|", F.concat_ws(",", 'cited'), num_references, F.lit(str(initial_score)))

graph.select('citing', cited_field.alias('cited'), 'year')\
    .withColumn('prev_pr', F.lit("0"))\
    .select('citing', 'cited', 'prev_pr', 'year')\
    .write.mode("overwrite").option("delimiter", "\t").csv(output_folder, compression="gzip")
|
||||||
|
|
||||||
|
if num_nodes_final != num_nodes:
|
||||||
|
print ("WARNING: the number of nodes after keeping only nodes where year is available went from: " + str(num_nodes) + " to " + str(num_nodes_final) + "\n")
|
||||||
|
print ("Check for any mistakes...")
|
||||||
|
|
||||||
|
############################################################################################################################
|
||||||
|
print ("\nDONE!\n\n")
|
||||||
|
# Wrap up
|
||||||
|
spark.stop()
|
|
@ -0,0 +1,795 @@
|
||||||
|
# This program reads hdfs directories containing ranking results from openaire's cluster.
|
||||||
|
# Based on the parameters provided by the user, it will create different types of output files.
|
||||||
|
|
||||||
|
# Modes available are:
|
||||||
|
# 1. bip
|
||||||
|
# This will result in output of the form required for bip-finder's update.
|
||||||
|
# Its lines conform to the following format:
|
||||||
|
# <doi> \t <pagerank> \t <pagerank_normalized> \t <attrank> \t <attrank_normalized> \t <citation_count> \t <citation_count_normalized> \t <3y_cc> \t <3y_cc_normalized> \t <tar_ram> \t <references_count>
|
||||||
|
|
||||||
|
# 2. zenodo
|
||||||
|
# This is the format used in zenodo for Bip-DB. (6 way classes will be named C1, C2, ..., C6)
|
||||||
|
# This should output two files per ranking method with each line having the following data:
|
||||||
|
# a. <id> <score> <6-way-class>
|
||||||
|
# NOTE: this should also run for openaire-id files, hence we should have a total of 4 files per ranking (2 for each type of identifier)
|
||||||
|
# In 'zenodo' mode the user specifies only a single file, for which zenodo-based output will be created
|
||||||
|
|
||||||
|
# 3. json
|
||||||
|
# This is the format used to provide openAIRE / claudio with data containing 1 json per identifier
|
||||||
|
# An example of such a json format follows:
|
||||||
|
#{
|
||||||
|
# "50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [
|
||||||
|
# {
|
||||||
|
# "id": "influence",
|
||||||
|
# "unit": [
|
||||||
|
# {
|
||||||
|
# "key": "score",
|
||||||
|
# "value": "5.06690394631e-09"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "key": "class",
|
||||||
|
# "value": "C"
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "id": "popularity_alt",
|
||||||
|
# "unit": [
|
||||||
|
# {
|
||||||
|
# "key": "score",
|
||||||
|
# "value": "0.0"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "key": "class",
|
||||||
|
# "value": "C"
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "id": "popularity",
|
||||||
|
# "unit": [
|
||||||
|
# {
|
||||||
|
# "key": "score",
|
||||||
|
# "value": "3.11855618382e-09"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "key": "class",
|
||||||
|
# "value": "C"
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "id": "influence_alt",
|
||||||
|
# "unit": [
|
||||||
|
# {
|
||||||
|
# "key": "score",
|
||||||
|
# "value": "0.0"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "key": "class",
|
||||||
|
# "value": "C"
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "id": "impulse",
|
||||||
|
# "unit": [
|
||||||
|
# {
|
||||||
|
# "key": "score",
|
||||||
|
# "value": "0.0"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "key": "class",
|
||||||
|
# "value": "C"
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
#}
|
||||||
|
|
||||||
|
|
||||||
|
#################################################################################################
|
||||||
|
# Imports
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Sparksession lib to communicate with cluster via session object
|
||||||
|
from pyspark.sql import SparkSession
|
||||||
|
|
||||||
|
# Import sql types to define the schema of score output files
|
||||||
|
from pyspark.sql.types import *
|
||||||
|
|
||||||
|
# Import sql functions with shorthand alias
|
||||||
|
import pyspark.sql.functions as F
|
||||||
|
from pyspark.sql.functions import udf
|
||||||
|
|
||||||
|
# Json specific encoding
|
||||||
|
import json
|
||||||
|
#################################################################################################
|
||||||
|
# Clean up directory name
|
||||||
|
def clean_directory_name(dir_name):
    """Derive the output name for a ranking-result directory name.

    The input has the form *_bip_universe<digits>_* or *_graph_universe<digits>_*.
    Steps applied:
      1. Split on '_' and discard every part containing 'bip', 'graph',
         'universe' or 'from'.
      2. Re-join with '_', turn every '_id' into '_ids', and strip every
         '.txt' and '.gz' occurrence.
      3. If the result contains 'openaire_ids_', remove that marker and append
         '_openaire_ids.txt.gz'; otherwise append '.txt.gz/'.

    :param dir_name: last path component of a ranking-result directory
    :return: the cleaned output name
    """
    dropped_markers = ('bip', 'graph', 'universe', 'from')

    kept_parts = []
    for token in dir_name.split('_'):
        if any(marker in token for marker in dropped_markers):
            continue
        kept_parts.append(token)

    base_name = '_'.join(kept_parts).replace('_id', '_ids')
    for extension in ('.txt', '.gz'):
        base_name = base_name.replace(extension, '')

    if 'openaire_ids_' not in base_name:
        return base_name + '.txt.gz/'

    return base_name.replace('openaire_ids_', '') + '_openaire_ids.txt.gz'
|
||||||
|
# --------------------------------------------------------------------------------------------- #
|
||||||
|
# User defined function to escape special characters in a string that will turn into a json key
|
||||||
|
@udf(StringType())
def json_encode_key(doi_string):
    """Spark UDF returning `doi_string` serialised with json.dumps, so that
    special characters are escaped before the value is used as a json key."""
    encoded_key = json.dumps(doi_string)
    return encoded_key
|
||||||
|
#################################################################################################
|
||||||
|
# --------------------------------------------------------------------------------------------- #
|
||||||
|
# Arguments from command line and initializations
|
||||||
|
|
||||||
|
# Time initialization
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# Check whether input is correct, otherwise exit with appropriate message
|
||||||
|
usage_message = "Usage: ./format_ranking_results.py <mode> <input_file|input_file_list> <num_partitions>"

# Without at least a mode argument there is nothing to do: show usage and stop
if not len(sys.argv) >= 2:
    print (usage_message)
    sys.exit(0)

# Modes this script knows how to produce
valid_modes = ['json', 'zenodo', 'bip', 'json-5-way']
# Mode requested by the user (surrounding whitespace ignored)
mode = sys.argv[1].strip()

# Unknown mode: show usage plus the accepted modes and stop
if not (mode in valid_modes):
    print (usage_message + "\n")
    print ("Invalid mode provided. Valid modes: ['zenodo', 'bip', 'json', 'json-5-way']")
    sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# Once here, we should be more or less okay to run.
|
||||||
|
|
||||||
|
# Define the spark session object
|
||||||
|
spark = SparkSession.builder.appName('Parse Scores - ' + str(mode) + ' mode').getOrCreate()
|
||||||
|
# Set Log Level for spark session
|
||||||
|
spark.sparkContext.setLogLevel('WARN')
|
||||||
|
|
||||||
|
# Here we define the schema shared by all score output files
|
||||||
|
# - citation count variants have a slightly different schema, due to their scores being integers
|
||||||
|
def _score_schema(score_type):
    # Schema shared by the score files; only the type of 'score' varies.
    return StructType([
        StructField('id', StringType(), False),
        StructField('score', score_type, False),
        StructField('normalized_score', FloatType(), False),
        StructField('3-way-class', StringType(), False),
        StructField('5-way-class', StringType(), False)
    ])

# Score files whose 'score' column is read as a float
float_schema = _score_schema(FloatType())

# Score files whose 'score' column is read as an integer (citation count variants)
int_schema = _score_schema(IntegerType())

# This schema concerns the output of the file
# containing the number of references of each doi
refs_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (('id', StringType()), ('num_refs', IntegerType()))
])
|
||||||
|
|
||||||
|
print("--- Initialization time: %s seconds ---" % (time.time() - start_time))
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
# Time the main program execution
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# The following is executed when the user requests the bip-update specific file
|
||||||
|
if mode == 'bip':
|
||||||
|
|
||||||
|
# Read the remaining input files
|
||||||
|
if len(sys.argv) < 8:
|
||||||
|
print ("\n\nInsufficient input for 'bip' mode.")
|
||||||
|
print ("File list required: <pagerank> <attrank> <citation count> <3-year citation count> <tar-ram> <number of references> <num_partitions>\n")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# Read number of partitions:
|
||||||
|
num_partitions = int(sys.argv[-1])
|
||||||
|
|
||||||
|
|
||||||
|
pagerank_dir = sys.argv[2]
|
||||||
|
attrank_dir = sys.argv[3]
|
||||||
|
cc_dir = sys.argv[4]
|
||||||
|
impulse_dir = sys.argv[5]
|
||||||
|
ram_dir = sys.argv[6]
|
||||||
|
|
||||||
|
# NOTE: This was used initially, but @Serafeim told me to remove it since we don't get doi-doi references anymore
|
||||||
|
# In case of emergency, bring this back
|
||||||
|
# refs_dir = sys.argv[7]
|
||||||
|
|
||||||
|
# Score-specific dataframe
|
||||||
|
pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id')
|
||||||
|
attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id')
|
||||||
|
cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id')
|
||||||
|
impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id')
|
||||||
|
ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id')
|
||||||
|
# refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 'id')
|
||||||
|
|
||||||
|
# ----------- TESTING CODE --------------- #
|
||||||
|
# pagerank_entries = pagerank_df.count()
|
||||||
|
# attrank_entries = attrank_df.count()
|
||||||
|
# cc_entries = cc_df.count()
|
||||||
|
# impulse_entries = impulse_df.count()
|
||||||
|
# ram_entries = ram_df.count()
|
||||||
|
# refs_entries = refs_df.count()
|
||||||
|
|
||||||
|
# print ("Pagerank:" + str(pagerank_entries))
|
||||||
|
# print ("AttRank:" + str(attrank_entries))
|
||||||
|
# print ("CC entries: " + str(cc_entries))
|
||||||
|
# print ("Impulse entries: " + str(impulse_entries))
|
||||||
|
# print ("Refs: " + str(refs_entries))
|
||||||
|
# ---------------------------------------- #
|
||||||
|
|
||||||
|
# Create a new dataframe with the required data
|
||||||
|
results_df = pagerank_df.select('id', F.col('score').alias('pagerank'), F.col('normalized_score').alias('pagerank_normalized'))
|
||||||
|
# Add attrank dataframe
|
||||||
|
results_df = results_df.join(attrank_df.select('id', 'score', 'normalized_score'), ['id'])\
|
||||||
|
.select(results_df.id, 'pagerank', 'pagerank_normalized', F.col('score').alias('attrank'), F.col('normalized_score').alias('attrank_normalized'))
|
||||||
|
|
||||||
|
# Add citation count dataframe
|
||||||
|
results_df = results_df.join(cc_df.select('id', 'score', 'normalized_score'), ['id'])\
|
||||||
|
.select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', F.col('score').alias('cc'), F.col('normalized_score').alias('cc_normalized'))
|
||||||
|
|
||||||
|
# Add 3-year df
|
||||||
|
results_df = results_df.join(impulse_df.select('id', 'score', 'normalized_score'), ['id'])\
|
||||||
|
.select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized', \
|
||||||
|
F.col('score').alias('3-cc'), F.col('normalized_score').alias('3-cc_normalized'))
|
||||||
|
|
||||||
|
# Add ram df
|
||||||
|
results_df = results_df.join(ram_df.select('id', 'score'), ['id'])\
|
||||||
|
.select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized',\
|
||||||
|
'3-cc', '3-cc_normalized', F.col('score').alias('ram'))
|
||||||
|
|
||||||
|
# Add references - THIS WAS REMOVED SINCE WE DON't GET DOI REFERENCES
|
||||||
|
# In case of emergency bring back
|
||||||
|
# results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \
|
||||||
|
# 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs')
|
||||||
|
|
||||||
|
# Write resulting dataframe to file
|
||||||
|
output_dir = "/".join(pagerank_dir.split('/')[:-1])
|
||||||
|
output_dir = output_dir + '/bip_update_data.txt.gz'
|
||||||
|
|
||||||
|
print("Writing to:" + output_dir)
|
||||||
|
results_df.write.mode('overwrite').option('delimiter','\t').option('header',True).csv(output_dir, compression='gzip')
|
||||||
|
|
||||||
|
# The following is executed when the user requests the zenodo-specific file
|
||||||
|
elif mode == 'zenodo':
|
||||||
|
|
||||||
|
# Read the remaining input files
|
||||||
|
if len(sys.argv) < 9:
|
||||||
|
print ("\n\nInsufficient input for 'zenodo' mode.")
|
||||||
|
print ("File list required: <pagerank> <attrank> <citation count> <3-year citation count> <tar-ram> <num_partitions> <graph_type>\n")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# Read number of partitions:
|
||||||
|
num_partitions = int(sys.argv[-2])
|
||||||
|
graph_type = sys.argv[-1]
|
||||||
|
|
||||||
|
if graph_type not in ['bip', 'openaire']:
|
||||||
|
graph_type = 'bip'
|
||||||
|
|
||||||
|
pagerank_dir = sys.argv[2]
|
||||||
|
attrank_dir = sys.argv[3]
|
||||||
|
cc_dir = sys.argv[4]
|
||||||
|
impulse_dir = sys.argv[5]
|
||||||
|
ram_dir = sys.argv[6]
|
||||||
|
|
||||||
|
# Output directory is common for all files
|
||||||
|
output_dir_prefix = "/".join(pagerank_dir.split('/')[:-1])
|
||||||
|
# Method-specific outputs
|
||||||
|
pagerank_output = clean_directory_name(pagerank_dir.split('/')[-1])
|
||||||
|
attrank_output = clean_directory_name(attrank_dir.split('/')[-1])
|
||||||
|
cc_output = clean_directory_name(cc_dir.split('/')[-1])
|
||||||
|
impulse_output = clean_directory_name(impulse_dir.split('/')[-1])
|
||||||
|
ram_output = clean_directory_name(ram_dir.split('/')[-1])
|
||||||
|
|
||||||
|
# --------- PageRank ----------- #
|
||||||
|
# Get per file the doi - score - 6-way classes and write it to output
|
||||||
|
print("Writing to: " + output_dir_prefix + '/' + pagerank_output)
|
||||||
|
pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class')
|
||||||
|
# Replace dataframe class names
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.lit('C6'))
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.drop('5-way-class')
|
||||||
|
|
||||||
|
if graph_type == 'openaire':
|
||||||
|
pagerank_df = pagerank_df.where( ~F.col('id').like('10.%') )
|
||||||
|
|
||||||
|
# Write output
|
||||||
|
pagerank_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + pagerank_output, compression='gzip')
|
||||||
|
# --------- AttRank ----------- #
|
||||||
|
print("Writing to: " + output_dir_prefix + '/' + attrank_output)
|
||||||
|
attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class')
|
||||||
|
# Replace dataframe class names
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.lit('C6'))
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.drop('5-way-class')
|
||||||
|
|
||||||
|
if graph_type == 'openaire':
|
||||||
|
attrank_df = attrank_df.where( ~F.col('id').like('10.%') )
|
||||||
|
|
||||||
|
# Write output
|
||||||
|
attrank_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + attrank_output, compression='gzip')
|
||||||
|
# --------- Citation Count ----------- #
|
||||||
|
print("Writing to: " + output_dir_prefix + '/' + cc_output)
|
||||||
|
cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class')
|
||||||
|
# Replace dataframe class names
|
||||||
|
cc_df = cc_df.withColumn('class', F.lit('C5'))
|
||||||
|
# cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.drop('5-way-class')
|
||||||
|
|
||||||
|
if graph_type == 'openaire':
|
||||||
|
cc_df = cc_df.where( ~F.col('id').like('10.%') )
|
||||||
|
|
||||||
|
# Write output
|
||||||
|
cc_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + cc_output, compression='gzip')
|
||||||
|
# --------- Impulse ----------- #
|
||||||
|
print("Writing to: " + output_dir_prefix + '/' + impulse_output)
|
||||||
|
impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class')
|
||||||
|
# Replace dataframe class names
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.lit('C5'))
|
||||||
|
# impulse_df = impulse_df.withColumn('class', F.when(F.col('6-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.drop('5-way-class')
|
||||||
|
|
||||||
|
if graph_type == 'openaire':
|
||||||
|
impulse_df = impulse_df.where( ~F.col('id').like('10.%') )
|
||||||
|
|
||||||
|
# Write output
|
||||||
|
impulse_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + impulse_output, compression='gzip')
|
||||||
|
# --------- RAM ----------- #
|
||||||
|
print("Writing to: " + output_dir_prefix + '/' + ram_output)
|
||||||
|
ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class')
|
||||||
|
# Replace dataframe class names
|
||||||
|
ram_df = ram_df.withColumn('class', F.lit('C5'))
|
||||||
|
# ram_df = ram_df.withColumn('class', F.when(F.col('6-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.drop('5-way-class')
|
||||||
|
|
||||||
|
if graph_type == 'openaire':
|
||||||
|
ram_df = ram_df.where( ~F.col('id').like('10.%') )
|
||||||
|
|
||||||
|
# Write output
|
||||||
|
ram_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + ram_output, compression='gzip')
|
||||||
|
|
||||||
|
# The following produces the json file required by openaire
|
||||||
|
elif mode == 'json':
|
||||||
|
|
||||||
|
# Validate the command line for 'json' mode. Nine argv entries are expected:
# script, mode, the five score directories, <num_partitions> and <graph_type>.
if len(sys.argv) < 9:
    print ("\n\nInsufficient input for 'json' mode.")
    print ("File list required: <pagerank> <attrank> <citation count> <3-year citation count> <tar-ram> <num_partitions> <graph_type>\n")
    # Exit with a non-zero status: a usage error must not be reported as a
    # successful run to whatever launched the script.
    sys.exit(1)

# The last two arguments are the number of partitions and the graph type
num_partitions = int(sys.argv[-2])
graph_type = sys.argv[-1]

# Any unrecognised graph type falls back to 'bip'
if graph_type not in ['bip', 'openaire']:
    graph_type = 'bip'

print ("Graph type: " + str(graph_type))
|
||||||
|
|
||||||
|
# Input directories, taken positionally from argv[2] .. argv[6]
pagerank_dir, attrank_dir, cc_dir, impulse_dir, ram_dir = sys.argv[2:7]

print ("Reading files:")
for score_dir in (pagerank_dir, attrank_dir, cc_dir, impulse_dir, ram_dir):
    print (score_dir)

# Score-specific dataframes - every input is a tab-delimited file with a
# header row, repartitioned on 'id' after loading. PageRank, AttRank and RAM
# are read with float_schema; citation count and impulse with int_schema.
tsv_options = {'delimiter': '\t', 'header': True}

pagerank_df = (spark.read.schema(float_schema).options(**tsv_options)
               .csv(pagerank_dir).repartition(num_partitions, 'id'))
attrank_df = (spark.read.schema(float_schema).options(**tsv_options)
              .csv(attrank_dir).repartition(num_partitions, 'id'))
cc_df = (spark.read.schema(int_schema).options(**tsv_options)
         .csv(cc_dir).repartition(num_partitions, 'id'))
impulse_df = (spark.read.schema(int_schema).options(**tsv_options)
              .csv(impulse_dir).repartition(num_partitions, 'id'))
ram_df = (spark.read.schema(float_schema).options(**tsv_options)
          .csv(ram_dir).repartition(num_partitions, 'id'))
|
||||||
|
# --- Join the data of the various scores --- #
|
||||||
|
|
||||||
|
def _score_json_fragment(scores_df, indicator, class_col='3-way-class'):
    """Build the per-paper JSON fragment for one indicator.

    Selects 'id' plus a string column named '<indicator>_json'. The string is
    assembled from two serialized maps, {"id": <indicator>} and
    {"unit": [<score entry>, <class entry>]}, where each entry is a map with
    a 'key' and a 'value' item.

    :param scores_df: dataframe providing 'id', 'score' and `class_col`
    :param indicator: indicator name; used as the "id" value and as the
                      prefix of the produced column name
    :param class_col: name of the column holding the class label
    :return: dataframe with the columns 'id' and '<indicator>_json'
    """
    score_entry = F.map_concat(
        F.create_map(F.lit('key'), F.lit('score')),
        F.create_map(F.lit('value'), F.col('score')))
    class_entry = F.map_concat(
        F.create_map(F.lit('key'), F.lit('class')),
        F.create_map(F.lit('value'), F.col(class_col)))

    key_col = indicator + '_key'
    values_col = indicator + '_values'

    # Serialize both maps to JSON strings
    fragment_df = scores_df.select(
        'id',
        F.to_json(F.create_map(F.lit('id'), F.lit(indicator))).alias(key_col),
        F.to_json(F.create_map(F.lit('unit'), F.array([score_entry, class_entry]))).alias(values_col))

    # Splice the two strings into one object: cut the last character off the
    # key string and the first character off the values string, so that they
    # can be joined with ', ' (same substring expressions as before).
    fragment_df = fragment_df.select(
        'id',
        F.expr('substring({0}, 0, length({0})-1)'.format(key_col)).alias(key_col),
        F.expr('substring({0}, 2, length({0}))'.format(values_col)).alias(values_col))

    return fragment_df.select(
        'id',
        F.concat_ws(', ', F.col(key_col), F.col(values_col)).alias(indicator + '_json'))


# Create json data for pagerank
pagerank_df = _score_json_fragment(pagerank_df, 'influence')

# Create json data for attrank
attrank_df = _score_json_fragment(attrank_df, 'popularity')

# Create json data for CC
cc_df = _score_json_fragment(cc_df, 'influence_alt')

# Create json data for RAM
ram_df = _score_json_fragment(ram_df, 'popularity_alt')

# Create json data for impulse
impulse_df = _score_json_fragment(impulse_df, 'impulse')
|
||||||
|
|
||||||
|
# Join the per-indicator dataframes on the paper id
results_df = pagerank_df
for fragment_df in (attrank_df, cc_df, ram_df, impulse_df):
    results_df = results_df.join(fragment_df, ['id'])

print ("Json encoding DOI keys")

# Fragment columns, in the order they appear in the output array
fragment_columns = ['influence_json', 'popularity_json', 'influence_alt_json', 'popularity_alt_json', 'impulse_json']

# Json encode doi strings
results_df = results_df.select(json_encode_key('id').alias('id'), *fragment_columns)

# Concatenate the individual json columns and enclose them in square brackets
json_array = F.concat_ws(
    '',
    F.lit('['),
    F.concat_ws(', ', *[F.col(column_name) for column_name in fragment_columns]),
    F.lit(']'))
results_df = results_df.select('id', json_array.alias('json_data'))

# Filter out non-openaire ids if needed (ids are json-encoded at this point)
if graph_type == 'openaire':
    results_df = results_df.where( ~F.col('id').like('"10.%') )

# Prepend the paper id and add the opening and closing braces
results_df = results_df.select(
    F.concat_ws('', F.lit('{'), F.col('id'), F.lit(': '), F.col('json_data'), F.lit('}')).alias('json'))
|
||||||
|
|
||||||
|
# -------------------------------------------- #
# Write json output. The target directory is a sibling of the pagerank input:
# the last path component of pagerank_dir is replaced by a name that depends
# on the graph type.
universe_dir = '/bip_universe_doi_scores/' if graph_type == 'bip' else '/openaire_universe_scores/'
output_dir = "/".join(pagerank_dir.split('/')[:-1]) + universe_dir

# Write the dataframe as gzip-compressed text, overwriting previous output
print ("Writing output to: " + output_dir)
results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip')
|
||||||
|
|
||||||
|
# Rename the written part files from .txt.gz to .json.gz, using the Hadoop
# FileSystem API through the JVM gateway of the Spark context.
sc = spark.sparkContext
URI = sc._gateway.jvm.java.net.URI
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem

# Get master prefix from input file path (its first five '/'-separated parts)
master_prefix = "/".join(pagerank_dir.split('/')[:5])
fs = FileSystem.get(URI(master_prefix), sc._jsc.hadoopConfiguration())

path = Path(output_dir)
print ("Path is:" + path.toString())
file_list = fs.listStatus(path)

old_suffix = ".txt.gz"
new_suffix = ".json.gz"

print ("Renaming files:")
for f in file_list:
    initial_filename = f.getPath().toString()
    # Decide on the file name only: a directory component containing "part"
    # or ".txt.gz" must neither select a file nor be rewritten itself.
    base_name = f.getPath().getName()
    if "part" in base_name and base_name.endswith(old_suffix):
        renamed_filename = initial_filename[:-len(old_suffix)] + new_suffix
        print (initial_filename + " => " + renamed_filename)
        fs.rename(Path(initial_filename), Path(renamed_filename))
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
DEPRECATED:
|
||||||
|
# -------------------------------------------- #
|
||||||
|
# Write json output
|
||||||
|
output_dir = "/".join(pagerank_dir.split('/')[:-1])
|
||||||
|
if graph_type == 'bip':
|
||||||
|
output_dir = output_dir + '/bip_universe_doi_scores_txt/'
|
||||||
|
else:
|
||||||
|
output_dir = output_dir + '/openaire_universe_scores_txt/'
|
||||||
|
|
||||||
|
print ("Writing output to: " + output_dir)
|
||||||
|
results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip')
|
||||||
|
print ("Done writing first results")
|
||||||
|
# Read results df as json and write it as json file
|
||||||
|
print ("Reading json input from: " + str(output_dir))
|
||||||
|
resulds_df_json = spark.read.json(output_dir).cache()
|
||||||
|
# Write json to different dir
|
||||||
|
print ("Writing json output to: " + output_dir.replace("_txt", ""))
|
||||||
|
resulds_df_json.write.mode('overwrite').json(output_dir.replace("_txt", ""), compression='gzip')
|
||||||
|
'''
|
||||||
|
|
||||||
|
# The following produces the json file required by openaire, using the 5-way classes
|
||||||
|
elif mode == 'json-5-way':
|
||||||
|
|
||||||
|
# Validate the command line for 'json-5-way' mode. Nine argv entries are
# expected: script, mode, the five score directories, <num_partitions> and
# <graph_type>.
if len(sys.argv) < 9:
    print ("\n\nInsufficient input for 'json-5-way' mode.")
    print ("File list required: <pagerank> <attrank> <citation count> <3-year citation count> <tar-ram> <num_partitions> <graph_type>\n")
    # Exit with a non-zero status: a usage error must not be reported as a
    # successful run to whatever launched the script.
    sys.exit(1)

# The last two arguments are the number of partitions and the graph type
num_partitions = int(sys.argv[-2])
graph_type = sys.argv[-1]

# Any unrecognised graph type falls back to 'bip'
if graph_type not in ['bip', 'openaire']:
    graph_type = 'bip'
|
||||||
|
|
||||||
|
# File directories
|
||||||
|
pagerank_dir = sys.argv[2]
|
||||||
|
attrank_dir = sys.argv[3]
|
||||||
|
cc_dir = sys.argv[4]
|
||||||
|
impulse_dir = sys.argv[5]
|
||||||
|
ram_dir = sys.argv[6]
|
||||||
|
|
||||||
|
# Score-specific dataframe - read inputs
|
||||||
|
pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id')
|
||||||
|
attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id')
|
||||||
|
cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id')
|
||||||
|
impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id')
|
||||||
|
ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id')
|
||||||
|
# --- Join the data of the various scores --- #
|
||||||
|
|
||||||
|
|
||||||
|
# Replace 6-way classes with 5-way values
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.lit('C5'))
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
pagerank_df = pagerank_df.drop('5-way-class').withColumnRenamed('class', '5-way-class')
|
||||||
|
|
||||||
|
|
||||||
|
# Create json data for pagerank
|
||||||
|
pagerank_df = pagerank_df.select('id', F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('score')),
|
||||||
|
F.create_map(F.lit('value'), F.col('score'))).alias('score_map'),
|
||||||
|
F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('class')),
|
||||||
|
F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pagerank_df = pagerank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_values') )
|
||||||
|
pagerank_df = pagerank_df.select('id', F.create_map(F.lit('id'), F.lit('influence')).alias('id_map'), F.col('influence_values'))
|
||||||
|
pagerank_df = pagerank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence'))).alias('influence_key'), F.to_json(F.col('influence_values')).alias('influence_values') )
|
||||||
|
pagerank_df = pagerank_df.select('id', F.expr('substring(influence_key, 0, length(influence_key)-1)').alias('influence_key'), 'influence_values')
|
||||||
|
pagerank_df = pagerank_df.select('id', 'influence_key', F.expr('substring(influence_values, 2, length(influence_values))').alias('influence_values'))
|
||||||
|
pagerank_df = pagerank_df.select('id', F.concat_ws(', ', F.col('influence_key'), F.col('influence_values')).alias('influence_json'))
|
||||||
|
|
||||||
|
# Replace 6-way classes with 5 way classes for attrank
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.lit('C5'))
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
attrank_df = attrank_df.drop('5-way-class').withColumnRenamed('class', '5-way-class')
|
||||||
|
|
||||||
|
# Create json data for attrank
|
||||||
|
attrank_df = attrank_df.select('id', F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('score')),
|
||||||
|
F.create_map(F.lit('value'), F.col('score'))).alias('score_map'),
|
||||||
|
F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('class')),
|
||||||
|
F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map'))
|
||||||
|
|
||||||
|
attrank_df = attrank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_values') )
|
||||||
|
attrank_df = attrank_df.select('id', F.create_map(F.lit('id'), F.lit('popularity')).alias('id_map'), F.col('popularity_values'))
|
||||||
|
attrank_df = attrank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity'))).alias('popularity_key'), F.to_json(F.col('popularity_values')).alias('popularity_values') )
|
||||||
|
attrank_df = attrank_df.select('id', F.expr('substring(popularity_key, 0, length(popularity_key)-1)').alias('popularity_key'), 'popularity_values')
|
||||||
|
attrank_df = attrank_df.select('id', 'popularity_key', F.expr('substring(popularity_values, 2, length(popularity_values))').alias('popularity_values'))
|
||||||
|
attrank_df = attrank_df.select('id', F.concat_ws(', ', F.col('popularity_key'), F.col('popularity_values')).alias('popularity_json'))
|
||||||
|
|
||||||
|
# Replace 6-way classes with 5 way classes for CC
|
||||||
|
cc_df = cc_df.withColumn('class', F.lit('C5'))
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
cc_df = cc_df.drop('5-way-class').withColumnRenamed('class', '5-way-class')
|
||||||
|
|
||||||
|
# Create json data for CC
|
||||||
|
cc_df = cc_df.select('id', F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('score')),
|
||||||
|
F.create_map(F.lit('value'), F.col('score'))).alias('score_map'),
|
||||||
|
F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('class')),
|
||||||
|
F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map'))
|
||||||
|
|
||||||
|
cc_df = cc_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_alt_values') )
|
||||||
|
cc_df = cc_df.select('id', F.create_map(F.lit('id'), F.lit('influence_alt')).alias('id_map'), F.col('influence_alt_values'))
|
||||||
|
cc_df = cc_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence_alt'))).alias('influence_alt_key'), F.to_json(F.col('influence_alt_values')).alias('influence_alt_values') )
|
||||||
|
cc_df = cc_df.select('id', F.expr('substring(influence_alt_key, 0, length(influence_alt_key)-1)').alias('influence_alt_key'), 'influence_alt_values')
|
||||||
|
cc_df = cc_df.select('id', 'influence_alt_key', F.expr('substring(influence_alt_values, 2, length(influence_alt_values))').alias('influence_alt_values'))
|
||||||
|
cc_df = cc_df.select('id', F.concat_ws(', ', F.col('influence_alt_key'), F.col('influence_alt_values')).alias('influence_alt_json'))
|
||||||
|
|
||||||
|
# Replace 6-way classes with 5 way classes for RAM
|
||||||
|
ram_df = ram_df.withColumn('class', F.lit('C5'))
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
ram_df = ram_df.drop('5-way-class').withColumnRenamed('class', '5-way-class')
|
||||||
|
|
||||||
|
# Create json data for RAM
|
||||||
|
ram_df = ram_df.select('id', F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('score')),
|
||||||
|
F.create_map(F.lit('value'), F.col('score'))).alias('score_map'),
|
||||||
|
F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('class')),
|
||||||
|
F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map'))
|
||||||
|
|
||||||
|
ram_df = ram_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_alt_values') )
|
||||||
|
ram_df = ram_df.select('id', F.create_map(F.lit('id'), F.lit('popularity_alt')).alias('id_map'), F.col('popularity_alt_values'))
|
||||||
|
ram_df = ram_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity_alt'))).alias('popularity_alt_key'), F.to_json(F.col('popularity_alt_values')).alias('popularity_alt_values') )
|
||||||
|
ram_df = ram_df.select('id', F.expr('substring(popularity_alt_key, 0, length(popularity_alt_key)-1)').alias('popularity_alt_key'), 'popularity_alt_values')
|
||||||
|
ram_df = ram_df.select('id', 'popularity_alt_key', F.expr('substring(popularity_alt_values, 2, length(popularity_alt_values))').alias('popularity_alt_values'))
|
||||||
|
ram_df = ram_df.select('id', F.concat_ws(', ', F.col('popularity_alt_key'), F.col('popularity_alt_values')).alias('popularity_alt_json'))
|
||||||
|
|
||||||
|
# Replace 6-way classes with 5 way classes for impulse
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.lit('C5'))
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) )
|
||||||
|
impulse_df = impulse_df.drop('5-way-class').withColumnRenamed('class', '5-way-class')
|
||||||
|
|
||||||
|
# Create json data for impulse
|
||||||
|
impulse_df = impulse_df.select('id', F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('score')),
|
||||||
|
F.create_map(F.lit('value'), F.col('score'))).alias('score_map'),
|
||||||
|
F.map_concat(
|
||||||
|
F.create_map(F.lit('key'), F.lit('class')),
|
||||||
|
F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map'))
|
||||||
|
|
||||||
|
impulse_df = impulse_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('impulse_values') )
|
||||||
|
impulse_df = impulse_df.select('id', F.create_map(F.lit('id'), F.lit('impulse')).alias('id_map'), F.col('impulse_values'))
|
||||||
|
impulse_df = impulse_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('impulse'))).alias('impulse_key'), F.to_json(F.col('impulse_values')).alias('impulse_values') )
|
||||||
|
impulse_df = impulse_df.select('id', F.expr('substring(impulse_key, 0, length(impulse_key)-1)').alias('impulse_key'), 'impulse_values')
|
||||||
|
impulse_df = impulse_df.select('id', 'impulse_key', F.expr('substring(impulse_values, 2, length(impulse_values))').alias('impulse_values'))
|
||||||
|
impulse_df = impulse_df.select('id', F.concat_ws(', ', F.col('impulse_key'), F.col('impulse_values')).alias('impulse_json'))
|
||||||
|
|
||||||
|
#Join dataframes together
|
||||||
|
results_df = pagerank_df.join(attrank_df, ['id'])
|
||||||
|
results_df = results_df.join(cc_df, ['id'])
|
||||||
|
results_df = results_df.join(ram_df, ['id'])
|
||||||
|
results_df = results_df.join(impulse_df, ['id'])
|
||||||
|
|
||||||
|
print ("Json encoding DOI keys")
|
||||||
|
# Json encode doi strings
|
||||||
|
results_df = results_df.select(json_encode_key('id').alias('id'), 'influence_json', 'popularity_json', 'influence_alt_json', 'popularity_alt_json', 'impulse_json')
|
||||||
|
|
||||||
|
# Concatenate individual json columns
|
||||||
|
results_df = results_df.select('id', F.concat_ws(', ', F.col('influence_json'), F.col('popularity_json'), F.col('influence_alt_json'), F.col('popularity_alt_json'), F.col('impulse_json') ).alias('json_data'))
|
||||||
|
results_df = results_df.select('id', F.concat_ws('', F.lit('['), F.col('json_data'), F.lit(']')).alias('json_data') )
|
||||||
|
|
||||||
|
# Filter out non-openaire ids if needed
|
||||||
|
if graph_type == 'openaire':
|
||||||
|
results_df = results_df.where( ~F.col('id').like('10.%') )
|
||||||
|
|
||||||
|
# Concatenate paper id and add opening and ending brackets
|
||||||
|
results_df = results_df.select(F.concat_ws('', F.lit('{'), F.col('id'), F.lit(': '), F.col('json_data'), F.lit('}')).alias('json') )
|
||||||
|
|
||||||
|
# TEST output and count
|
||||||
|
# results_df.show(20, False)
|
||||||
|
# print ("Results #" + str(results_df.count()))
|
||||||
|
|
||||||
|
# -------------------------------------------- #
|
||||||
|
# Write json output
|
||||||
|
# -------------------------------------------- #
|
||||||
|
# Write json output - set the directory here
|
||||||
|
output_dir = "/".join(pagerank_dir.split('/')[:-1])
|
||||||
|
if graph_type == 'bip':
|
||||||
|
output_dir = output_dir + '/bip_universe_doi_scores/'
|
||||||
|
else:
|
||||||
|
output_dir = output_dir + '/openaire_universe_scores/'
|
||||||
|
|
||||||
|
# Write the dataframe
|
||||||
|
print ("Writing output to: " + output_dir)
|
||||||
|
results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip')
|
||||||
|
|
||||||
|
# Rename the files to .json.gz now
|
||||||
|
sc = spark.sparkContext
|
||||||
|
URI = sc._gateway.jvm.java.net.URI
|
||||||
|
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
|
||||||
|
FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
|
||||||
|
# Get master prefix from input file path
|
||||||
|
master_prefix = "/".join(pagerank_dir.split('/')[:5])
|
||||||
|
fs = FileSystem.get(URI(master_prefix), sc._jsc.hadoopConfiguration())
|
||||||
|
path = Path(output_dir)
|
||||||
|
print ("Path is:" + path.toString())
|
||||||
|
file_list = fs.listStatus(Path(output_dir))
|
||||||
|
print ("Renaming files:")
|
||||||
|
for f in file_list:
|
||||||
|
initial_filename = f.getPath().toString()
|
||||||
|
if "part" in initial_filename:
|
||||||
|
print (initial_filename + " => " + initial_filename.replace(".txt.gz", ".json.gz"))
|
||||||
|
fs.rename(Path(initial_filename), Path(initial_filename.replace(".txt.gz", ".json.gz")))
|
||||||
|
|
||||||
|
# Close spark session
|
||||||
|
spark.stop()
|
||||||
|
|
||||||
|
print("--- Main program execution time: %s seconds ---" % (time.time() - start_time))
|
||||||
|
print("--- Finished --- \n\n")
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# HDFS folder holding the ranking result files (first positional argument)
ranking_results_folder=$1;

# List the results folder, keep the entries whose path matches "/<path pattern>"
# and print only the part of each matching line selected by <name pattern>.
#   $1: path pattern (matched right after a "/")
#   $2: name pattern (the part of the line that is printed)
find_ranking_file() {
    hdfs dfs -ls ${ranking_results_folder}/ | grep "/$1" | grep -o "$2"
}

pr_file=$(find_ranking_file "PR_.*" "PR.*");
attrank_file=$(find_ranking_file "AttRank.*" "AttRank.*");
cc_file=$(find_ranking_file "CC_.*" "CC.*");
impulse_file=$(find_ranking_file "3-year_.*" "3-year.*");
ram_file=$(find_ranking_file "RAM_.*" "RAM.*");

# Emit the results as name=value lines on stdout
echo "pr_file=${pr_file}";
echo "attrank_file=${attrank_file}";
echo "cc_file=${cc_file}";
echo "impulse_file=${impulse_file}";
echo "ram_file=${ram_file}";
# echo "TEST=`hdfs dfs -ls ${ranking_results_folder}/`";
|
|
@ -0,0 +1,63 @@
|
||||||
|
#!/usr/bin/bash

# Read log files from ranking scripts and create a two-line file
# (a header line plus one line of values, tab separated) with score limits
# for the various measures. To be used by Kleanthis
#
# The log files are looked up in the current working directory. Each limit is
# taken from the second tab-separated field of the log line that starts with
# the corresponding percentage ("0.01%", "0.1%", "1%", "10%").

attrank_file=$(ls *attrank*.log);
pr_file=$(ls *pagerank*.log)
ram_file=$(ls *ram*.log);
cc_file=$(ls *cc*.log);
impulse_file=$(ls *impulse*.log);

echo
echo "-----------------------------"
echo "Attrank file:${attrank_file}";
echo "PageRank file:${pr_file}";
echo "RAM file:${ram_file}";
echo "CC file:${cc_file}";
echo "Impulse file:${impulse_file}";
echo "-----------------------------"
echo
echo

# output file will be called score_limits.csv
echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv
# ---------------------------------------------------- #
# Get respective score limits (we don't need RAM)
# The dots are escaped so that they only match a literal "."
inf_001=$(grep "^0\.01%" "${pr_file}" | cut -f 2);
inf_01=$(grep "^0\.1%" "${pr_file}" | cut -f 2);
inf_1=$(grep "^1%" "${pr_file}" | cut -f 2);
inf_10=$(grep "^10%" "${pr_file}" | cut -f 2);
echo "Influence limits:"
echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}";
# ---------------------------------------------------- #
pop_001=$(grep "^0\.01%" "${attrank_file}" | cut -f 2);
pop_01=$(grep "^0\.1%" "${attrank_file}" | cut -f 2);
pop_1=$(grep "^1%" "${attrank_file}" | cut -f 2);
pop_10=$(grep "^10%" "${attrank_file}" | cut -f 2);
echo "Popularity limits:";
echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}";
# ---------------------------------------------------- #
imp_001=$(grep "^0\.01%" "${impulse_file}" | cut -f 2);
imp_01=$(grep "^0\.1%" "${impulse_file}" | cut -f 2);
imp_1=$(grep "^1%" "${impulse_file}" | cut -f 2);
imp_10=$(grep "^10%" "${impulse_file}" | cut -f 2);
echo "Impulse limits:";
echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}";
# ---------------------------------------------------- #
cc_001=$(grep "^0\.01%" "${cc_file}" | cut -f 2);
cc_01=$(grep "^0\.1%" "${cc_file}" | cut -f 2);
cc_1=$(grep "^1%" "${cc_file}" | cut -f 2);
cc_10=$(grep "^10%" "${cc_file}" | cut -f 2);
echo "Citation count limits:";
echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}";
# ---------------------------------------------------- #

echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv

echo
echo "score_limits.csv contents:"
cat score_limits.csv

echo;
echo;
|
|
@ -0,0 +1,60 @@
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pyspark.sql import SparkSession
|
||||||
|
from pyspark import SparkConf, SparkContext
|
||||||
|
|
||||||
|
# Exactly two positional arguments are required: source folder and output folder
if len(sys.argv) != 3:
    print("Usage: map_openaire_ids_to_dois.py <hdfs_src_dir> <hdfs_output_dir>")
    sys.exit(-1)

# The Spark context and the Spark session share the same application name
app_name = 'BIP!: Map OpenAIRE IDs to DOIs'
conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf = conf)
spark = SparkSession.builder.appName(app_name).getOrCreate()
sc.setLogLevel('OFF')

# Positional arguments
src_dir, output = sys.argv[1], sys.argv[2]
|
||||||
|
|
||||||
|
# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/"
|
||||||
|
# output = '/tmp/openaireid_to_dois/'
|
||||||
|
|
||||||
|
def transform(doc):
    """Turn one parsed result record into a tab-separated mapping line.

    The line has four fields: the record id, the number of DOIs, the DOIs
    joined with the 0x02 character, and the publication year.

    :param doc: dict parsed from one JSON line of the graph dump.
    :return: the tab-separated line as text, or None when the record has no
        pid whose 'qualifier.classid' equals 'doi'.
    """

    # get publication year from 'doc.dateofacceptance.value'
    # (`or {}` also covers a key that is present but explicitly null, which
    # a plain .get(key, {}) default does not)
    dateofacceptance = (doc.get('dateofacceptance') or {}).get('value')

    year = 0

    if dateofacceptance is not None:
        year = dateofacceptance.split('-')[0]

    # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi';
    # null 'pid' / 'qualifier' entries and pids without a 'value' are skipped
    dois = [
        pid['value']
        for pid in (doc.get('pid') or [])
        if (pid.get('qualifier') or {}).get('classid') == 'doi' and pid.get('value') is not None
    ]

    num_dois = len(dois)

    # exclude openaire ids that do not correspond to DOIs
    if num_dois == 0:
        return None

    fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ]

    # Join as text: joining utf-8 encoded values with a str separator raises
    # TypeError on Python 3.
    return '\t'.join(fields)
|
||||||
|
|
||||||
|
def _is_visible(d):
    # Keep only records explicitly flagged as neither deleted by inference nor
    # invisible; `or {}` also covers a 'dataInfo' key that is present but null.
    data_info = d.get('dataInfo') or {}
    return data_info.get('deletedbyinference') == False and data_info.get('invisible') == False

# Make sure the source folder ends with exactly one path separator, so that the
# per-type sub-folder is resolved correctly whether or not the caller added one
base_dir = src_dir if src_dir.endswith('/') else src_dir + '/'

docs = None

for result_type in ["publication", "dataset", "software", "otherresearchproduct"]:

    # one JSON document per line
    tmp = sc.textFile(base_dir + result_type).map(json.loads)

    if (docs is None):
        docs = tmp
    else:
        # append all result types in one RDD
        docs = docs.union(tmp)

docs = docs.filter(_is_visible)

# transform() returns None for records without DOIs; drop those
docs = docs.map(transform).filter(lambda d: d is not None)

docs.saveAsTextFile(output)
|
|
@ -0,0 +1,168 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
|
||||||
|
# and uses this mapping to create doi-based score files in the format required by BiP! DB.
|
||||||
|
# This is done by reading each openaire-id based ranking file and joining the openaire based
|
||||||
|
# score and classes to all the corresponding dois.
|
||||||
|
#################################################################################################
|
||||||
|
# Imports
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Sparksession lib to communicate with cluster via session object
|
||||||
|
from pyspark.sql import SparkSession
|
||||||
|
|
||||||
|
# Import sql types to define schemas
|
||||||
|
from pyspark.sql.types import *
|
||||||
|
|
||||||
|
# Import sql functions with shorthand alias
|
||||||
|
import pyspark.sql.functions as F
|
||||||
|
|
||||||
|
from pyspark.sql.functions import max
|
||||||
|
# from pyspark.sql.functions import udf
|
||||||
|
#################################################################################################
|
||||||
|
#################################################################################################
|
||||||
|
# Clean up directory name - no longer needed in final workflow version
|
||||||
|
'''
|
||||||
|
def clean_directory_name(dir_name):
|
||||||
|
# We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_*
|
||||||
|
# and we need to keep the parts in *
|
||||||
|
|
||||||
|
|
||||||
|
dir_name_parts = dir_name.split('_')
|
||||||
|
dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
|
||||||
|
|
||||||
|
dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
|
||||||
|
clean_name = dir_name + ".txt.gz"
|
||||||
|
|
||||||
|
# clean_name = '_'.join(dir_name_parts)
|
||||||
|
|
||||||
|
# if '_ids' not in clean_name:
|
||||||
|
# clean_name = clean_name.replace('id_', 'ids_')
|
||||||
|
|
||||||
|
# clean_name = clean_name.replace('.txt', '')
|
||||||
|
# clean_name = clean_name.replace('.gz', '')
|
||||||
|
|
||||||
|
# if 'openaire_ids_' in clean_name:
|
||||||
|
# clean_name = clean_name.replace('openaire_ids_', '')
|
||||||
|
# clean_name = clean_name + '.txt.gz'
|
||||||
|
# else:
|
||||||
|
# clean_name = clean_name + '.txt.gz'
|
||||||
|
|
||||||
|
return clean_name
|
||||||
|
'''
|
||||||
|
#################################################################################################
|
||||||
|
if len(sys.argv) < 3:
    print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
    sys.exit(-1)

# Read arguments
synonyms_folder = sys.argv[1]
num_partitions = int(sys.argv[2])
# Each score file argument is turned into the name of its openaire-id based
# file by stripping the "_openaire_id_graph" marker and appending the
# "_openaire_ids.txt.gz" suffix.
# NOTE(review): the second replace can never match, because the first one has
# already removed every "_openaire_id_graph" occurrence - confirm whether a
# trailing "_" was meant to be stripped as well.
input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
# input_file_list = [clean_directory_name(item) for item in input_file_list]

# Prepare output specific variables: same names without the "_openaire_ids"
# marker, always ending in ".txt.gz"
output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]

# --- INFO MESSAGES --- #
print ("\n\n----------------------------")
print ("Mapping openaire ids to DOIs")
print ("Reading input from: " + synonyms_folder)
print ("Num partitions: " + str(num_partitions))
print ("Input files:" + " -- ".join(input_file_list))
print ("Output files: " + " -- ".join(output_file_list))
print ("----------------------------\n\n")
|
||||||
|
#######################################################################################
|
||||||
|
# We will define the following schemas:
|
||||||
|
# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non printable character)
|
||||||
|
# --> a schema for floating point ranking scores [string - float - string] (the latter string is the class)
|
||||||
|
# --> a schema for integer ranking scores [string - int - string] (the latter string is the class)
|
||||||
|
|
||||||
|
def _ranking_schema(score_type):
    # Three-column layout shared by all ranking files: id, score, class.
    # Only the type of the score column differs between measures.
    return StructType([
        StructField('id', StringType(), False),
        StructField('score', score_type, False),
        StructField('class', StringType(), False)
    ])

# Ranking files with floating point scores
float_schema = _ranking_schema(FloatType())

# Ranking files with integer scores
int_schema = _ranking_schema(IntegerType())

# Schema applied when reading the synonyms folder: the openaire id, the number
# of synonyms, and the list of DOIs packed into a single string column
synonyms_schema = StructType([
    StructField('id', StringType(), False),
    StructField('num_synonyms', IntegerType(), False),
    StructField('doi_list', StringType(), False),
])
|
||||||
|
#######################################################################################
|
||||||
|
# Create (or reuse) the spark session and lower its log verbosity
spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
#######################################################################################
# MAIN Program

# Build the (id, doi) synonym table:
#  - read the tab separated synonyms folder,
#  - split the packed doi list on the 0x02 separator character,
#  - explode it into one row per doi,
#  - repartition on id and cache, since it is joined against every score file.
raw_synonyms = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder)
split_synonyms = raw_synonyms.select('id', F.split(F.col('doi_list'), chr(0x02)).alias('doi_list'))
synonym_df = (
    split_synonyms
    .select('id', F.explode('doi_list').alias('doi'))
    .repartition(num_partitions, 'id')
    .cache()
)
|
||||||
|
|
||||||
|
# TESTING
|
||||||
|
# print ("Synonyms: " + str(synonym_df.count()))
|
||||||
|
# print ("DF looks like this:" )
|
||||||
|
# synonym_df.show(1000, False)
|
||||||
|
|
||||||
|
print ("\n\n-----------------------------")
|
||||||
|
# Now we need to join the score files on the openaire-id with the synonyms and then keep
|
||||||
|
# only doi - score - class and write this to the output
|
||||||
|
for offset, input_file in enumerate(input_file_list):

    print ("Mapping scores from " + input_file)

    # Select correct schema.
    # Only the file name is inspected: matching against the whole path would
    # select the float schema for every file as soon as any parent directory
    # contains e.g. "pr" (".../provision/...").
    file_name = input_file.rstrip('/').split('/')[-1].lower()
    schema = int_schema
    if "attrank" in file_name or "pr" in file_name or "ram" in file_name:
        schema = float_schema

    # Load file to dataframe.
    # It is cached because it is used three times below: once to compute the
    # max score and once in each of the two joins.
    scores_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id').cache()

    # Get max score
    max_score = scores_df.select(max('score').alias('max')).collect()[0]['max']
    print ("Max Score for " + str(input_file) + " is " + str(max_score))

    # TESTING
    # print ("Loaded df sample:")
    # scores_df.show(1000, False)

    # Join scores to synonyms and keep required fields.
    # The joined frames are each written exactly once, so they are not cached.
    doi_score_df = synonym_df.join(scores_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi')
    # Write output
    output_file = output_file_list[offset]
    print ("Writing to: " + output_file)
    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')

    # Create another file for the bip update process: it additionally carries
    # the score divided by the max score, and the class column twice
    ranking_df = scores_df.select('id', 'score', (F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup'))
    doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi')
    output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz")
    print ("Writing bip update to: " + output_file)
    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')

    # Release the cached scores of this file before moving on to the next one
    scores_df.unpersist(True)
|
||||||
|
|
||||||
|
print ("-----------------------------")
|
||||||
|
print ("\n\nFinished!\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
import sys
|
||||||
|
from pyspark.sql import SparkSession
|
||||||
|
from pyspark import SparkConf, SparkContext
|
||||||
|
import pyspark.sql.functions as F
|
||||||
|
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
|
||||||
|
|
||||||
|
# Seven positional arguments are required (see usage message)
if len(sys.argv) < 8:
    print("Usage: projects_impact.py <relations_folder> <influence_file> <popularity_file> <cc_file> <impulse_file> <num_partitions> <output_dir>")
    sys.exit(-1)

# Spark context and session share the same application name
appName = 'Project Impact Indicators'
conf = SparkConf().setAppName(appName)
sc = SparkContext(conf = conf)
spark = SparkSession.builder.appName(appName).getOrCreate()
sc.setLogLevel('OFF')

# input parameters, in positional order
relations_fd, influence_fd, popularity_fd, cc_fd, impulse_fd = sys.argv[1:6]
num_partitions = int(sys.argv[6])
output_dir = sys.argv[7]

# schema for impact indicator files: result id, score, class
impact_files_schema = StructType([
    StructField('resultId', StringType(), False),
    StructField('score', IntegerType(), False),
    StructField('class', StringType(), False),
])

# list of impact indicators, as (indicator name, input file, column to read)
impact_indicators = [
    ('influence', influence_fd, 'class'),
    ('popularity', popularity_fd, 'class'),
    ('impulse', impulse_fd, 'score'),
    ('citation_count', cc_fd, 'score')
]
|
||||||
|
|
||||||
|
def read_df(fd, indicator_name, column_name):
    '''
    Read a tab separated, header-less impact indicator file.

    fd: location of the file to read
    indicator_name: name given to the selected column in the result
    column_name: column of the impact file schema to keep

    Returns a dataframe with two columns, 'resultId' and <indicator_name>,
    repartitioned on 'resultId'.
    '''
    reader = spark.read.schema(impact_files_schema).option('delimiter', '\t').option('header', False)
    indicator_df = reader.csv(fd).select('resultId', F.col(column_name).alias(indicator_name))
    return indicator_df.repartition(num_partitions, 'resultId')
|
||||||
|
|
||||||
|
def print_df(df):
    '''Debug helper: show the first 50 rows of df, then its schema, then its row count.'''
    df.show(50)
    df.printSchema()
    row_count = df.count()
    print(row_count)
|
||||||
|
|
||||||
|
def set_class_value_to_null(column, value):
    '''Return an expression that is null where column equals value, and column itself otherwise.'''
    return F.when(column == value, F.lit(None)).otherwise(column)
|
||||||
|
|
||||||
|
# Load the relations and keep only the visible, non-deleted 'produces' ones,
# as (projectId, resultId) pairs
print("Reading relations")

is_valid_relation = (
    (F.col('relClass') == 'produces')
    & (F.col('deletedbyinference') == "false")
    & (F.col('invisible') == "false")
)

relations = (
    spark.read.json(relations_fd)
    .select(F.col('source').alias('projectId'), F.col('target').alias('resultId'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')
    .where(is_valid_relation)
    .drop('deletedbyinference')
    .drop('invisible')
    .drop('relClass')
    .repartition(num_partitions, 'resultId')
)

# Attach one column per impact indicator to the relations
for indicator_name, fd, column_name in impact_indicators:

    print("Reading {} '{}' field from file".format(indicator_name, column_name))
    df = read_df(fd, indicator_name, column_name)

    # class based indicators become a 0/1 flag: 0 when the class is C5, 1 otherwise
    if column_name == 'class':
        is_c5 = F.col(indicator_name).isin("C5")
        df = df.withColumn(indicator_name, F.when(is_c5, 0).otherwise(1))

    # print_df(df)

    print("Joining {} to relations".format(indicator_name))

    # NOTE: we use inner join because we want to keep only the results that have an impact score
    # also note that all impact scores have the same set of results
    joined = relations.join(df, 'resultId', 'inner')
    relations = joined.repartition(num_partitions, 'resultId')

# uncomment to print non-null values count for each indicator
# for indicator_name, fd, column_name in impact_indicators:
#     print("Counting non null values for {}".format(indicator_name))
#     print(relations.filter(F.col(indicator_name).isNotNull()).count())

# sum the impact indicator values for each project and write them as gzipped json
project_totals = relations.groupBy('projectId').agg(
    F.sum('influence').alias('numOfInfluentialResults'),
    F.sum('popularity').alias('numOfPopularResults'),
    F.sum('impulse').alias('totalImpulse'),
    F.sum('citation_count').alias('totalCitationCount')
)
project_totals.write.mode("overwrite").json(output_dir, compression="gzip")
|
|
@ -0,0 +1,602 @@
|
||||||
|
<workflow-app xmlns="uri:oozie:workflow:0.5" name="ranking-wf">
|
||||||
|
|
||||||
|
<!-- Global params -->
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<!-- start using a decision node, so as to determine from which point onwards a job will continue -->
|
||||||
|
<start to="entry-point-decision" />
|
||||||
|
|
||||||
|
<decision name="entry-point-decision">
|
||||||
|
<switch>
|
||||||
|
<!-- The default will be set as the normal start, a.k.a. get-doi-synonyms -->
|
||||||
|
<!-- If any different condition is set, go to the corresponding start -->
|
||||||
|
<case to="non-iterative-rankings">${wf:conf('resume') eq "rankings-start"}</case>
|
||||||
|
<case to="spark-impulse">${wf:conf('resume') eq "impulse"}</case>
|
||||||
|
<case to="spark-pagerank">${wf:conf('resume') eq "pagerank"}</case>
|
||||||
|
<case to="spark-attrank">${wf:conf('resume') eq "attrank"}</case>
|
||||||
|
<!-- <case to="iterative-rankings">${wf:conf('resume') eq "rankings-iterative"}</case> -->
|
||||||
|
<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
|
||||||
|
<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
|
||||||
|
<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
|
||||||
|
<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
|
||||||
|
|
||||||
|
<!-- Aggregation of impact scores on the project level -->
|
||||||
|
<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
|
||||||
|
<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>
|
||||||
|
|
||||||
|
<default to="create-openaire-ranking-graph" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<!-- initial step: create citation network -->
|
||||||
|
<action name="create-openaire-ranking-graph">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>OpenAIRE Ranking Graph Creation</name>
|
||||||
|
<jar>create_openaire_ranking_graph.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkHighDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<!-- The openaire graph data from which to read relations and objects -->
|
||||||
|
<arg>${openaireDataInput}</arg>
|
||||||
|
<!-- Year for filtering entries w/ larger values / empty -->
|
||||||
|
<arg>${currentYear}</arg>
|
||||||
|
<!-- number of partitions to be used on joins -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<!-- The output of the graph should be the openaire input graph for ranking-->
|
||||||
|
<arg>${openaireGraphInputPath}</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="non-iterative-rankings" />
|
||||||
|
<error to="openaire-graph-error" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Citation Count and RAM are calculated in parallel-->
|
||||||
|
<fork name="non-iterative-rankings">
|
||||||
|
<path start="spark-cc"/>
|
||||||
|
<!-- <path start="spark-impulse"/> -->
|
||||||
|
<path start="spark-ram"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<!-- Run Citation Count calculation -->
|
||||||
|
<action name="spark-cc">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Citation Count calculation</name>
|
||||||
|
<jar>CC.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${openaireGraphInputPath}</arg>
|
||||||
|
<!-- number of partitions to be used on joins -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/bip-ranker/CC.py#CC.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="join-non-iterative-rankings" />
|
||||||
|
<error to="cc-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- RAM calculation -->
|
||||||
|
<action name="spark-ram">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>RAM calculation</name>
|
||||||
|
<jar>TAR.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${openaireGraphInputPath}</arg>
|
||||||
|
<arg>${ramGamma}</arg>
|
||||||
|
<arg>${currentYear}</arg>
|
||||||
|
<arg>RAM</arg>
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<arg>${checkpointDir}</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/bip-ranker/TAR.py#TAR.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="join-non-iterative-rankings" />
|
||||||
|
<error to="ram-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Join non-iterative methods -->
|
||||||
|
<join name="join-non-iterative-rankings" to="spark-impulse"/>
|
||||||
|
|
||||||
|
<action name="spark-impulse">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Impulse calculation</name>
|
||||||
|
<jar>CC.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${openaireGraphInputPath}</arg>
|
||||||
|
<!-- number of partitions to be used on joins -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<arg>3</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/bip-ranker/CC.py#CC.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="spark-pagerank" />
|
||||||
|
<error to="impulse-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="spark-pagerank">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Pagerank calculation</name>
|
||||||
|
<jar>PageRank.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${openaireGraphInputPath}</arg>
|
||||||
|
<arg>${pageRankAlpha}</arg>
|
||||||
|
<arg>${convergenceError}</arg>
|
||||||
|
<arg>${checkpointDir}</arg>
|
||||||
|
<!-- number of partitions to be used on joins -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<arg>dfs</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/bip-ranker/PageRank.py#PageRank.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="spark-attrank" />
|
||||||
|
<error to="pagerank-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="spark-attrank">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>AttRank calculation</name>
|
||||||
|
<jar>AttRank.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${openaireGraphInputPath}</arg>
|
||||||
|
<arg>${attrankAlpha}</arg>
|
||||||
|
<arg>${attrankBeta}</arg>
|
||||||
|
<arg>${attrankGamma}</arg>
|
||||||
|
<arg>${attrankRho}</arg>
|
||||||
|
<arg>${currentYear}</arg>
|
||||||
|
<arg>${attrankStartYear}</arg>
|
||||||
|
<arg>${convergenceError}</arg>
|
||||||
|
<arg>${checkpointDir}</arg>
|
||||||
|
<!-- number of partitions to be used on joins -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<arg>dfs</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/bip-ranker/AttRank.py#AttRank.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="get-file-names" />
|
||||||
|
<error to="attrank-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="get-file-names">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||||
|
|
||||||
|
<!-- Exec is needed for shell commands - points to type of shell command -->
|
||||||
|
<exec>/usr/bin/bash</exec>
|
||||||
|
<!-- name of script to run -->
|
||||||
|
<argument>get_ranking_files.sh</argument>
|
||||||
|
<!-- We only pass the directory where we expect to find the rankings -->
|
||||||
|
<argument>${workingDir}</argument>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
|
||||||
|
<!-- Get the output in order to be usable by following actions -->
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
|
||||||
|
<ok to="format-result-files" />
|
||||||
|
<error to="filename-getting-error" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Format the ranking files in parallel: one branch for the BiP! DB and one for openaire (json files) -->
|
||||||
|
<fork name="format-result-files">
|
||||||
|
<path start="format-bip-files"/>
|
||||||
|
<path start="format-json-files"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Format json files -->
|
||||||
|
<!-- Two parts: a) format files b) make the file endings .json.gz -->
|
||||||
|
<action name="format-json-files">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Format Ranking Results JSON</name>
|
||||||
|
<jar>format_ranking_results.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkNormalExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>json-5-way</arg>
|
||||||
|
<!-- Input files must be identified dynamically -->
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
|
||||||
|
<!-- Num partitions -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
|
||||||
|
<arg>openaire</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="join-file-formatting" />
|
||||||
|
<error to="json-formatting-fail" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- This is the second line of parallel workflow execution where we create the BiP! DB files -->
|
||||||
|
<action name="format-bip-files">
|
||||||
|
<!-- This is required as a tag for spark jobs, regardless of programming language -->
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<!-- using configs from an example on openaire -->
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
|
||||||
|
<!-- This is the name of our job -->
|
||||||
|
<name>Format Ranking Results BiP! DB</name>
|
||||||
|
<!-- Script name goes here -->
|
||||||
|
<jar>format_ranking_results.py</jar>
|
||||||
|
<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkNormalExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>zenodo</arg>
|
||||||
|
<!-- Input files must be identified dynamically -->
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
|
||||||
|
<!-- Num partitions -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
|
||||||
|
<arg>openaire</arg>
|
||||||
|
<!-- Script file to ship with the job; presumably this must point to the file's location on HDFS (to be confirmed) -->
|
||||||
|
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="join-file-formatting" />
|
||||||
|
<error to="bip-formatting-fail" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Finish formatting jobs -->
|
||||||
|
<join name="join-file-formatting" to="map-openaire-to-doi"/>
|
||||||
|
|
||||||
|
<!-- maps openaire ids to DOIs -->
|
||||||
|
<action name="map-openaire-to-doi">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<!-- Delete previously created doi synonym folder -->
|
||||||
|
<prepare>
|
||||||
|
<delete path="${synonymFolder}"/>
|
||||||
|
</prepare>
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Openaire-DOI synonym collection</name>
|
||||||
|
<jar>map_openaire_ids_to_dois.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkHighDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${openaireDataInput}/</arg>
|
||||||
|
<!-- folder for the openaire-id to DOI synonyms (deleted in the prepare step above) -->
|
||||||
|
<arg>${synonymFolder}</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="map-scores-to-dois" />
|
||||||
|
<error to="synonym-collection-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- mapping openaire scores to DOIs -->
|
||||||
|
<action name="map-scores-to-dois">
|
||||||
|
<!-- This is required as a tag for spark jobs, regardless of programming language -->
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<!-- using configs from an example on openaire -->
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Mapping Openaire Scores to DOIs</name>
|
||||||
|
<jar>map_scores_to_dois.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkHighDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<arg>${synonymFolder}</arg>
|
||||||
|
<!-- Number of partitions -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
<!-- The remaining inputs are the ranking files produced for the BiP! DB -->
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
|
||||||
|
|
||||||
|
<file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="project-impact-indicators" />
|
||||||
|
<error to="map-scores-fail" />
|
||||||
|
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="project-impact-indicators">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Project Impact Indicators calculation</name>
|
||||||
|
<jar>projects_impact.py</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<!-- Script arguments here -->
|
||||||
|
<!-- graph data folder from which to read relations -->
|
||||||
|
<arg>${openaireDataInput}/relation</arg>
|
||||||
|
|
||||||
|
<!-- input files with impact indicators for results -->
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
|
||||||
|
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
|
||||||
|
|
||||||
|
<!-- number of partitions to be used on joins -->
|
||||||
|
<arg>${sparkShufflePartitions}</arg>
|
||||||
|
|
||||||
|
<arg>${projectImpactIndicatorsOutput}</arg>
|
||||||
|
<file>${wfAppPath}/projects_impact.py#projects_impact.py</file>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="delete-output-path-for-actionset" />
|
||||||
|
<error to="project-impact-indicators-fail" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Re-create folder for actionsets -->
|
||||||
|
<action name="delete-output-path-for-actionset">
|
||||||
|
<fs>
|
||||||
|
<delete path="${actionSetOutputPath}"/>
|
||||||
|
<mkdir path="${actionSetOutputPath}"/>
|
||||||
|
</fs>
|
||||||
|
<ok to="create-actionset"/>
|
||||||
|
<error to="actionset-delete-fail"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="create-actionset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Produces the atomic action with the bip finder scores</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkNormalExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
|
||||||
|
<arg>--resultsInputPath</arg><arg>${bipScorePath}</arg>
|
||||||
|
<arg>--projectsInputPath</arg><arg>${projectImpactIndicatorsOutput}</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${actionSetOutputPath}</arg>
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="end"/>
|
||||||
|
<error to="actionset-creation-fail"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- Definitions of failure messages -->
|
||||||
|
<kill name="openaire-graph-error">
|
||||||
|
<message>Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="cc-fail">
|
||||||
|
<message>CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="ram-fail">
|
||||||
|
<message>RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="impulse-fail">
|
||||||
|
<message>Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="pagerank-fail">
|
||||||
|
<message>PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="attrank-fail">
|
||||||
|
<message>AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="filename-getting-error">
|
||||||
|
<message>Error getting key-value pairs for output files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="json-formatting-fail">
|
||||||
|
<message>Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="bip-formatting-fail">
|
||||||
|
<message>Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="synonym-collection-fail">
|
||||||
|
<message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="map-scores-fail">
|
||||||
|
<message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="actionset-delete-fail">
|
||||||
|
<message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="actionset-creation-fail">
|
||||||
|
<message>ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<kill name="project-impact-indicators-fail">
|
||||||
|
<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<!-- Define ending node -->
|
||||||
|
<end name="end" />
|
||||||
|
|
||||||
|
</workflow-app>
|
|
@ -38,6 +38,7 @@
|
||||||
<module>dhp-usage-raw-data-update</module>
|
<module>dhp-usage-raw-data-update</module>
|
||||||
<module>dhp-broker-events</module>
|
<module>dhp-broker-events</module>
|
||||||
<module>dhp-doiboost</module>
|
<module>dhp-doiboost</module>
|
||||||
|
<module>dhp-impact-indicators</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<pluginRepositories>
|
<pluginRepositories>
|
||||||
|
|
Loading…
Reference in New Issue