forked from D-Net/dnet-hadoop

Add changes from code review

parent be320ba3c1
commit ebfba38ab6
@@ -10,15 +10,15 @@ import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -82,45 +82,32 @@ public class PrepareAffiliationRelations implements Serializable {
     private static <I extends Result> void prepareAffiliationRelations(SparkSession spark, String inputPath,
         String outputPath) {

-        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-
         // load and parse affiliation relations from HDFS
-        JavaRDD<AffiliationRelationDeserializer> affiliationRelationsDeserializeRDD = sc
-            .textFile(inputPath)
-            .map(item -> OBJECT_MAPPER.readValue(item, AffiliationRelationDeserializer.class));
+        Dataset<Row> df = spark
+            .read()
+            .schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:ARRAY<STRING>,`Confidence`:DOUBLE>>")
+            .json(inputPath);

-        // convert affiliation to an internal representation
-        Dataset<AffiliationRelationModel> affiliationRelations = spark
-            .createDataset(
-                affiliationRelationsDeserializeRDD
-                    .flatMap(
-                        entry -> entry
-                            .getMatchings()
-                            .stream()
-                            .flatMap(
-                                matching -> matching
-                                    .getRorId()
-                                    .stream()
-                                    .map(
-                                        rorId -> new AffiliationRelationModel(
-                                            entry.getDoi(),
-                                            rorId,
-                                            matching.getConfidence())))
-                            .collect(Collectors.toList())
-                            .iterator())
-                    .rdd(),
-                Encoders.bean(AffiliationRelationModel.class));
+        // unroll nested arrays
+        df = df
+            .withColumn("matching", functions.explode(new Column("Matchings")))
+            .withColumn("rorid", functions.explode(new Column("matching.RORid")))
+            .select(
+                new Column("DOI").as("doi"),
+                new Column("rorid"),
+                new Column("matching.Confidence").as("confidence"));

         // prepare action sets for affiliation relations
-        affiliationRelations
-            .flatMap((FlatMapFunction<AffiliationRelationModel, Relation>) affRel -> {
+        df
+            .toJavaRDD()
+            .flatMap((FlatMapFunction<Row, Relation>) row -> {

                 // DOI to OpenAIRE id
                 final String paperId = ID_PREFIX
-                    + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", affRel.getDoi()));
+                    + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));

                 // ROR id to OpenAIRE id
-                final String affId = GenerateRorActionSetJob.calculateOpenaireId(affRel.getRorId());
+                final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));

                 Qualifier qualifier = OafMapperUtils
                     .qualifier(
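For context on the new read path: the patch replaces the Jackson-based deserializer with a schema-driven JSON read followed by two explodes, so that each resulting row carries one (doi, rorid, confidence) triple. Below is a minimal standalone sketch of that technique; the local master, application name, and input path are illustrative assumptions, not part of the patch.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;

public class ExplodeAffiliationsSketch {

    public static void main(String[] args) {
        // Local session only for illustration; the real job receives its SparkSession from the workflow.
        SparkSession spark = SparkSession
            .builder()
            .appName("ExplodeAffiliationsSketch")
            .master("local[*]")
            .getOrCreate();

        // Same schema as the patch: a DOI plus a list of matchings,
        // each holding a list of ROR ids and a confidence score.
        Dataset<Row> df = spark
            .read()
            .schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:ARRAY<STRING>,`Confidence`:DOUBLE>>")
            .json("/tmp/affiliations.json"); // placeholder input path

        // Unroll the nested arrays: one row per (doi, rorid, confidence) combination.
        Dataset<Row> flat = df
            .withColumn("matching", functions.explode(new Column("Matchings")))
            .withColumn("rorid", functions.explode(new Column("matching.RORid")))
            .select(
                new Column("DOI").as("doi"),
                new Column("rorid"),
                new Column("matching.Confidence").as("confidence"));

        flat.show(false);
        spark.stop();
    }
}

Driving the read with an explicit schema avoids a schema-inference pass over the input and removes the need for the AffiliationRelationDeserializer/AffiliationRelationModel helper types in this method.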
@@ -137,18 +124,17 @@ public class PrepareAffiliationRelations implements Serializable {
                         true,
                         false,
                         qualifier,
-                        Double.toString(affRel.getConfidence()));
+                        Double.toString(row.getAs("confidence")));

                 // return bi-directional relations
                 return getAffiliationRelationPair(paperId, affId, dataInfo).iterator();

-            }, Encoders.bean(Relation.class))
-            .toJavaRDD()
+            })
             .map(p -> new AtomicAction(Relation.class, p))
             .mapToPair(
                 aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
                     new Text(OBJECT_MAPPER.writeValueAsString(aa))))
-            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);

     }

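The action set written by saveAsHadoopFile is a sequence file of (Text, Text) pairs, now gzip-compressed via GzipCodec. A minimal sketch for reading it back and inspecting a few serialized AtomicActions follows; the output path is a placeholder and the snippet is not part of the patch.

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class InspectActionSetSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession
            .builder()
            .appName("InspectActionSetSketch")
            .master("local[*]")
            .getOrCreate();
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        // The gzip codec is detected from the sequence-file header, so no codec is passed on read.
        JavaPairRDD<Text, Text> actions = sc
            .sequenceFile("/tmp/affiliation-actionset", Text.class, Text.class); // placeholder path

        // Keys hold the payload class name, values the JSON-serialized AtomicAction;
        // converting to String immediately sidesteps Hadoop's Writable object reuse.
        actions
            .map(pair -> pair._1().toString() + " -> " + pair._2().toString())
            .take(10)
            .forEach(System.out::println);

        spark.stop();
    }
}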