forked from D-Net/dnet-hadoop
Update to include a blackList that filters out the results we know are wrongly associated with IE (refactoring)
This commit is contained in:
parent
b55fed09f8
commit
87c9c61b41
|
@ -95,7 +95,8 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
|
|
||||||
final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
|
final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
|
||||||
|
|
||||||
dataset.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
|
dataset
|
||||||
|
.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
|
||||||
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
|
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
|
||||||
.drop("OpenAlexId")
|
.drop("OpenAlexId")
|
||||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||||
|
@ -114,7 +115,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);//, GzipCodec.class);
|
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,7 +146,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath){
|
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
|
||||||
|
|
||||||
return spark
|
return spark
|
||||||
.read()
|
.read()
|
||||||
|
|
Loading…
Reference in New Issue