Update to include a blackList that filters out the results we know are wrongly associated with IE - refactoring

This commit is contained in:
Miriam Baglioni 2024-05-24 15:23:42 +02:00
parent 12ffde023f
commit 7a44869d87
2 changed files with 33 additions and 32 deletions

View File

@ -95,7 +95,8 @@ public class CreateActionSetFromWebEntries implements Serializable {
final Dataset<Row> blackList = readBlackList(spark, blackListInputPath); final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
dataset.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left") dataset
.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null) .filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
.drop("OpenAlexId") .drop("OpenAlexId")
.flatMap((FlatMapFunction<Row, Relation>) row -> { .flatMap((FlatMapFunction<Row, Relation>) row -> {
@ -114,7 +115,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
.mapToPair( .mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa)))) new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);//, GzipCodec.class); .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
} }