[WebCrawl] filtering out all the results published before 2020 that are not associated with IE

This commit is contained in:
Miriam Baglioni 2024-04-22 09:39:29 +02:00
parent aecf3b4f2e
commit 48ed49055e
1 changed file with 2 additions and 1 deletion

View File

@ -78,7 +78,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
spark -> {
createActionSet(spark, inputPath, outputPath + "actionSet");
// createPlainRelations(spark, inputPath, outputPath + "relations");
createPlainRelations(spark, inputPath, outputPath + "relations");
});
}
@ -139,6 +139,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
String outputPath) {
final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
.filter("publication_year <= 2020 or country_code=='IE'")
.drop("publication_year");
dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {