[WebCrawl] Filter out all results published before 2020 that are not associated with IE
This commit is contained in:
parent
aecf3b4f2e
commit
48ed49055e
|
@ -78,7 +78,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
spark -> {
|
spark -> {
|
||||||
|
|
||||||
createActionSet(spark, inputPath, outputPath + "actionSet");
|
createActionSet(spark, inputPath, outputPath + "actionSet");
|
||||||
// createPlainRelations(spark, inputPath, outputPath + "relations");
|
createPlainRelations(spark, inputPath, outputPath + "relations");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -139,6 +139,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
String outputPath) {
|
String outputPath) {
|
||||||
|
|
||||||
final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
|
final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
|
||||||
|
.filter("publication_year <= 2020 or country_code=='IE'")
|
||||||
.drop("publication_year");
|
.drop("publication_year");
|
||||||
|
|
||||||
dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||||
|
|
Loading…
Reference in New Issue