From 75d5ddb999fe9a061b8211f828ffa35e6b8c763d Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 24 May 2024 16:01:19 +0200 Subject: [PATCH] Update to include a blackList that filters out the results we know are wrongly associated with IE - update the workflow definition - add the blackListPath parameter --- .../eu/dnetlib/dhp/actionmanager/webcrawl/job.properties | 1 + .../eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml | 1 + 2 files changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties index f616baea70..d7bd709fca 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties @@ -1,2 +1,3 @@ sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/ outputPath=/tmp/miriam/webcrawlComplete/ +blackListPath=/user/miriam.baglioni/openalex-blackList diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml index 653a7d3842..b9394c7e69 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml @@ -45,6 +45,7 @@ --sourcePath${sourcePath} --outputPath${outputPath} + --blackListPath${blackListPath}