Add a blacklist that filters out the results we know are wrongly associated with IE, and update the workflow definition to pass the new blackListPath parameter

This commit is contained in:
Miriam Baglioni 2024-05-24 16:01:19 +02:00
parent 7a44869d87
commit b864f0adcf
2 changed files with 2 additions and 0 deletions

View File

@ -1,2 +1,3 @@
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
outputPath=/tmp/miriam/webcrawlComplete/
blackListPath=/user/miriam.baglioni/openalex-blackList

View File

@ -45,6 +45,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>