Add a blacklist that filters out the results we know are wrongly associated with IE, and update the workflow definition to pass the new blackListPath parameter
This commit is contained in:
parent
7a44869d87
commit
b864f0adcf
|
@ -1,2 +1,3 @@
|
|||
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
||||
outputPath=/tmp/miriam/webcrawlComplete/
|
||||
blackListPath=/user/miriam.baglioni/openalex-blackList
|
||||
|
|
|
@ -45,6 +45,7 @@
|
|||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue