forked from D-Net/dnet-hadoop
Update to include a blackList that filters out the results we know are wrongly associated with IE; update the workflow definition to pass the blackList parameter
This commit is contained in:
parent
87c9c61b41
commit
75d5ddb999
|
@ -1,2 +1,3 @@
|
||||||
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
||||||
outputPath=/tmp/miriam/webcrawlComplete/
|
outputPath=/tmp/miriam/webcrawlComplete/
|
||||||
|
blackListPath=/user/miriam.baglioni/openalex-blackList
|
||||||
|
|
|
@ -45,6 +45,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
|
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
Loading…
Reference in New Issue