From 420f43fc2f46b966f00c457b3358a969a7660fa0 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 11:49:13 +0200 Subject: [PATCH] [affRo] added option to run on crossref --- .../actionmanager/affiliations/job.properties | 9 ++++- .../affiliations/oozie_app/workflow.xml | 37 ++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/job.properties index b3648c1ce..97031b9c8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/job.properties @@ -26,10 +26,15 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen # The following is needed as a property of a workflow wfAppPath=${oozieTopWfApplicationPath} +resumeFrom=Crossref -resultFolder=/tmp/affro-results/oalex -inputFolder=/user/zeppelin/affiliations/raw_aff_string/2024-08 +#OpenAlex input/output +#resultFolder=/tmp/affro-results/oalex +#inputFolder=/user/zeppelin/affiliations/raw_aff_string/2024-08 +#Crossref input/output +resultFolder=/tmp/affro-results/crossref +inputFolder=/data/doiboost/crossref/crossref_unpack # #crossrefInputPath=/data/bip-affiliations/crossref-data.json diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/oozie_app/workflow.xml index 8636df1ea..93108513f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/affiliations/oozie_app/workflow.xml @@ -64,8 +64,8 @@ ${wf:conf('resumeFrom') eq 'IIS'} + ${wf:conf('resumeFrom') eq 'Crossref'} - @@ -137,5 +137,40 @@ + + + + yarn-cluster + cluster + Affiliations inference (Affro) + crossref.py + + + --executor-cores=4 + --executor-memory=6G + --driver-memory=15G + --conf spark.executor.memoryOverhead=6G + --conf spark.sql.shuffle.partitions=20000 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3 + --conf spark.executorEnv.PYSPARK_PYTHON=python3 + --py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py + --files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt + + + ${inputFolder} + ${resultFolder} + + ${wfAppPath}/affRo/crossref.py#crossref.py + + + + + + + \ No newline at end of file