enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
4 changed files with 246 additions and 1 deletions
Showing only changes of commit e162ba5075 - Show all commits

View File

@ -7,7 +7,7 @@
<DATE_OF_CREATION value="2020-06-15T16:35:09+00:00"/>
</HEADER>
<BODY>
<WORKFLOW_NAME>Data Provision [OCEAN]</WORKFLOW_NAME>
<WORKFLOW_NAME>Graph Construction [OCEAN]</WORKFLOW_NAME>
<WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
<WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
<CONFIGURATION start="manual">

View File

@ -0,0 +1,73 @@
<!--
  Workflow profile "Graph to HiveDB [OCEAN]".

  Flow: the two start nodes (setInputPath, setHiveDbName) each store a user-supplied
  value in the workflow env; waitConfig joins them; graph2hive then submits the Oozie
  application /lib/dnet/oa/graph/hive/oozie_app on the IIS cluster.
-->
<RESOURCE_PROFILE>
    <HEADER>
        <RESOURCE_IDENTIFIER value="5d750977-bec2-47f4-97bb-1b7500e4704e_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2020-06-15T16:35:09+00:00"/>
    </HEADER>
    <BODY>
        <WORKFLOW_NAME>Graph to HiveDB [OCEAN]</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
        <CONFIGURATION start="manual">
            <!-- Start node: env parameter "inputPath"; the value is left empty for the user to fill in. -->
            <NODE isStart="true" name="setInputPath" type="SetEnvParameter">
                <DESCRIPTION>Set the path containing the AGGREGATOR graph</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">inputPath</PARAM>
                    <PARAM managedBy="user" name="parameterValue" required="true" type="string"></PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Start node: env parameter "hiveDbName"; the value is left empty for the user to fill in. -->
            <NODE isStart="true" name="setHiveDbName" type="SetEnvParameter">
                <DESCRIPTION>Set the name of the target Hive DB</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">hiveDbName</PARAM>
                    <PARAM managedBy="user" name="parameterValue" required="true" type="string"></PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Join node: both start nodes arc here before the job is submitted. -->
            <NODE isJoin="true" name="waitConfig">
                <DESCRIPTION>wait configurations</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <!-- Must match the name of the job node below; it previously pointed at a
                         node ("aggregatorGraph") that does not exist in this profile. -->
                    <ARC to="graph2hive"/>
                </ARCS>
            </NODE>
            <!-- Job node: submits the Oozie application named in "params".
                 NOTE(review): "envParams" presumably maps job properties to the env parameters
                 set by the start nodes above; confirm against the workflow engine.
                 The multi-line payloads are kept flush-left on purpose: any leading whitespace
                 would become part of the parameter value. -->
            <NODE name="graph2hive" type="SubmitHadoopJob">
                <DESCRIPTION>Import the graph into the Hive DB</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
                    <PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
                    <PARAM managedBy="system" name="envParams" required="true" type="string">
{
'inputPath' : 'inputPath',
'hiveDbName' : 'hiveDbName'
}
</PARAM>
                    <PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/oa/graph/hive/oozie_app'
}
</PARAM>
                    <PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
                </PARAMETERS>
                <ARCS>
                    <!-- "success" is not defined in this profile; presumably a built-in terminal
                         node of the workflow engine (TODO confirm). -->
                    <ARC to="success"/>
                </ARCS>
            </NODE>
        </CONFIGURATION>
        <!-- NOTE(review): these execution details look carried over from the profile this one
             was created from; confirm they refer to a run of this workflow. -->
        <STATUS>
            <LAST_EXECUTION_ID>wf_20200615_163630_609</LAST_EXECUTION_ID>
            <LAST_EXECUTION_DATE>2020-06-15T17:08:00+00:00</LAST_EXECUTION_DATE>
            <LAST_EXECUTION_STATUS>SUCCESS</LAST_EXECUTION_STATUS>
            <LAST_EXECUTION_ERROR/>
        </STATUS>
    </BODY>
</RESOURCE_PROFILE>

View File

@ -0,0 +1,98 @@
<!--
  Workflow profile "Update Solr [OCEAN]".

  Flow: three start nodes (setInputPath, setCollection, setIsLookUpUrl) each store one
  value in the workflow env; waitConfig joins them; updateSolr then submits the Oozie
  application /lib/dnet/oa/provision/oozie_app on the IIS cluster.
-->
<RESOURCE_PROFILE>
    <HEADER>
        <RESOURCE_IDENTIFIER value="8d36cc94-5b82-413c-923f-e7b3953e41ba_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2020-06-15T16:35:09+00:00"/>
    </HEADER>
    <BODY>
        <WORKFLOW_NAME>Update Solr [OCEAN]</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
        <CONFIGURATION start="manual">
            <!-- Start node: env parameter "inputGraphRootPath"; the value is left empty for the user to fill in. -->
            <NODE isStart="true" name="setInputPath" type="SetEnvParameter">
                <DESCRIPTION>Set the path containing the AGGREGATOR graph</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">inputGraphRootPath</PARAM>
                    <PARAM managedBy="user" name="parameterValue" required="true" type="string"></PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Start node: env parameter "format"; user-selectable, restricted to TMF or DMF, default TMF.
                 NOTE(review): the node is named "setCollection" although it sets "format"; the name is
                 left unchanged here. -->
            <NODE isStart="true" name="setCollection" type="SetEnvParameter">
                <DESCRIPTION>Set the format (TMF or DMF)</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">format</PARAM>
                    <PARAM function="validValues(['TMF', 'DMF'])" managedBy="user" name="parameterValue" required="true" type="string">TMF</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Start node: env parameter "isLookupUrl"; system-managed, fixed value.
                 NOTE(review): the URL targets the beta services host; confirm before using this
                 profile in another environment. -->
            <NODE isStart="true" name="setIsLookUpUrl" type="SetEnvParameter">
                <DESCRIPTION>Set the lookup address</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">isLookupUrl</PARAM>
                    <PARAM managedBy="system" name="parameterValue" required="true" type="string">http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Join node: all start nodes arc here before the job is submitted. -->
            <NODE isJoin="true" name="waitConfig">
                <DESCRIPTION>wait configurations</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="updateSolr"/>
                </ARCS>
            </NODE>
            <!-- Job node: submits the Oozie application named in "params".
                 NOTE(review): "envParams" presumably maps job properties to the env parameters
                 set by the start nodes above; confirm against the workflow engine.
                 NOTE(review): "workingDir" points under /tmp/beta_provision, i.e. it looks
                 specific to the beta environment; confirm.
                 The multi-line payloads are kept flush-left on purpose: any leading whitespace
                 would become part of the parameter value. -->
            <NODE name="updateSolr" type="SubmitHadoopJob">
                <DESCRIPTION>Update the Solr index</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
                    <PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
                    <PARAM managedBy="system" name="envParams" required="true" type="string">
{
'inputGraphRootPath' : 'inputGraphRootPath',
'isLookupUrl' : 'isLookupUrl',
'format' : 'format'
}
</PARAM>
                    <PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/oa/provision/oozie_app',
'maxRelations' : '100',
'relPartitions' : '3000',
'batchSize' : '2000',
'relationFilter' : 'isAuthorInstitutionOf,produces,hasAmongTopNSimilarDocuments',
'otherDsTypeId' : 'scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource',
'resumeFrom' : 'prepare_relations',
'sparkDriverMemoryForJoining' : '3G',
'sparkExecutorMemoryForJoining' : '7G',
'sparkExecutorCoresForJoining' : '4',
'sparkDriverMemoryForIndexing' : '2G',
'sparkExecutorMemoryForIndexing' : '2G',
'sparkExecutorCoresForIndexing' : '64',
'sparkNetworkTimeout' : '600',
'workingDir' : '/tmp/beta_provision/working_dir/update_solr'
}
</PARAM>
                    <PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
                </PARAMETERS>
                <ARCS>
                    <!-- "success" is not defined in this profile; presumably a built-in terminal
                         node of the workflow engine (TODO confirm). -->
                    <ARC to="success"/>
                </ARCS>
            </NODE>
        </CONFIGURATION>
        <!-- NOTE(review): these execution details look carried over from the profile this one
             was created from; confirm they refer to a run of this workflow. -->
        <STATUS>
            <LAST_EXECUTION_ID>wf_20200615_163630_609</LAST_EXECUTION_ID>
            <LAST_EXECUTION_DATE>2020-06-15T17:08:00+00:00</LAST_EXECUTION_DATE>
            <LAST_EXECUTION_STATUS>SUCCESS</LAST_EXECUTION_STATUS>
            <LAST_EXECUTION_ERROR/>
        </STATUS>
    </BODY>
</RESOURCE_PROFILE>

View File

@ -0,0 +1,74 @@
<!--
  Workflow profile "Update Stats [OCEAN]".

  Flow: the two start nodes (setGraphDbName, setStatsDbNameCollection) each store a
  user-supplied value in the workflow env; waitConfig joins them; updateStatsDB then
  submits the Oozie application /lib/dnet/oa/graph/stats/oozie_app on the IIS cluster.
-->
<RESOURCE_PROFILE>
    <HEADER>
        <RESOURCE_IDENTIFIER value="65ca9122-f8fe-4aa6-9fb2-bc1e1ffb2dda_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2020-06-15T16:35:09+00:00"/>
    </HEADER>
    <BODY>
        <WORKFLOW_NAME>Update Stats [OCEAN]</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
        <CONFIGURATION start="manual">
            <!-- Start node: env parameter "openaire_db_name"; the value is left empty for the user to fill in. -->
            <NODE isStart="true" name="setGraphDbName" type="SetEnvParameter">
                <DESCRIPTION>Set the name of the DB containing the graph</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">openaire_db_name</PARAM>
                    <PARAM managedBy="user" name="parameterValue" required="true" type="string"></PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Start node: env parameter "stats_db_name"; the value is left empty for the user to fill in. -->
            <NODE isStart="true" name="setStatsDbNameCollection" type="SetEnvParameter">
                <DESCRIPTION>Set the name of the stats DB</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="parameterName" required="true" type="string">stats_db_name</PARAM>
                    <PARAM managedBy="user" name="parameterValue" required="true" type="string"></PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="waitConfig"/>
                </ARCS>
            </NODE>
            <!-- Join node: both start nodes arc here before the job is submitted. -->
            <NODE isJoin="true" name="waitConfig">
                <DESCRIPTION>wait configurations</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="updateStatsDB"/>
                </ARCS>
            </NODE>
            <!-- Job node: submits the Oozie application named in "params".
                 NOTE(review): "envParams" presumably maps job properties to the env parameters
                 set by the start nodes above; confirm against the workflow engine.
                 NOTE(review): the unit of "hive_timeout" is not stated in this profile; confirm.
                 The multi-line payloads are kept flush-left on purpose: any leading whitespace
                 would become part of the parameter value. -->
            <NODE name="updateStatsDB" type="SubmitHadoopJob">
                <DESCRIPTION>Update the stats DB</DESCRIPTION>
                <PARAMETERS>
                    <PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
                    <PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
                    <PARAM managedBy="system" name="envParams" required="true" type="string">
{
'openaire_db_name' : 'openaire_db_name',
'stats_db_name' : 'stats_db_name'
}
</PARAM>
                    <PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/oa/graph/stats/oozie_app',
'hive_timeout' : '3000'
}
</PARAM>
                    <PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
                </PARAMETERS>
                <ARCS>
                    <!-- "success" is not defined in this profile; presumably a built-in terminal
                         node of the workflow engine (TODO confirm). -->
                    <ARC to="success"/>
                </ARCS>
            </NODE>
        </CONFIGURATION>
        <!-- NOTE(review): these execution details look carried over from the profile this one
             was created from; confirm they refer to a run of this workflow. -->
        <STATUS>
            <LAST_EXECUTION_ID>wf_20200615_163630_609</LAST_EXECUTION_ID>
            <LAST_EXECUTION_DATE>2020-06-15T17:08:00+00:00</LAST_EXECUTION_DATE>
            <LAST_EXECUTION_STATUS>SUCCESS</LAST_EXECUTION_STATUS>
            <LAST_EXECUTION_ERROR/>
        </STATUS>
    </BODY>
</RESOURCE_PROFILE>