dnet-hadoop/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/provision/00_prod_graph_for_IIS.xml

437 lines
25 KiB
XML

<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="4801c33c-66ca-4ab6-af64-aa812194ec69_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
<RESOURCE_TYPE value="WorkflowDSResourceType"/>
<RESOURCE_KIND value="WorkflowDSResources"/>
<RESOURCE_URI value=""/>
<DATE_OF_CREATION value="2021-07-29T14:28:39+00:00"/>
</HEADER>
<BODY>
<WORKFLOW_NAME>Graph construction for IIS [PROD NEW]</WORKFLOW_NAME>
<WORKFLOW_TYPE>IIS</WORKFLOW_TYPE>
<WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
<CONFIGURATION start="manual">
<NODE isStart="true" name="setNsPrefixBlacklist" type="SetEnvParameter">
<DESCRIPTION>set blacklist of funder nsPrefixes</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">nsPrefixBlacklist</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">conicytf____,dfgf________,gsrt________,innoviris___,miur________,rif_________,rsf_________,sgov________,sfrs________</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setIdMappingPath" type="SetEnvParameter">
<DESCRIPTION>set the path of the map defining the relations id mappings</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">idMappingPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/data/maps/fct_map.json</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="aggregatorGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the path containing the PROD AGGREGATOR graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">aggregatorGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_inference/graph/00_graph_aggregator</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setRawGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the RAW graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">rawGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_inference/graph/01_graph_raw</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setCleanedFirstGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedFirstGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_inference/graph/02_graph_clean_first</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setDedupGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the DEDUPED graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">dedupGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_inference/graph/03_graph_dedup</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setConsistentGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the CONSISTENCY graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">consistentGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_inference/graph/04_graph_consistent</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setCleanedGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_inference/graph/05_graph_cleaned</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setDedupConfig" type="SetEnvParameter">
<DESCRIPTION>Set the dedup orchestrator name</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">dedupConfig</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">dedup-similarity-result-decisiontree-v2</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="actionSetsRaw" type="SetEnvParameter">
<DESCRIPTION>declares the ActionSet ids to promote in the RAW graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">actionSetIdsRawGraph</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">scholexplorer-dump,doiboost,orcidworks-no-doi,datacite</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setIsLookUpUrl" type="SetEnvParameter">
<DESCRIPTION>Set the IS lookup service address</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">isLookUpUrl</PARAM>
<PARAM managedBy="system" name="parameterValue" required="true" type="string">http://services.openaire.eu:8280/is/services/isLookUp?wsdl</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isJoin="true" name="waitConfig">
<DESCRIPTION>wait configurations</DESCRIPTION>
<PARAMETERS/>
<ARCS>
<ARC to="reuseODFClaims"/>
<ARC to="reuseOAFClaims"/>
<ARC to="reuseODF_hdfs"/>
<ARC to="reuseOAF_hdfs"/>
<ARC to="reuseODF"/>
<ARC to="reuseOAF"/>
<ARC to="reuseDB"/>
<ARC to="reuseDBOpenorgs"/>
<ARC to="patchRelations"/>
<ARC to="contentPath"/>
</ARCS>
</NODE>
<NODE name="reuseODFClaims" type="SetEnvParameter">
<DESCRIPTION>reuse cached ODF claims from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseODFClaims</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseOAFClaims" type="SetEnvParameter">
<DESCRIPTION>reuse cached OAF claims from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseOAFClaims</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseODF_hdfs" type="SetEnvParameter">
<DESCRIPTION>reuse cached ODF records on HDFS from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseODFhdfs</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseOAF_hdfs" type="SetEnvParameter">
<DESCRIPTION>reuse cached OAF records on HDFS from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseOAFhdfs</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseODF" type="SetEnvParameter">
<DESCRIPTION>reuse cached ODF content from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseODF</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseOAF" type="SetEnvParameter">
<DESCRIPTION>reuse cached OAF content from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseOAF</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseDB" type="SetEnvParameter">
<DESCRIPTION>reuse cached DB content from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseDB</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="reuseDBOpenorgs" type="SetEnvParameter">
<DESCRIPTION>reuse cached OpenOrgs content from the PROD aggregation system</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseDBOpenorgs</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="patchRelations" type="SetEnvParameter">
<DESCRIPTION>should apply the relations id patching based on the provided idMapping?</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">shouldPatchRelations</PARAM>
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">false</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE name="contentPath" type="SetEnvParameter">
<DESCRIPTION>set the PROD aggregator content path</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">contentPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_aggregator</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig2"/>
</ARCS>
</NODE>
<NODE isJoin="true" name="waitConfig2">
<DESCRIPTION>wait configurations</DESCRIPTION>
<PARAMETERS/>
<ARCS>
<ARC to="aggregatorGraph"/>
</ARCS>
</NODE>
<NODE name="aggregatorGraph" type="SubmitHadoopJob">
<DESCRIPTION>create the PROD AGGREGATOR graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'graphOutputPath' : 'aggregatorGraphPath',
'isLookupUrl' : 'isLookUpUrl',
'reuseODFClaims' : 'reuseODFClaims',
'reuseOAFClaims' : 'reuseOAFClaims',
'reuseDB' : 'reuseDB',
'reuseDBOpenorgs' : 'reuseDBOpenorgs',
'reuseODF' : 'reuseODF',
'reuseODF_hdfs' : 'reuseODFhdfs',
'reuseOAF' : 'reuseOAF',
'reuseOAF_hdfs' : 'reuseOAFhdfs',
'contentPath' : 'contentPath',
'nsPrefixBlacklist' : 'nsPrefixBlacklist',
'shouldPatchRelations' : 'shouldPatchRelations',
'idMappingPath' : 'idMappingPath'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/PROD/oa/graph/raw_all/oozie_app',
'mongoURL' : '',
'mongoDb' : '',
'mdstoreManagerUrl' : '',
'postgresURL' : '',
'postgresUser' : '',
'postgresPassword' : '',
'postgresOpenOrgsURL' : '',
'postgresOpenOrgsUser' : '',
'postgresOpenOrgsPassword' : '',
'shouldHashId' : 'true',
'importOpenorgs' : 'true',
'workingDir' : '/tmp/prod_inference/working_dir/prod_aggregator'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="promoteActionsRaw"/>
</ARCS>
</NODE>
<NODE name="promoteActionsRaw" type="SubmitHadoopJob">
<DESCRIPTION>create the RAW graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'inputActionSetIds' : 'actionSetIdsRawGraph',
'inputGraphRootPath' : 'aggregatorGraphPath',
'outputGraphRootPath' : 'rawGraphPath',
'isLookupUrl' : 'isLookUpUrl'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/PROD/actionmanager/wf/main/oozie_app',
'sparkExecutorCores' : '3',
'sparkExecutorMemory' : '10G',
'activePromoteDatasetActionPayload' : 'true',
'activePromoteDatasourceActionPayload' : 'true',
'activePromoteOrganizationActionPayload' : 'true',
'activePromoteOtherResearchProductActionPayload' : 'true',
'activePromoteProjectActionPayload' : 'true',
'activePromotePublicationActionPayload' : 'true',
'activePromoteRelationActionPayload' : 'true',
'activePromoteResultActionPayload' : 'true',
'activePromoteSoftwareActionPayload' : 'true',
'mergeAndGetStrategy' : 'MERGE_FROM_AND_GET',
'workingDir' : '/tmp/prod_inference/working_dir/promoteActionsRaw'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="graphCleaningFirst"/>
</ARCS>
</NODE>
<NODE name="graphCleaningFirst" type="SubmitHadoopJob">
<DESCRIPTION>clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'graphInputPath' : 'rawGraphPath',
'graphOutputPath': 'cleanedFirstGraphPath',
'isLookupUrl': 'isLookUpUrl'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/PROD/oa/graph/clean/oozie_app',
'workingDir' : '/tmp/prod_inference/working_dir/clean_first'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="duplicateScan"/>
</ARCS>
</NODE>
<NODE name="duplicateScan" type="SubmitHadoopJob">
<DESCRIPTION>search for duplicates in the raw graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'actionSetId' : 'dedupConfig',
'graphBasePath' : 'cleanedFirstGraphPath',
'dedupGraphPath': 'dedupGraphPath',
'isLookUpUrl' : 'isLookUpUrl'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/PROD/oa/dedup/scan/oozie_app',
'actionSetIdOpenorgs' : 'dedup-similarity-organization-simple',
'workingPath' : '/tmp/prod_inference/working_dir/dedup',
'sparkExecutorCores' : '3',
'sparkExecutorMemory' : '10G'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="dedupConsistency"/>
</ARCS>
</NODE>
<NODE name="dedupConsistency" type="SubmitHadoopJob">
<DESCRIPTION>mark duplicates as deleted and redistribute the relationships</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'graphBasePath' : 'dedupGraphPath',
'graphOutputPath': 'consistentGraphPath'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/PROD/oa/dedup/consistency/oozie_app',
'workingPath' : '/tmp/prod_inference/working_dir/dedup'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="graphCleaning"/>
</ARCS>
</NODE>
<NODE name="graphCleaning" type="SubmitHadoopJob">
<DESCRIPTION>clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'graphInputPath' : 'consistentGraphPath',
'graphOutputPath': 'cleanedGraphPath',
'isLookupUrl': 'isLookUpUrl'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/PROD/oa/graph/clean/oozie_app',
'workingDir' : '/tmp/prod_inference/working_dir/clean'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="success"/>
</ARCS>
</NODE>
</CONFIGURATION>
<STATUS>
<LAST_EXECUTION_ID>wf_20210719_165159_86</LAST_EXECUTION_ID>
<LAST_EXECUTION_DATE>2021-07-19T20:45:09+00:00</LAST_EXECUTION_DATE>
<LAST_EXECUTION_STATUS>SUCCESS</LAST_EXECUTION_STATUS>
<LAST_EXECUTION_ERROR/>
</STATUS>
</BODY>
</RESOURCE_PROFILE>