From 964972d29a96599589329cc00970344d3f97736c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 29 Apr 2020 09:25:50 +0200 Subject: [PATCH] added data provision workflow definition WIP --- dhp-workflows/dhp-worfklow-profiles/pom.xml | 15 + .../eu/dnetlib/dhp/wf/profiles/provision.xml | 596 ++++++++++++++++++ dhp-workflows/pom.xml | 3 +- 3 files changed, 613 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-worfklow-profiles/pom.xml create mode 100644 dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml diff --git a/dhp-workflows/dhp-worfklow-profiles/pom.xml b/dhp-workflows/dhp-worfklow-profiles/pom.xml new file mode 100644 index 0000000000..df90014ba7 --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/pom.xml @@ -0,0 +1,15 @@ + + + + dhp + eu.dnetlib.dhp + 1.1.7-SNAPSHOT + + 4.0.0 + + dhp-worfklow-profiles + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml new file mode 100644 index 0000000000..0467e618f6 --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml @@ -0,0 +1,596 @@ + +
+ + + + + +
+ + Data Provision [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + aggregatorGraphPath + /tmp/beta_provision/graph/00_graph_aggregator + + + + + + + Set the target path to store the RAW graph + + rawGraphPath + /tmp/beta_provision/graph/01_graph_raw + + + + + + + Set the target path to store the DEDUPED graph + + dedupGraphPath + /tmp/beta_provision/graph/02_graph_dedup + + + + + + + Set the target path to store the INFERRED graph + + inferredGraphPath + /tmp/beta_provision/graph/03_graph_inferred + + + + + + + Set the target path to store the CONSISTENCY graph + + consistentGraphPath + /tmp/beta_provision/graph/04_graph_consistent + + + + + + + Set the target path to store the ORCID enriched graph + + orcidGraphPath + /tmp/beta_provision/graph/05_graph_orcid + + + + + + + Set the target path to store the BULK TAGGED graph + + bulkTaggingGraphPath + /tmp/beta_provision/graph/06_graph_bulktagging + + + + + + + Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph + + affiliationGraphPath + /tmp/beta_provision/graph/07_graph_affiliation + + + + + + + Set the target path to store the COMMUNITY from SELECTED SOURCES graph + + communityOrganizationGraphPath + /tmp/beta_provision/graph/08_graph_comunity_organization + + + + + + + Set the target path to store the FUNDING from SEMANTIC RELATION graph + + fundingGraphPath + /tmp/beta_provision/graph/09_graph_funding + + + + + + + Set the target path to store the COMMUNITY from SEMANTIC RELATION graph + + communitySemRelGraphPath + /tmp/beta_provision/graph/10_graph_comunity_sem_rel + + + + + + + Set the target path to store the COUNTRY enriched graph + + countryGraphPath + /tmp/beta_provision/graph/11_graph_country + + + + + + + Set the lookup address + + isLookUpUrl + http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl + + + + + + + Set the map of paths for the Bulk Tagging + + bulkTaggingPathMap + {"author" : "$['author'][*]['fullname']", "title" : 
"$['title'][*]['value']", "orcid" : "$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']", "contributor" : "$['contributor'][*]['value']", "description" : "$['description'][*]['value']"} + + + + + + + Set the map of associations organization, community list for the propagation of community to result through organization + + propagationOrganizationCommunityMap + + { + "20|corda__h2020::3fb05a9524c3f790391261347852f638":["mes","euromarine"], + "20|corda__h2020::e8dbe14cca9bf6fce09d468872f813f8":["mes","euromarine"], + "20|snsf________::9b253f265e3bef5cae6d881fdf61aceb":["mes","euromarine"], + "20|rcuk________::e054eea0a47665af8c3656b5785ccf76":["mes","euromarine"], + "20|corda__h2020::edc18d67c9b11fb616ca9f6e1db1b151":["mes","euromarine"], + "20|rcuk________::d5736d9da90521ddcdc7828a05a85e9a":["mes","euromarine"], + "20|corda__h2020::f5d418d3aa1cf817ddefcc3fdc039f27":["mes","euromarine"], + "20|snsf________::8fa091f8f25a846779acb4ea97b50aef":["mes","euromarine"], + "20|corda__h2020::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"], + "20|corda_______::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"], + "20|snsf________::31d0a100e54e3cdb3c6f52d91e638c78":["mes","euromarine"], + "20|corda__h2020::ea379ef91b8cc86f9ac5edc4169292db":["mes","euromarine"], + "20|corda__h2020::f75ee2ee48e5cb0ec8c8d30aaa8fef70":["mes","euromarine"], + "20|rcuk________::e16010089551a1a9182a94604fc0ea59":["mes","euromarine"], + "20|corda__h2020::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"], + "20|corda_______::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"], + "20|grid________::b2cbbf5eadbbf87d534b022bad3191d7":["mes","euromarine"], + "20|snsf________::74730ef1439d7f7636a8be58a6b471b8":["mes","euromarine"], + "20|nsf_________::ad72e19043a5a467e35f9b444d11563e":["mes","euromarine"], + "20|rcuk________::0fc3e92500290902a2d38ec2445e74c3":["mes","euromarine"], + "20|grid________::ad2c29905da0eb3c06b3fa80cacd89ea":["mes","euromarine"], + 
"20|corda__h2020::30b53e4d63d3724f00acb9cbaca40860":["mes","euromarine"], + "20|corda__h2020::f60f84bee14ad93f0db0e49af1d5c317":["mes","euromarine"], + "20|corda__h2020::7bf251ac3765b5e89d82270a1763d09f":["mes","euromarine"], + "20|corda__h2020::65531bd11be9935948c7f2f4db1c1832":["mes","euromarine"], + "20|corda__h2020::e0e98f86bbc76638bbb72a8fe2302946":["mes","euromarine"], + "20|snsf________::3eb43582ac27601459a8d8b3e195724b":["mes","euromarine"], + "20|corda__h2020::af2481dab65d06c8ea0ae02b5517b9b6":["mes","euromarine"], + "20|corda__h2020::c19d05cfde69a50d3ebc89bd0ee49929":["mes","euromarine"], + "20|corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0":["mes","euromarine"], + "20|rcuk________::f33c02afb0dc66c49d0ed97ca5dd5cb0":["beopen"], + "20|grid________::a867f78acdc5041b34acfe4f9a349157":["beopen"], + "20|grid________::7bb116a1a9f95ab812bf9d2dea2be1ff":["beopen"], + "20|corda__h2020::6ab0e0739dbe625b99a2ae45842164ad":["beopen"], + "20|corda__h2020::8ba50792bc5f4d51d79fca47d860c602":["beopen"], + "20|corda_______::8ba50792bc5f4d51d79fca47d860c602":["beopen"], + "20|corda__h2020::e70e9114979e963eef24666657b807c3":["beopen"], + "20|corda_______::e70e9114979e963eef24666657b807c3":["beopen"], + "20|corda_______::15911e01e9744d57205825d77c218737":["beopen"], + "20|opendoar____::056a41e24e2a9a67215e87bbee6a80ab":["beopen"], + "20|opendoar____::7f67f2e6c6fbb0628f8160fcd3d92ae3":["beopen"], + "20|grid________::a8ecfd7c084e561168bcbe6bf0daf3e3":["beopen"], + "20|corda_______::7bbe6cc5d8ec1864739a04b0d020c9e9":["beopen"], + "20|corda_______::3ff558e30c2e434d688539548300b050":["beopen"], + "20|corda__h2020::5ffee5b3b83b33a8cf0e046877bd3a39":["beopen"], + "20|corda__h2020::5187217e2e806a6df3579c46f82401bc":["beopen"], + "20|grid________::5fa7e2709bcd945e26bfa18689adeec1":["beopen"], + "20|corda_______::d8696683c53027438031a96ad27c3c07":["beopen"], + "20|corda__h2020::d8696683c53027438031a96ad27c3c07":["beopen"], + 
"20|rcuk________::23a79ebdfa59790864e4a485881568c1":["beopen"], + "20|corda__h2020::b76cf8fe49590a966953c37e18608af9":["beopen"], + "20|grid________::d2f0204126ee709244a488a4cd3b91c2":["beopen"], + "20|corda__h2020::05aba9d2ed17533d15221e5655ac11e6":["beopen"], + "20|grid________::802401579481dc32062bdee69f5e6a34":["beopen"], + "20|corda__h2020::3f6d9d54cac975a517ba6b252c81582d":["beopen"] + } + + + + + + + + Set the dedup orchestrator name + + dedupConfig + decisiontree-dedup-test + + + + + + + declares the ActionSet ids to promote in the RAW graph + + actionSetIdsRawGraph + scholexplorer-dump,gridac-dump,doiboost-organizations,doiboost,orcidworks-no-doi,iis-wos-entities,iis-entities-software,iis-entities-patent + + + + + + + declares the ActionSet ids to promote in the INFERRED graph + + actionSetIdsIISGraph + iis-researchinitiative,iis-document-citations,iis-document-affiliation,iis-document-classes,iis-document-similarities,iis-referenced-datasets-main,iis-referenced-datasets-preprocessing,iis-referenced-projects-main,iis-referenced-projects-preprocessing,iis-referenceextraction-pdb,document_software_url,iis-extracted-metadata,iis-communities,iis-referenced-patents,iis-covid-19 + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'graphOutputPath' : 'aggregatorGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/raw_all/oozie_app', + 'mongoURL' : 'mongodb://beta.services.openaire.eu', + 'mongoDb' : 'mdstore', + 'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus', + 'postgresUser' : 'dnet', + 'postgresPassword' : '*****', + 'reuseContent' : 'false', + 'contentPath' : '/tmp/beta_provision/aggregator', + 'workingDir' : '/tmp/beta_provision/working_dir/aggregator' + } + + build-report + + + + + + + create the RAW graph + + executeOozieJob + IIS + + { + 'inputActionSetIds' : 'actionSetIdsRawGraph', + 'inputGraphRootPath' : 'aggregatorGraphPath', + 
'outputGraphRootPath' : 'rawGraphPath', + 'isLookupUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/actionmanager/wf/main/oozie_app', + 'activePromoteDatasetActionPayload' : 'true', + 'activePromoteDatasourceActionPayload' : 'true', + 'activePromoteOrganizationActionPayload' : 'true', + 'activePromoteOtherResearchProductActionPayload' : 'true', + 'activePromoteProjectActionPayload' : 'true', + 'activePromotePublicationActionPayload' : 'true', + 'activePromoteRelationActionPayload' : 'true', + 'activePromoteResultActionPayload' : 'true', + 'activePromoteSoftwareActionPayload' : 'true', + 'mergeAndGetStrategy' : 'MERGE_FROM_AND_GET', + 'workingDir' : '/tmp/beta_provision/working_dir/promoteActionsRaw' + } + + build-report + + + + + + + search for duplicates in the raw graph + + executeOozieJob + IIS + + { + 'actionSetId' : 'dedupConfig', + 'graphBasePath' : 'rawGraphPath', + 'dedupGraphPath': 'dedupGraphPath', + 'isLookUpUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/dedup/scan/oozie_app', + 'workingPath' : '/tmp/beta_provision/working_dir/dedup' + } + + build-report + + + + + + + create the INFERRED graph + + executeOozieJob + IIS + + { + 'inputActionSetIds' : 'actionSetIdsIISGraph', + 'inputGraphRootPath' : 'dedupGraphPath', + 'outputGraphRootPath' : 'inferredGraphPath', + 'isLookupUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/actionmanager/wf/main/oozie_app', + 'activePromoteDatasetActionPayload' : 'true', + 'activePromoteDatasourceActionPayload' : 'true', + 'activePromoteOrganizationActionPayload' : 'true', + 'activePromoteOtherResearchProductActionPayload' : 'true', + 'activePromoteProjectActionPayload' : 'true', + 'activePromotePublicationActionPayload' : 'true', + 'activePromoteRelationActionPayload' : 'true', + 'activePromoteResultActionPayload' : 'true', + 'activePromoteSoftwareActionPayload' : 'true', + 'mergeAndGetStrategy' : 'MERGE_FROM_AND_GET', + 'workingDir' : 
'/tmp/beta_provision/working_dir/promoteActionsIIS' + } + + build-report + + + + + + + mark duplicates as deleted and redistribute the relationships + + executeOozieJob + IIS + + { + 'graphBasePath' : 'inferredGraphPath', + 'dedupGraphPath': 'consistentGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/dedup/consistency/oozie_app', + 'workingPath' : '/tmp/beta_provision/working_dir/dedup' + } + + build-report + + + + + + + + propagates ORCID among results linked by allowedsemrels semantic relationships + + executeOozieJob + IIS + + { + 'sourcePath' : 'consistentGraphPath', + 'outputPath': 'orcidGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/orcidtoresultfromsemrel/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/orcid', + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + mark results respecting some rules as belonging to communities + + executeOozieJob + IIS + + { + 'sourcePath' : 'orcidGraphPath', + 'outputPath': 'bulkTaggingGraphPath', + 'isLookUpUrl' : 'isLookUpUrl', + 'pathMap' : 'bulkTaggingPathMap' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/bulktag/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/bulktag' + } + + build-report + + + + + + + + creates relationships between results and organizations when the organizations are associated to institutional repositories + + executeOozieJob + IIS + + { + 'sourcePath' : 'bulkTaggingGraphPath', + 'outputPath': 'affiliationGraphPath', + 'saveGraph' : 'true' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/affiliation/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/affiliation' + } + + build-report + + + + + + + + marks as belonging to communities the result collected from datasources related to the organizations specified in the organizationCommunityMap + + executeOozieJob + IIS + + { + 'sourcePath' : 
'affiliationGraphPath', + 'outputPath': 'communityOrganizationGraphPath', + 'organizationtoresultcommunitymap': 'propagationOrganizationCommunityMap' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/community_organization/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/community_organization', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + creates relations between projects and results linked to other results through allowedsemrels semantic relations linked to projects + + executeOozieJob + IIS + + { + 'sourcePath' : 'communityOrganizationGraphPath', + 'outputPath': 'fundingGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/funding/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/funding', + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + tags as belonging to communities the results in allowedsemrels relation with other results already linked to communities + + executeOozieJob + IIS + + { + 'sourcePath' : 'fundingGraphPath', + 'outputPath': 'communitySemRelGraphPath', + 'isLookupUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/community_semrel/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/community_semrel', + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo' + } + + build-report + + + + + + + + associates to results collected from allowedtypes and those in the whitelist the country of the organization(s) handling the datasource they are collected from + + executeOozieJob + IIS + + { + 'sourcePath' : 'communitySemRelGraphPath', + 'outputPath': 'countryGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/country/oozie_app', + 'workingDir' : '/tmp/beta_provision/working_dir/country', + 'allowedtypes' : 'pubsrepository::institutional', + 'whitelist' : '10|opendoar____::300891a62162b960cf02ce3827bb363c', + 'saveGraph' : 'true' + } + + 
build-report + + + + + + + + wf_20200428_155848_495 + 2020-04-28T16:53:23+00:00 + FAILURE + + + +
\ No newline at end of file diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 767674e1c9..31deeeee0a 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -14,14 +14,15 @@ pom + dhp-worfklow-profiles dhp-aggregation dhp-distcp + dhp-actionmanager dhp-graph-mapper dhp-dedup-openaire dhp-graph-provision dhp-dedup-scholexplorer dhp-graph-provision-scholexplorer - dhp-actionmanager