Compare commits

..

362 Commits

Author SHA1 Message Date
Miriam Baglioni ed560dacc0 [bulktag] align type to community api 2024-10-29 15:49:48 +01:00
Miriam Baglioni 07a1f2b31c [TransformativeAgreement] fix to remove the file downloaded from a previous run of the workflow 2024-10-28 12:23:18 +01:00
Claudio Atzori 80d7b842e4 [graph provision] added JSON payload to the SolrInputDocuments 2024-10-12 16:23:27 +02:00
Claudio Atzori dd397d107d Merge pull request 'Revert the changes for IgnoreUndefined management in tree evaluation' (#491) from fix_decision_tree into main
Reviewed-on: #491
2024-10-11 10:33:42 +02:00
Giambattista Bloisi 3152382ae8 Revert the changes for IgnoreUndefined management in tree evaluation 2024-10-09 23:00:13 +02:00
Claudio Atzori a50e04154e Merge pull request 'PidCleaner used pervasively' (#490) from pid_cleaning into main
Reviewed-on: #490
2024-10-08 15:08:13 +02:00
Claudio Atzori c4e8aaca1f PidCleaner used pervasively 2024-10-08 14:58:28 +02:00
Claudio Atzori 1596d70224 [bipAffiliations] fix: added publisherInputPath to the spark action parameter specifications 2024-10-02 10:02:10 +02:00
Claudio Atzori 5d030d1118 [graph provision] fixed serialisation of the usage counts as measures in the XML records 2024-10-02 09:45:30 +02:00
Claudio Atzori 6e0b6a886f code formatting 2024-09-30 15:13:23 +02:00
Claudio Atzori 3854fcc5e0 Merge pull request 'New Model from affRo' (#487) from affRoModelModificationOnmain into main
Reviewed-on: #487
2024-09-30 14:33:31 +02:00
Miriam Baglioni 371154d74f [OpenAireAffiliations] changed the code to handle a mixed model in input, to be able to update some links for as many datasources as possible (so far Crossref and OpenAPC) 2024-09-30 14:29:49 +02:00
Claudio Atzori 4e9f64e01a merged from the osfPreprints_plugin branch 2024-09-30 11:24:17 +02:00
Giambattista Bloisi d175a9745f Fix: invert the "natural" order when ordering by id lexicographically 2024-09-26 17:07:31 +02:00
Michele De Bonis fe70caa33c fixed dedup test classes 2024-09-26 11:28:51 +02:00
Claudio Atzori 81bfe3fe32 WIP merged beta into main 2024-09-26 09:23:44 +02:00
Miriam Baglioni 0765641979 [AffRo] used the collectedfrom openaire for all the relations imported as affRo output 2024-09-25 17:23:49 +02:00
Miriam Baglioni d0eba032cd [AffRo] removing package 2024-09-25 17:15:17 +02:00
Miriam Baglioni 7cd8171268 [AffRo] refactoring 2024-09-25 17:12:51 +02:00
Miriam Baglioni a54d021c37 merge with main 2024-09-25 17:06:27 +02:00
Miriam Baglioni 6eea075324 [AffRo] changed the creation of the action set against the new model of provision of the matchings. Changed the test class and the resources accordingly 2024-09-25 17:04:37 +02:00
Claudio Atzori 2ba67f08d3 [OpenCitations] move the extracted contents under a backup path to avoid needing to re-download it in case of errors 2024-09-25 15:39:22 +02:00
Miriam Baglioni df39360822 [AffRo] changed the creation of the action set against the new model of provision of the matchings 2024-09-25 12:32:53 +02:00
Claudio Atzori c1a309df75 Merge pull request 'retry on UnknownHostException' (#469) from retry_on_UnknownHostException into main
Reviewed-on: #469
2024-09-25 11:35:14 +02:00
Claudio Atzori 5fdc286eb9 Merge pull request 'ticket #9525: Update Crossref Mapping' (#479) from ticket#9525 into main
Reviewed-on: #479
2024-09-25 11:32:20 +02:00
Claudio Atzori e7f6eb82df Merge pull request 'fixed a bug with topic ENRICH/MORE/SUBJECT/ARXIV' (#481) from enrich_more_subject_bug into main
Reviewed-on: #481
2024-09-24 08:56:50 +02:00
Claudio Atzori 9c7711310e Merge pull request '[broker] fixed calculation of events for ENRICH/[MISSING|MORE]/PROJECT' (#483) from fix_missing_project_rels into main
Reviewed-on: #483
2024-09-23 15:26:54 +02:00
Michele Artini 0c66b8589d removed the deletedByInference=true filter 2024-09-23 13:01:45 +02:00
Michele Artini 758d4acd05 fixed a bug with topic ENRICH/MORE/SUBJECT/ARXIV 2024-09-23 09:47:29 +02:00
Sandro La Bruzzo 890190b7ae as described in ticket #9525 (sketched below):
1. Changed the mapping applied to Crossref records: anything that has an "is-review-of" relationship must be mapped as a publication of type "Review".
2. Forced the hostedby of Crossref records with DOI prefix 10.3410 or 10.12703 to the H1 Connect data source.
2024-09-18 17:16:53 +02:00
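
A minimal Java sketch of the two rules above; the class, the method names, and the rule shapes are illustrative assumptions, not the actual dnet-hadoop mapping code:

import java.util.List;

public class CrossrefReviewRulesSketch {

	// DOI prefixes whose records are forced to the H1 Connect datasource
	private static final List<String> H1_CONNECT_PREFIXES = List.of("10.3410", "10.12703");

	// Rule 1: any record carrying an "is-review-of" relation becomes a publication of type "Review"
	static String resolveInstanceType(List<String> relationTypes, String defaultType) {
		return relationTypes.contains("is-review-of") ? "Review" : defaultType;
	}

	// Rule 2: force the hostedby for records whose DOI starts with one of the two prefixes
	static String resolveHostedBy(String doi, String defaultHostedBy) {
		int slash = doi.indexOf('/');
		String prefix = slash > 0 ? doi.substring(0, slash) : doi;
		return H1_CONNECT_PREFIXES.contains(prefix) ? "H1 Connect" : defaultHostedBy;
	}
}
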
Claudio Atzori 24b5dc97c6 Merge pull request 'mergeResultsOfDifferentTypes only when checkDelegatedAuthority is true' (#478) from merge_by_id_fix into main
Reviewed-on: #478
2024-09-17 10:48:46 +02:00
Claudio Atzori c648531ccb run mergeResultsOfDifferentTypes only when checkDelegatedAuthority is true 2024-09-16 16:16:23 +02:00
Giambattista Bloisi 10cad80d4d Merge pull request 'Fixes for RestIterator' (#473) from fix_pagination into main
Reviewed-on: #473
2024-09-05 16:59:16 +02:00
Giambattista Bloisi 37b9bdc10c Fix: next returned a null value at end of stream 2024-09-05 16:52:57 +02:00
Giambattista Bloisi e7150eea7b Fix for paginationStart parameter management 2024-09-05 16:52:57 +02:00
Giambattista Bloisi 23477f3e80 Fixes for pagination strategy looping at end of download 2024-09-05 16:52:57 +02:00
Claudio Atzori ce78752aa3 BIPAffiliations to include also input data from publisher websites 2024-08-07 15:46:44 +02:00
Claudio Atzori 152cb47375 Merge pull request 'AffiliationFromPublisher' (#470) from AffiliationFromPublisher into main
Reviewed-on: #470
2024-08-07 14:49:28 +02:00
Miriam Baglioni f1dc0050c7 [AffiliationFromPublisher] extension of test 2024-08-07 11:27:11 +02:00
Miriam Baglioni 42531afc3e [AffiliationFromPublisher]refactoring after compilation 2024-08-07 11:17:56 +02:00
Miriam Baglioni 907eeadce8 [AffiliationFromPublisher] adding to the creation of the ActionSet also the links obtained from the publishers 2024-08-07 11:08:50 +02:00
Claudio Atzori 6b4fa7b8b9 the metadata collection plugins using the HttpConnector2 class shall now retry instead of failing in case of UnknownHostException 2024-08-05 16:55:07 +02:00
Claudio Atzori b8bc237079 [bip affiliations] considers only DOI based records 2024-08-05 12:14:06 +02:00
Claudio Atzori ed6d71fc70 code formatting 2024-08-05 12:12:29 +02:00
Miriam Baglioni cbe877b73c [WebCrawlAffiliation] remove from the creation of the action set the relations for pmc and pmid. Only DOIs are allowed 2024-08-05 11:44:38 +02:00
Claudio Atzori 5fc413a5df Merge pull request '[main] Rest collector plugin on hadoop supports a new param to pass request headers' (#467) from rest-collector-request-header-map2 into main
Reviewed-on: #467
2024-08-05 09:34:24 +02:00
Claudio Atzori 97c9706469 minors 2024-08-02 15:47:56 +02:00
Claudio Atzori 07e7b9315c code formatting 2024-08-02 14:42:24 +02:00
Alessia 39810c6e7e Rest collector plugin on hadoop supports a new param to pass request headers 2024-08-02 14:41:43 +02:00
Claudio Atzori e0f58afd30 [graph provision] include only FoS L1..L2 in the record serialization 2024-08-02 10:58:57 +02:00
Claudio Atzori 60cf7d86a1 [graph provision] include only FoS L1..L2 in the record serialization 2024-08-02 10:58:47 +02:00
Miriam Baglioni 8f11dfe554 [UnpayWall] added the other ':' in the identifier construction 2024-07-16 18:18:38 +02:00
Claudio Atzori d20a5e020a [graph provision] log the Solr admin application operations for alias deletion and creation 2024-07-15 16:31:04 +02:00
Claudio Atzori 3d1d8e6036 renamed workflow to better reflect its purpose 2024-07-15 15:24:18 +02:00
Claudio Atzori 0b1c58358b Merge pull request '[broker] fixing the mapping of ORCID for the identification of the enrichments' (#458) from broker_orcid into main
Reviewed-on: #458
2024-07-15 11:34:01 +02:00
Claudio Atzori b70a440aca renamed class, updated criteria to consider the ORCIDs used in the matchers 2024-07-12 17:09:01 +02:00
Michele Artini 36c3df1652 tests 2024-07-12 15:29:45 +02:00
Claudio Atzori 2f13683285 [broker] fine tuned the workflow memory settings 2024-07-12 10:27:24 +02:00
Claudio Atzori 5ab409dcab [metadata collection] added -Dcom.sun.security.enableAIAcaIssuers=true as a default for metadata collection 2024-07-12 10:26:32 +02:00
Claudio Atzori b756cfeb85 Merge pull request 'set JAVA_HOME and JAVA_OPTS in metadata collection' (#457) from metadata_collection_java_upgrade into main
Reviewed-on: #457
2024-07-11 15:32:11 +02:00
Claudio Atzori 51d6a541bd [metadata collection] added the possibility to specify the JAVA_HOME and the JAVA_OPTS parameters 2024-07-11 15:24:29 +02:00
Claudio Atzori 07ce92cef2 [OAI-PMH] fixed node name 2024-07-11 11:00:23 +02:00
Miriam Baglioni f043b7b096 [Irish Tender]changed the irish.json file according to comments #26, #29, and #34 for 9635 2024-07-04 12:22:56 +02:00
Claudio Atzori 153b56eeff make entity level pids unique by pidType:pidValue 2024-07-04 09:41:39 +02:00
Claudio Atzori ed97ba4565 Merge pull request '[prod] Openaire Affiliation Inference' (#453) from affRoFromRawStringmain into main
Reviewed-on: #453
2024-07-03 12:32:26 +02:00
Claudio Atzori 7b398a6d0b updated import of organization types from OpenOrgs 2024-07-03 11:11:35 +02:00
Claudio Atzori 13f6506ce5 Change the selection criteria for the pivot record of a group so that the best pid type becomes the first criterion. This will have the effect of slowly converging to records having a DOI 2024-07-03 10:44:01 +02:00
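
For illustration, a comparator in the spirit of that change; the pid ranking and the tie-breaker below are assumptions, not the production dedup code:

import java.util.Comparator;
import java.util.List;

class PivotSelectionSketch {

	static class Rec {
		String pidType; // e.g. "doi", "pmid", "handle"
		String dateOfCollection; // ISO date, used here only as a tie-breaker
	}

	private static final List<String> PID_RANK = List.of("doi", "pmid", "pmc", "handle");

	private static int rank(String pidType) {
		int i = PID_RANK.indexOf(pidType);
		return i < 0 ? PID_RANK.size() : i; // unknown pid types rank last
	}

	// first criterion: best pid type; then the oldest collected record wins
	static final Comparator<Rec> BY_BEST_PID = Comparator
		.comparingInt((Rec r) -> rank(r.pidType))
		.thenComparing(r -> r.dateOfCollection);
}
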
Claudio Atzori 3d9ddaa23a importing organization types from OpenOrgs 2024-07-03 10:15:37 +02:00
Claudio Atzori c06dfdfd86 ignore dates containing 'null's 2024-07-02 15:43:11 +02:00
Claudio Atzori b822b34abe code formatting 2024-07-01 09:22:35 +02:00
Michele De Bonis ea1841fbd2 implementation of countryMatch and addition of workflow parameters 2024-07-01 09:14:32 +02:00
Miriam Baglioni 4dbce39237 [AffiliationInference]Extended the affiliation ingestion from OpenAIRE to include also the links derived from web crawl. Changed the provenance from BIP! to OpenAIRE 2024-06-29 18:51:06 +02:00
Miriam Baglioni 3ee8a7d18a [WebCrawl]moved to Constants web crawl name and id 2024-06-29 18:47:23 +02:00
Claudio Atzori ee7deb3f60 [graph provision] publicFormat workflow parameter defined as optional 2024-06-28 14:52:43 +02:00
Claudio Atzori 157cc8be87 [graph provision] fixed serialization of the instancetypes 2024-06-28 14:21:12 +02:00
Claudio Atzori 023099a921 imported from beta 2024-06-26 11:40:16 +02:00
Claudio Atzori 786c217085 Using the updated Solr JSON payload model classes 2024-06-26 11:11:33 +02:00
Lampros Smyrnaios c858c02111 - Fix not using the "export HADOOP_USER_NAME" statement in "createPDFsAggregated.sh", which caused permission-issues when creating tables with Impala.
- Remove unused "--user" parameter in "impala-shell" calls.
- Code polishing.
2024-06-26 10:11:21 +02:00
Claudio Atzori 8220e27110 Merge pull request 'Align Solr JSON records to the explore portal requirements' (#448) from json_payload into beta_to_master_may2024
Reviewed-on: #448
2024-06-25 09:57:40 +02:00
Claudio Atzori bc993d49c1 Update pom.xml
depend on released schema version
2024-06-25 09:57:06 +02:00
Claudio Atzori 1dc7458de2 added JSON payload to the SolrInputDocument, updated unit tests 2024-06-24 14:48:09 +02:00
Claudio Atzori a7a54aab47 WIP: align Solr JSON records to the explore portal requirements 2024-06-20 15:48:45 +02:00
Miriam Baglioni eaa00a4199 [IrishFunderList] made changes according to 9635 comments 20, 21, 22 and 23 2024-06-20 12:32:57 +02:00
Claudio Atzori fb731b6d46 WIP: align Solr JSON records to the explore portal requirements 2024-06-19 15:38:43 +02:00
Miriam Baglioni b6da35e736 [IrishFunderList] made changes according to 9635 comments 14, 15 and 16 2024-06-19 11:06:58 +02:00
Lampros Smyrnaios 3c9b8de892 Miscellaneous updates to the copying operation to Impala Cluster:
- Fix not breaking out of the VIEWS-infinite-loop when the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" is set to "false".
- Exit the script when no HDFS-active-node was found, independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR".
- Fix view_name-recognition in a log-message, by using the more advanced "Perl-Compatible Regular Expressions" in "grep".
- Add error-handling for "compute stats" errors.
2024-06-18 15:59:34 +02:00
Antonis Lempesis c67ef157d3 filtering out deletedbyinference and invisible results from accessroute 2024-06-18 15:59:00 +02:00
Lampros Smyrnaios c23f3031ed Miscellaneous updates to the copying operation to Impala Cluster:
- Show some counts and the elapsed time for various sub-tasks.
- Code polishing.
2024-06-18 15:58:46 +02:00
Claudio Atzori 8ec151aa3d [graph indexing] comment out setting the JSON payload from the SolrInputDocuments 2024-06-18 15:53:24 +02:00
Claudio Atzori 2636936162 [IE OAI-PMH] fixed oozie wf definition 2024-06-14 11:47:37 +02:00
Miriam Baglioni ef437a8cdf [Provision] temporarily removed JSON payload from indexed records (Shadow cannot support it) 2024-06-13 16:48:03 +02:00
Miriam Baglioni 86088ef26e Merge remote-tracking branch 'origin/beta_to_master_may2024' into beta_to_master_may2024 2024-06-11 17:04:07 +02:00
Miriam Baglioni 143c525343 [WebCrawl]remove relations for pid not doi 2024-06-11 17:03:59 +02:00
Claudio Atzori c371513d43 [graph resolution] use sparkExecutorMemory to define also the memoryOverhead 2024-06-11 14:21:01 +02:00
Claudio Atzori 71927ca818 avoid NPEs 2024-06-11 12:40:50 +02:00
Giambattista Bloisi 46018dc804 Fix UnsupportedOperationException while merging two Results' contexts due to modification of an immutable collection 2024-06-11 10:39:48 +02:00
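
The failure mode behind this fix is a common Java pitfall; a minimal reproduction with illustrative names (not the actual merge code):

import java.util.ArrayList;
import java.util.List;

class ContextMergeSketch {

	// buggy shape: if 'target' was built as an immutable list (e.g. List.of(...)),
	// addAll throws UnsupportedOperationException at merge time
	static List<String> mergeInPlace(List<String> target, List<String> other) {
		target.addAll(other);
		return target;
	}

	// fixed shape: always merge into a fresh mutable copy
	static List<String> mergeCopy(List<String> a, List<String> b) {
		List<String> merged = new ArrayList<>(a);
		merged.addAll(b);
		return merged;
	}
}
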
Miriam Baglioni 3efd5b1308 [SDGActionSet] remove datainfo for the result. It is not needed (qualifier.classid = UPDATE) and useless, since subjects do not go at the level of the instance 2024-06-11 10:35:57 +02:00
Miriam Baglioni 196fa55774 Merge remote-tracking branch 'origin/beta_to_master_may2024' into beta_to_master_may2024 2024-06-11 10:26:24 +02:00
Miriam Baglioni 50805e3fc1 [FoSActionSet] remove datainfo for the result. It is not needed (qualifier.classid = UPDATE) and useless, since subjects do not go at the level of the instance 2024-06-11 10:25:46 +02:00
Claudio Atzori d39a1054b8 [actionset promotion] use sparkExecutorMemory to define also the memoryOverhead 2024-06-10 16:15:07 +02:00
Claudio Atzori 576efc1857 hostedby patching to work with the updated Crossref contents 2024-06-10 15:22:33 +02:00
Claudio Atzori efc1632e16 code formatting 2024-06-06 09:25:26 +02:00
Claudio Atzori 91b49366c6 [graph provision] align serialisation of the usage count measures to the agreed specifications 2024-06-05 16:34:40 +02:00
Claudio Atzori 5e05385d35 minor 2024-06-05 16:31:58 +02:00
Miriam Baglioni c4d9b5b9d2 [downloadsAndViews]update the test file to consider the new serialization for downloads and views 2024-06-05 16:30:15 +02:00
Miriam Baglioni bf9a5e6314 [downloadsAndViews]changed the test file to check the indicators are not there if their value is 0 2024-06-05 16:29:40 +02:00
Miriam Baglioni 9d79ddb3dd [bulkTag] fixed issue that made project disappear in graph_10_enriched 2024-06-05 16:20:40 +02:00
Miriam Baglioni 907aa28c6c [downloadsAndViews] fixed issue 2024-06-05 16:19:29 +02:00
Miriam Baglioni 3955ceaa76 [downloadsAndViews] changed the serialization for downloads and views 2024-06-05 16:18:46 +02:00
Miriam Baglioni 128c143394 [downloadsAndViews] extended test file with measures for downloads and views 2024-06-05 16:17:59 +02:00
Claudio Atzori 5133993ee5 Merge branch 'beta_to_master_may2024' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta_to_master_may2024 2024-06-05 12:17:48 +02:00
Claudio Atzori 5cf259a851 [graph2hive] use sparkExecutorMemory to define also the memoryOverhead 2024-06-05 12:17:16 +02:00
Claudio Atzori e1828fc60e Merge pull request '[PROD] Irish oaipmh exporter' (#444) from irish-oaipmh-exporter into beta_to_master_may2024
Reviewed-on: #444
2024-06-05 10:56:20 +02:00
Claudio Atzori 56920b447d Merge pull request 'Fix for missing collectedfrom after dedup' (#442) from fix_mergedcliquesort into beta_to_master_may2024
Reviewed-on: #442
2024-06-03 15:34:01 +02:00
Giambattista Bloisi 3feab5d92d Fix MergeUtils.mergeGroup: it could get rid of some records and did not consider all PID authorities while sorting records.
ResultTypeComparator is now renamed to MergeEntitiesComparator and can be used as a general comparator for merging groups of records
2024-06-03 15:13:40 +02:00
Claudio Atzori 6be783caec [graph cleaning] use sparkExecutorMemory to define also the memoryOverhead 2024-05-29 14:36:49 +02:00
Claudio Atzori b703f94f09 Merge pull request 'changes in copy script - beta2master' (#439) from antonis.lempesis/dnet-hadoop:beta into beta_to_master_may2024
Reviewed-on: #439
2024-05-29 14:29:26 +02:00
Miriam Baglioni 14f275ffaf [NOAMI] removed Ireland funder id 501100011103. ticket 9635 2024-05-29 11:54:17 +02:00
Claudio Atzori a428e7be7e graph cleaning to implement ugly hardcoded rules, avoid NPEs 2024-05-29 09:26:12 +02:00
Claudio Atzori 8e45c5baa8 graph cleaning to implement ugly hardcoded rules 2024-05-28 15:28:42 +02:00
Claudio Atzori db5e18c784 hostedby patching to work with the updated Crossref contents 2024-05-28 15:28:13 +02:00
Claudio Atzori fb266efbcb [org dedup] avoid NPEs in SparkPrepareNewOrgs 2024-05-26 21:23:30 +02:00
Claudio Atzori d7daf54333 [org dedup] avoid NPEs in SparkPrepareOrgRels 2024-05-26 16:48:11 +02:00
Claudio Atzori f99eaa0376 Merge branch 'beta_to_master_may2024' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta_to_master_may2024 2024-05-26 15:45:41 +02:00
Claudio Atzori 23312fcc1e [org dedup] avoid NPEs in SparkPrepareOrgRels 2024-05-26 15:43:24 +02:00
Miriam Baglioni b864f0adcf Update to include a blackList that filters out the results we know are wrongly associated to IE - update workflow definition - the blacklist parameter 2024-05-24 16:01:19 +02:00
Miriam Baglioni 7a44869d87 Update to include a blackList that filters out the results we know are wrongly associated to IE - refactoring 2024-05-24 15:23:42 +02:00
Miriam Baglioni 12ffde023f Update to include a blackList that filters out the results we know are wrongly associated to IE 2024-05-24 12:28:24 +02:00
Claudio Atzori c3fe59bc78 fixed conflicts merging from beta, code formatting 2024-05-21 14:50:40 +02:00
Claudio Atzori 795e1b2629 Merge pull request '[graph indexing] sets spark memoryOverhead in the join operations to the same value used for the memory executor' (#426) from provision_memoryOverhead into master
Reviewed-on: #426
2024-04-19 16:59:45 +02:00
Claudio Atzori 0c05abe50b [graph indexing] sets spark memoryOverhead in the join operations to the same value used for the memory executor 2024-04-19 16:57:55 +02:00
Claudio Atzori 8fdd0244ad Merge pull request 'Various fixes for the stats DB update workflow, step16-createIndicatorsTables.sql' (#425) from stats_step16_fix into master
Reviewed-on: #425
2024-04-18 11:25:24 +02:00
Claudio Atzori 18fdaaf548 integrating suggestion from #9699 to improve the result_country table construction 2024-04-18 11:23:43 +02:00
Claudio Atzori 43e123c624 added column alias 2024-04-17 16:40:29 +02:00
Claudio Atzori 62a07b7add added missing end of statement /*EOS*/ 2024-04-17 15:13:28 +02:00
Claudio Atzori 96bddcc921 revised query implementation for indi_pub_gold_oa 2024-04-17 15:06:50 +02:00
Miriam Baglioni 0486cea4c4 removed the funder id : 100011062 Asian Spinal Cord Network, wrongly associated to Ireland 2024-04-16 15:36:40 +02:00
Claudio Atzori 013935c593 Merge pull request 'Improvements to copying data from ocean to impala' (#420) from antonis.lempesis/dnet-hadoop:beta into master
Reviewed-on: #420
2024-04-16 14:17:47 +02:00
Claudio Atzori 6132bd028e Merge pull request 'Extend Crossref-funders mapping and datacite hostedbymap' (#417) from CrossrefFundersMap into master
Reviewed-on: #417
2024-04-09 10:30:53 +02:00
Miriam Baglioni 519db1ddef Extended mapping of funders from Crossref (#9169, #9277) and changed the correspondence files for the Irish funders (#9635). Extended the datacite map to include the association between metadata and the EBRAINS datasource (SciLake) 2024-04-09 09:33:09 +02:00
Claudio Atzori 5add51f38c Merge pull request 'fixed the result_country definition and updated the stats DB copy procedure' (#412) from antonis.lempesis/dnet-hadoop:beta into master
Reviewed-on: #412
2024-04-03 12:34:17 +02:00
Claudio Atzori f01390702e Merge pull request 'fixed typo in indicator query' (#410) from antonis.lempesis/dnet-hadoop:beta into master
Reviewed-on: #410
2024-03-27 13:42:07 +01:00
Claudio Atzori 5592ccc37a Merge pull request 'added missing EOS, Generate tables with parquet-files, instead of csv in the contexts.sh script' (#408) from antonis.lempesis/dnet-hadoop:beta into master
Reviewed-on: #408
2024-03-27 12:02:57 +01:00
Claudio Atzori d16c15da8d adjusted pom files 2024-03-26 14:00:44 +01:00
Claudio Atzori 09a6d17059 Merge pull request '[Stats wf] #372, #405 to production' (#406) from antonis.lempesis/dnet-hadoop:beta into master
Reviewed-on: #406
2024-03-26 12:18:26 +01:00
Claudio Atzori d70793847d resolving conflicts on step16-createIndicatorsTables.sql 2024-03-26 12:17:52 +01:00
Michele De Bonis f6601ea7d1 default parameters for openorgs updated 2024-03-25 13:07:04 +01:00
Michele De Bonis cd4c3c934d openorgs wf updated 2024-03-22 15:42:37 +01:00
Michele Artini a99942f7cf filter by base types 2024-03-13 12:12:42 +01:00
Michele Artini 7f7083f53e updated sql query for filtering BASE records 2024-03-13 11:57:26 +01:00
Michele Artini d9b23a76c5 comments 2024-03-12 14:53:34 +01:00
Michele Artini 841ca92246 Merge pull request 'new plugin to collect from a dump of BASE' (#400) from base-collector-plugin into master
Reviewed-on: #400
2024-03-12 12:22:42 +01:00
Michele Artini 3bcfc40293 new plugin to collect from a dump of BASE 2024-03-12 12:17:58 +01:00
Giambattista Bloisi 3067ea390d Use SparkSQL in place of Hive for executing step16-createIndicatorsTables.sql of stats update wf 2024-03-04 11:13:34 +01:00
Miriam Baglioni c94d94035c [BulkTagging] added check to verify if field is present in the pathMap 2024-02-28 09:41:42 +01:00
Michele Artini 4374d7449e mapping of project PIDs 2024-02-22 14:44:35 +01:00
Claudio Atzori 07d009007b Merge pull request 'Fixed problem on missing author in crossref Mapping' (#384) from crossref_missing_author_fix_master into master
Reviewed-on: #384
2024-02-15 15:06:17 +01:00
Claudio Atzori 071d044971 Merge branch 'master' into crossref_missing_author_fix_master 2024-02-15 15:04:19 +01:00
Claudio Atzori b3ddbaed58 fixed import of ORPs stored on HDFS in the internal graph format (e.g. Datacite) 2024-02-15 15:02:48 +01:00
Claudio Atzori 1416f16b35 [graph raw] fixed mapping of the original resource type from the Datacite format 2024-02-09 10:19:53 +01:00
Giambattista Bloisi ba1a0e7b4f Merge pull request 'Set deletedbyinference =true to dedup aliases, created when a dedup in a previous build has been merged in a new dedup' (#392) from fix_dedupaliases_deletedbyinference into master
Reviewed-on: #392
2024-02-08 15:29:29 +01:00
Giambattista Bloisi 079085286c Merge branch 'master' into fix_dedupaliases_deletedbyinference 2024-02-08 15:29:13 +01:00
Giambattista Bloisi 8dd666aedd Dedup aliases, created when a dedup in a previous build has been merged in a new dedup, need to be marked as "deletedbyinference", since they are "merged" in the new dedup 2024-02-08 15:27:57 +01:00
Claudio Atzori f21133229a Merge pull request 'Support for the PromoteAction strategy [master]' (#391) from promote_actions_join_type_master into master
Reviewed-on: #391
2024-02-08 15:12:16 +01:00
Claudio Atzori d86b909db2 [actionsets] fixed join type 2024-02-08 15:10:55 +01:00
Claudio Atzori 08162902ab [actionsets] introduced support for the PromoteAction strategy 2024-02-08 15:10:40 +01:00
Claudio Atzori e8630a6d03 [graph cleaning] rule out datasources without an officialname 2024-02-05 14:59:06 +02:00
Claudio Atzori f28c63d5ef [orcid enrichment] fixed directory cleanup before distcp 2024-02-05 09:44:56 +02:00
Claudio Atzori 1a8b609ed2 code formatting 2024-01-30 11:34:16 +01:00
Miriam Baglioni 4c8706efee [orcid-enrichment] change the value of parameters. 2024-01-29 18:21:36 +01:00
Claudio Atzori 4d0c59669b merged changes from beta 2024-01-26 16:08:54 +01:00
Sandro La Bruzzo 3c8c88bdd3 Fixed problem on missing author in crossref Mapping 2024-01-26 12:29:30 +01:00
Claudio Atzori 106968adaa Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-12-21 12:26:29 +01:00
Claudio Atzori a8a4db96f0 added metaresourcetype to the result hive DB view 2023-12-21 12:26:19 +01:00
Sandro La Bruzzo 37e36baf76 updated workflow for generation of Scholix Datasource's to use mdstore transactions 2023-12-18 16:05:35 +01:00
Sandro La Bruzzo 9d39845d1f uploaded input parameters on CreateBaseline WF 2023-12-18 12:23:12 +01:00
Sandro La Bruzzo 1fbd4325f5 Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2023-12-18 11:47:17 +01:00
Sandro La Bruzzo 1f1a6a5f5f updated the transformation Baseline workflow to include mdstore rollback/commit action 2023-12-18 11:47:00 +01:00
Claudio Atzori c4ec35b6cd Merge pull request 'Master branch updates from beta December 2023' (#369) from beta_to_master_dicember2023 into master
Reviewed-on: #369
2023-12-15 11:18:30 +01:00
Claudio Atzori 1726f49790 code formatting 2023-12-15 10:37:02 +01:00
Claudio Atzori 1763d377ad code formatting 2023-11-23 16:33:24 +01:00
Claudio Atzori a0311e8a90 Merge pull request 'Clear working dir in bipranker workflow' (#360) from 9120_bipranker_clean_working_dir into master
Reviewed-on: #360
2023-11-22 14:10:39 +01:00
Claudio Atzori 8fb05888fd Merge branch 'master' into 9120_bipranker_clean_working_dir 2023-11-22 14:10:30 +01:00
Claudio Atzori 2b626815ff Merge pull request 'Project propagation via communityAPI instead of using IS via IIS' (#362) from projectPropagation into master
Reviewed-on: #362
2023-11-14 16:37:53 +01:00
Miriam Baglioni b177cd5a0a Project propagation via communityAPI instead of using IS via IIS 2023-11-14 16:25:09 +01:00
Serafeim Chatzopoulos 671ba8a5a7 Clear working dir in bipranker workflow 2023-11-07 18:35:05 +02:00
Claudio Atzori 5f1ed61c1f merging from bulkTag branch 2023-11-03 12:51:37 +01:00
Claudio Atzori 8c03c41d5d applying changes from beta 2023-11-03 12:08:39 +01:00
Claudio Atzori 97454e9594 Merge pull request '9117_pubmed_affiliations_prod' (#357) from 9117_pubmed_affiliations_prod into master
Reviewed-on: #357
2023-11-03 11:45:34 +01:00
Serafeim Chatzopoulos 7e34dde774 Renaming input param for crossref input path 2023-11-02 17:47:04 +02:00
Serafeim Chatzopoulos 24c3f92d87 Change the description of the workflow 2023-11-02 17:46:51 +02:00
Serafeim Chatzopoulos 6ce9b600c1 Add actionset creation for pubmed affiliations 2023-11-02 17:46:39 +02:00
Serafeim Chatzopoulos 94089878fd Adjust tests to new WF input params 2023-11-02 17:46:13 +02:00
Miriam Baglioni 0097f4e64b Removed Query community testing. Removed package from common related to the interaction with Zenodo since it was moved to the dump-project 2023-10-26 09:38:09 +02:00
Miriam Baglioni 5c5a195e97 refactoring and fixing issue on property name 2023-10-23 11:26:17 +02:00
Miriam Baglioni 70b78a40c7 removed file from different propagation 2023-10-20 15:50:49 +02:00
Miriam Baglioni f206ff42d6 modified code to use the API. Removed unneeded parameters. Rewrote the code to exploit the parallel stream on the entity types 2023-10-20 15:49:41 +02:00
Miriam Baglioni 34358afe75 modified resource file, workflow and default-config. Added 3g of memory overhead and specified the shuffle partitions in the wf configuration. Removed the multiple instantiations in the wf because of the different implementations of the spark job 2023-10-20 15:48:27 +02:00
Miriam Baglioni 18bfff8af3 adding test classes and modifying test for bulktag 2023-10-20 15:47:03 +02:00
Miriam Baglioni 69dac91659 adding the new code to use the API instead of the Information Service 2023-10-20 15:45:52 +02:00
Miriam Baglioni a9ede1e989 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-10-20 10:14:43 +02:00
Claudio Atzori 242d647146 cleanup & docs 2023-10-12 12:23:44 +02:00
Claudio Atzori af3ffad6c4 [AMF] docs 2023-10-12 10:07:52 +02:00
Claudio Atzori ba5475ed4c Merge pull request 'Fix cleaning of Pmid where parsing of numbers stopped at first not leading 0 (zero) character' (#345) from fix_truncated_pmid into master
Reviewed-on: #345
2023-10-06 14:19:49 +02:00
Giambattista Bloisi 2c235e82ad Fix cleaning of Pmid where parsing of numbers stopped at the first non-leading '0' (zero) character 2023-10-06 12:35:54 +02:00
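
A hedged sketch of such a cleaning step (the regex is an assumption, not the actual cleaning rule): keep the whole digit run and strip only the leading zeros, instead of stopping the scan at the first character that is not a leading zero.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

class PmidCleaningSketch {

	// group(1) captures the full digit run after any leading zeros
	private static final Pattern PMID = Pattern.compile("0*(\\d+)");

	static String clean(String raw) {
		Matcher m = PMID.matcher(raw);
		return m.find() ? m.group(1) : null;
	}
}

// e.g. clean("0012345") -> "12345", clean("pmid:0067890") -> "67890"
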
Claudio Atzori 4ac06c9e37 Merge pull request 'Fix bug in conversion from dedup json model to Spark Dataset of Rows (instanceTypeMatch no longer working)' (#339) from fix_dedupfailsonmatchinginstances into master
Reviewed-on: #339
2023-10-02 11:34:20 +02:00
Claudio Atzori fa692b3629 Merge branch 'master' into fix_dedupfailsonmatchinginstances 2023-10-02 11:28:16 +02:00
Claudio Atzori ef02648399 Merge pull request 'fixed dedup configuration management in the Broker workflow' (#341) from fix_8997 into master
Reviewed-on: #341
2023-10-02 11:03:50 +02:00
Claudio Atzori d13bb534f0 Merge branch 'master' into fix_8997 2023-10-02 11:03:18 +02:00
Giambattista Bloisi 775c3f704a Fix bug in conversion from dedup json model to Spark Dataset of Rows: list of strings contained the json-escaped representation of the value instead of the plain value; this caused instanceTypeMatch failures because of the leading and trailing double quotes 2023-09-27 22:30:47 +02:00
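
An illustrative reproduction of that bug using Jackson (the library choice here is an assumption): toString() on a JSON text node yields the escaped form with quotes, while asText() yields the plain value the comparison expects.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

class EscapedValueSketch {
	public static void main(String[] args) throws Exception {
		JsonNode node = new ObjectMapper().readTree("{\"instancetype\":\"Article\"}");
		String escaped = node.get("instancetype").toString(); // "\"Article\"" (quotes kept)
		String plain = node.get("instancetype").asText();     // Article
		System.out.println(escaped.equals(plain));            // false: equality checks break
	}
}
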
Sandro La Bruzzo 9c3ab11d5b Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop 2023-09-25 15:29:19 +02:00
Sandro La Bruzzo 423ef30676 minor fix on the aggregation of uniprot and pdb 2023-09-25 15:28:58 +02:00
Giambattista Bloisi 7152d47f84 Use asScala to convert java List to Scala Sequence 2023-09-20 16:14:27 +02:00
Claudio Atzori 4853c19b5e code formatting 2023-09-20 15:53:21 +02:00
Giambattista Bloisi 1f226d1dce Fix defect #8997: GenerateEventsJob is generating huge amounts of logs because broker entity similarity calculation consistently failed 2023-09-20 15:42:00 +02:00
Alessia Bardi 6186cdc2cc Use v5 of the UNIBI Gold ISSN list in test 2023-09-19 14:47:01 +02:00
Alessia Bardi d94b9bebf7 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-09-19 13:38:45 +02:00
Alessia Bardi 19abba8fa7 tests for d4science catalog 2023-09-19 13:38:25 +02:00
Claudio Atzori c2f179800c Merge pull request 'Run CC and RAM sequentially in dhp-impact-indicators WF' (#338) from run_cc_and_ram_sequentially into master
Reviewed-on: #338
2023-09-13 08:52:53 +02:00
Serafeim Chatzopoulos 2aed5a74be Run CC and RAM sequentially in dhp-impact-indicators WF 2023-09-12 22:31:50 +03:00
Claudio Atzori 4dc4862011 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-09-12 14:34:34 +02:00
Claudio Atzori dc80ab14d3 [graph dedup] consistency wf should not remove the relations while dispatching the entities 2023-09-12 14:34:28 +02:00
Alessia Bardi 77a2199837 updated test for EOSC community 2023-09-08 11:05:49 +02:00
Claudio Atzori 265180bfd2 added Archive ouverte UNIGE (ETHZ.UNIGENF, opendoar____::1400) to the Datacite hostedBy_map 2023-09-07 11:20:35 +02:00
Claudio Atzori da0e9828f7 resolved conflicts for PR#337 2023-09-06 11:28:46 +02:00
Miriam Baglioni 599828ce35 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-08-09 13:07:13 +02:00
Claudio Atzori 0bc74e2000 code formatting 2023-08-02 11:52:10 +02:00
Claudio Atzori 7180911ded [graph cleaning] fixed regex behaviour for cleaning ROR and GRID identifiers, added tests 2023-08-02 11:44:14 +02:00
Claudio Atzori da1727f93f rule out records with NULL dataInfo, except for Relations 2023-07-31 17:52:56 +02:00
Claudio Atzori ccac6a7f75 rule out records with NULL dataInfo 2023-07-31 12:35:05 +02:00
Claudio Atzori d512df8612 code formatting 2023-07-26 09:14:08 +02:00
Claudio Atzori 59764145bb cherry picked & fixed commit 270df939c4 2023-07-25 17:39:00 +02:00
Miriam Baglioni 9e8e39f78a - 2023-07-19 11:35:58 +02:00
Claudio Atzori 373a5f2c83 Merge pull request 'Master branch updates from beta July 2023' (#317) from master_july23 into master
Reviewed-on: #317
2023-07-18 18:22:04 +02:00
Claudio Atzori 8af129b0c7 merged stats promotion step from antonis/promotion-prod-only 2023-07-13 15:03:28 +02:00
dimitrispie 706092bc19 Update updateProductionViews.sh 2023-07-13 15:48:12 +03:00
dimitrispie aedd279f78 Updates Promotion DBs
- Add a step for promoting the split monitor DBs
2023-07-13 15:35:46 +03:00
Miriam Baglioni 8dcd028eed [UsageCount] fixed typo in attribute name for datasource table 2023-07-01 16:07:22 +02:00
Miriam Baglioni 8621377917 [UsageCount] fixed typo in attribute name for datasource table 2023-06-30 19:02:44 +02:00
Miriam Baglioni ef2dd7a980 resolved conflicts 2023-06-30 18:59:47 +02:00
Claudio Atzori f3a85e224b merged from branch beta the bulk tagging (single step, negative constraints), the cleaning workflow (single step, pid type based cleaning), instance level fulltext 2023-06-28 13:33:57 +02:00
Claudio Atzori 4ef0f2ec26 added dependency commons-validator:commons-validator:1.7 2023-06-28 13:32:01 +02:00
Claudio Atzori 288ec0b7d6 [doiboost] merged workflow from branch beta 2023-06-28 09:15:37 +02:00
Claudio Atzori 5f32edd9bf adopting dhp-schema:3.17.1 2023-06-27 16:57:17 +02:00
Claudio Atzori e10ce92fe5 [stats wf] merged workflows from branch beta 2023-06-27 14:32:48 +02:00
Claudio Atzori b93e1541aa Merge pull request 'update sql query to return distinct pids' (#301) from distinct_pids_from_openorgs into master
Reviewed-on: #301
2023-06-27 12:24:47 +02:00
Claudio Atzori d029bf0b94 Merge branch 'master' into distinct_pids_from_openorgs 2023-06-27 12:24:35 +02:00
Michele Artini 009d7f312f fixed a datasource Id 2023-06-21 16:17:34 +02:00
Miriam Baglioni e4b27182d0 [master] refactoring 2023-06-21 11:15:53 +02:00
Giambattista Bloisi 758e662ab8 Revert "Remove duplicated code and ensure that load and initialization is done through "DedupConfig.load" method"
This reverts commit 485f9d18cb.
2023-06-19 13:08:10 +02:00
Giambattista Bloisi 485f9d18cb Remove duplicated code and ensure that load and initialization is done through "DedupConfig.load" method 2023-06-19 13:00:02 +02:00
Michele Artini a92206dab5 re-added the name of a column (pid) 2023-06-13 11:43:10 +02:00
Miriam Baglioni d9506035e4 [ZenodoApi] gone back to okhttp3 to send the payload. 2023-06-09 12:05:02 +02:00
Alessia Bardi 118e72d7db Updated officialname of pangaea in hostedbymap for Datacite to avoid duplicate entries in the source filter of the portal 2023-06-06 14:39:12 +02:00
Alessia Bardi 5befd93d7d test records for Solr indexing 2023-06-06 14:34:33 +02:00
Michele Artini cae92cf811 update sql query to return distinct pids 2023-06-06 14:06:06 +02:00
Miriam Baglioni b64a5eb4a5 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-05-24 15:21:58 +02:00
Claudio Atzori 654ffcba60 Merge pull request '[UsageCount] addition of usagecount for Projects and datasources' (#296) from master_datasource_project_usagecounts into master
Reviewed-on: #296
2023-05-22 16:13:24 +02:00
Claudio Atzori db625e548d [UsageCount] addition of usagecount for Projects and datasources 2023-05-22 15:00:46 +02:00
Alessia Bardi 04141fe259 tests for records from D4Science catalogues 2023-05-19 14:28:24 +02:00
Alessia Bardi b88f009d9f combined level 4 and 6 for the demo 2023-04-24 12:10:33 +02:00
Alessia Bardi 5ffe82ffd8 aligned to current DMF index layout on production 2023-04-24 12:09:55 +02:00
Alessia Bardi 1c173642f0 removed level5 from test records 2023-04-24 09:32:32 +02:00
Alessia Bardi 382f46a8e4 tests to generate the XML records for the index for the EDITH demo on digital twins, integrating output from the FoS classifier 2023-04-21 16:46:30 +02:00
Miriam Baglioni 9fc8ebe98b refactoring 2023-04-19 09:32:13 +02:00
Miriam Baglioni 24c41806ac [ZenodoApiClientTest] change test to mirror change in the implementation 2023-04-18 09:08:09 +02:00
Miriam Baglioni 087b5a7973 [ZenodoAPIClient] new version of the API to connect to Zenodo (changed the http client) 2023-04-17 18:59:22 +02:00
Claudio Atzori 688e3b7936 added eoscifguidelines in the result view; removed compute statistics statements 2023-04-11 11:45:56 +02:00
Claudio Atzori 2e465915b4 [graph to Solr] using dedicated sparkExecutorCores, sparkExecutorMemory, sparkDriverMemory in convert_to_xml 2023-04-11 10:43:44 +02:00
Claudio Atzori 4a4ca634f0 Merge pull request 'advConstraintsInBeta' (#288) from advConstraintsInBeta into master
Reviewed-on: #288
2023-04-06 15:24:23 +02:00
Miriam Baglioni c6a7602b3e refactoring after compilation 2023-04-06 14:45:01 +02:00
Miriam Baglioni 831055a1fc change of the property for test purposes, addition of two new verbs, and fix of issue for advanced constraints 2023-04-06 14:41:32 +02:00
Miriam Baglioni cf3d0f4f83 fixed issue on bulktagging for the advanced constraints 2023-04-06 12:17:35 +02:00
Claudio Atzori 4f67225fbc Merge pull request 'doiboostMappingExtention' (#286) from doiboostMappingExtention into master
Reviewed-on: #286
2023-04-06 09:25:08 +02:00
Claudio Atzori e093f04874 Merge pull request 'AdvancedConstraint' (#285) from advConstraintsInBeta into master
Reviewed-on: #285
2023-04-06 09:24:54 +02:00
Miriam Baglioni c5a9f39141 Extended the association project - result in the mapping from CrossRef 2023-04-05 16:48:36 +02:00
Miriam Baglioni ecc05fe0f3 Added the code for the advancedConstraint implementation during the bulkTagging 2023-04-05 16:40:29 +02:00
Claudio Atzori 42442ccd39 Merge pull request 'updated the order of the compatibilities' (#275) from compatibility_order into master
Reviewed-on: #275
2023-04-05 12:44:14 +02:00
Miriam Baglioni 9a9cc6a1dd changed the way the tar archive is built to support renaming in case we need to change .tt.gz into .json.gz 2023-04-04 11:40:58 +02:00
Michele Artini 200098b683 updated the order of the compatibilities 2023-02-22 11:52:59 +01:00
Michele Artini 9c1df15071 null values in date range conditions 2023-02-13 16:05:58 +01:00
Miriam Baglioni 32870339f5 refactoring after compile 2023-02-13 13:06:48 +01:00
Miriam Baglioni 7184cc0804 [FoS] added check for null on level1 subject 2023-02-13 13:03:49 +01:00
Miriam Baglioni 7473093c84 [FoS] changed the default separator from comma to tab to solve the issue in subject value split 2023-02-10 15:34:52 +01:00
Miriam Baglioni 5f0906be60 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-02-02 17:13:14 +01:00
Claudio Atzori 1b37516578 [bulk tagging] better node naming 2023-01-20 16:11:26 +01:00
Claudio Atzori c1e2460293 [cleaning] the datasource master-duplicate fixup should not be brought to production yet 2023-01-20 09:20:26 +01:00
Claudio Atzori 3800361033 [country propagation] fixes error 'cannot resolve countrySet given input columns: []' when there is no prepared information driving the propagation process for a given result type 2023-01-19 15:57:43 +01:00
Michele Artini 699736addc NPE prevention 2023-01-11 13:14:44 +01:00
Claudio Atzori f86e19b282 code formatting 2023-01-11 09:53:19 +01:00
Michele Artini d40e20f437 Considering instance pids and alternative identifiers 2023-01-11 09:37:34 +01:00
Michele Artini 4953ae5649 fixed an invalid char 2023-01-11 08:35:53 +01:00
Miriam Baglioni c60d3a2b46 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2023-01-09 17:28:27 +01:00
Claudio Atzori 7becdaf31d Merge pull request 'Workaround to use new version of intellij on Master' (#266) from master_intellij into master
Reviewed-on: #266
2022-12-23 10:32:21 +01:00
Miriam Baglioni b713132db7 [Cleaning] adding missing classes 2022-12-21 12:49:08 +01:00
Miriam Baglioni 11f2b470d3 [Cleaning] adding missing classes 2022-12-21 12:42:19 +01:00
Sandro La Bruzzo 91c70b15a5 updated the lines function to its implementation linesWithSeparators.map(l => l.stripLineEnd); in this way we force the scala plugin compiler to consider this pipeline scala code and not a java String.lines() pipeline 2022-12-21 11:14:42 +01:00
Claudio Atzori f910b7379d [cleaning] recovering missing resources from #265 2022-12-21 09:26:34 +01:00
Claudio Atzori 33bdad104e [cleaning] align parameter names 2022-12-20 21:43:59 +01:00
Claudio Atzori 5816ded93f code formatting 2022-12-20 10:41:40 +01:00
Claudio Atzori 46972f8393 [orcid propagation] skip empty directory 2022-12-20 10:28:22 +01:00
Claudio Atzori da85ca697d Merge pull request 'cleanCountryOnMaster' (#265) from cleanCountryOnMaster into master
Reviewed-on: #265
2022-12-16 15:58:44 +01:00
Miriam Baglioni 059e100ec7 [Clean Country] moving other resources for testing purposes 2022-12-16 15:48:21 +01:00
Miriam Baglioni fc95a550c3 [Clean Country] moving other resources for testing purposes 2022-12-16 15:46:32 +01:00
Miriam Baglioni 6901ac91b1 [Clean Country] moving source and resources to master 2022-12-16 15:42:49 +01:00
Claudio Atzori 08c4588d47 Merge pull request 'Changes from beta stats wf to prod' (#264) from antonis.lempesis/dnet-hadoop:beta into master
Reviewed-on: #264
2022-12-07 15:56:22 +01:00
Miriam Baglioni 29d3da85f1 [EOSC DUMP] added resources needed for the review as test 2022-11-25 17:16:20 +01:00
Miriam Baglioni 33a2b1b5dc [Bulk Tag] fixed typo in test configuration 2022-11-23 11:31:17 +01:00
Miriam Baglioni c6df8327b3 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2022-11-23 11:26:57 +01:00
Miriam Baglioni 935aa367d8 [BulkTag] removed commented code 2022-11-23 11:16:39 +01:00
Miriam Baglioni 43aedbdfe5 [BulkTag] changed verb name in configuration 2022-11-23 11:14:23 +01:00
Miriam Baglioni b6da9b67ff [BulkTag] fixed typo in annotation for verb name 2022-11-23 11:13:58 +01:00
Claudio Atzori a34c8b6f81 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2022-11-22 10:22:31 +01:00
Miriam Baglioni 122e75aa17 fixed conflicts 2022-11-21 18:13:12 +01:00
Miriam Baglioni cee7a45b1d [Bulk Tag Datasource] fixed issue with verb name and add new test for neanias selection for orcid 2022-11-21 18:10:20 +01:00
Claudio Atzori ed64618235 increased spark.sql.shuffle.partitions in the last join phase of the result (publication) to community through semantic relation propagation 2022-11-18 16:06:51 +01:00
Claudio Atzori 8742934843 added spark.sql.shuffle.partitions in the last join phase of the result to community through semantic relation propagation 2022-11-18 11:32:22 +01:00
Claudio Atzori 13cc592f39 code formatting 2022-11-15 09:37:57 +01:00
Claudio Atzori af15b1e48d [eosc tag] extending criteria for Jupyter Notebook (adding to ORP the same constraint) 2022-11-14 18:30:43 +01:00
Claudio Atzori eb45ba7af0 extended mapping from ODF relations (PR#251) 2022-11-14 18:26:13 +01:00
Claudio Atzori a929dc5fee integrated changes for mapping ROHub contents in the Graph 2022-11-14 18:15:35 +01:00
Miriam Baglioni 5f9383b2d9 [EOSC TAG] remove reduntant check for jupyter notebook 2022-11-11 14:06:19 +01:00
Miriam Baglioni b18bbca8af [EOSC TAG] adding search in orp for jupyter notebook criteria 2022-11-11 12:42:58 +01:00
dimitrispie 55fa3b2a17 Hive memory parameters 2022-11-03 15:21:04 +01:00
Claudio Atzori 80c5e0f637 code formatting 2022-09-27 12:51:51 +02:00
Claudio Atzori c01d528ab2 suppressing hyper verbose spark logs during unit test execution 2022-09-23 15:19:50 +02:00
Claudio Atzori e6d788d27a [stats wf] adding missing changes lost in PR#248 2022-09-23 14:38:42 +02:00
Claudio Atzori 930f118673 fixed semantic (subreltype) for ServiceOrganization relations 2022-09-22 16:24:44 +02:00
Claudio Atzori b2c3071e72 Merge branch 'master' into beta2master_sept_2022 2022-09-22 14:39:15 +02:00
Claudio Atzori 10ec074f79 Merge remote-tracking branch 'antonis.lempesis/beta' into beta2master_sept_2022 2022-09-22 14:12:19 +02:00
Claudio Atzori 7225fe9cbe integrated changes from discard-non-wellformed 2022-09-22 10:06:07 +02:00
Miriam Baglioni 869e129288 [EOSC BulkTag] refactoring 2022-09-20 16:13:18 +02:00
Miriam Baglioni 840465958b [EOSC BulkTag] filtering out the datasources registered in the eosc with compatibility different from 3.0, 4.0 for literature, data and CRIS, to add the context eosc to the results 2022-09-20 10:30:41 +02:00
Claudio Atzori bdc8f993d0 [Patch Hosted By] check also the presence of datasource.officialname.value 2022-09-19 15:28:03 +02:00
Miriam Baglioni ec87149cb3 [Patch Hosted By] added fix to avoid NPE when the datasource official name is not provided. Removing datasources if no officialname has been provided 2022-09-19 14:06:52 +02:00
Miriam Baglioni b42e2c9df6 [Patch Hosted By] added fix to avoid NPE when the datasource official name is not provided 2022-09-19 12:30:32 +02:00
Miriam Baglioni 1329aa8479 [EOSC BulkTag] modified test to remove association of result to eosc when eoscifguidelines are set 2022-09-19 11:59:48 +02:00
Miriam Baglioni a0ee1a8640 [EOSC BulkTag] remove addition of eosc context for result with eosc if guidelines set 2022-09-19 11:44:10 +02:00
Claudio Atzori 96062164f9 Merge pull request '[Aggregator graph|master] Discard invalid records' (#245) from discard-non-wellformed into master
Reviewed-on: #245
2022-09-19 09:48:16 +02:00
Claudio Atzori 35bb7c423f updated dhp-schemas version to 2.12.1 2022-09-16 16:13:15 +02:00
Claudio Atzori fd87571506 code formatting 2022-09-16 16:05:03 +02:00
Claudio Atzori c527112e33 Merge commit 'ff6f789b6d9be0567b6ad72f8a0e75fe3f52726a' into beta2master_sept_2022 2022-09-16 15:59:10 +02:00
Claudio Atzori 65209359bc Merge commit 'b5f7bd30be7f7adaaa28170740da0484b50a77ed' into beta2master_sept_2022 2022-09-16 15:58:11 +02:00
Claudio Atzori d72a64ded3 Merge commit '690be4482fc84327dc7617acbc8d976d559df512' into beta2master_sept_2022 2022-09-16 15:57:44 +02:00
Claudio Atzori 3e8499ce47 Merge commit '71b069ca90a2f7ec09d64241c60917d3636fc81e' into beta2master_sept_2022 2022-09-16 15:57:20 +02:00
Claudio Atzori 61aacb3271 Merge commit '1203378441dc6d8e8435cacd42e76e11746f6d1b' into beta2master_sept_2022 2022-09-16 15:56:55 +02:00
Claudio Atzori dbb567251a merged 853c996fa2 2022-09-16 15:56:28 +02:00
Claudio Atzori c7e8ad853e Merge commit '2b5f8c9c9a3611c57ee5febfe262a455a39ad801' into beta2master_sept_2022 2022-09-16 15:55:04 +02:00
Claudio Atzori 0849ebfd80 merged a11eb38065 2022-09-16 15:54:32 +02:00
Claudio Atzori 281239249e Merge commit 'b7c387c21f946adbc9da90ded95166205195edb0' into beta2master_sept_2022 2022-09-16 15:49:20 +02:00
Claudio Atzori 45fc5e12be Merge commit 'cb7c07c54e59675e8dffe42b7f2a13f16c956068' into beta2master_sept_2022 2022-09-16 15:48:55 +02:00
Claudio Atzori 1c05aaaa2e Merge commit '3418ce50ac9b28fed4fa949919e6c8208738cdcf' into beta2master_sept_2022 2022-09-16 15:48:36 +02:00
Claudio Atzori 01d5ad6361 Merge commit 'd85ba3c1a9d7f0e80565742161ff6c9ecffd52b7' into beta2master_sept_2022 2022-09-16 15:48:16 +02:00
Claudio Atzori d872d1cdd9 Merge commit 'a4815f6bec87f05be8cd740d236707949a0f746e' into beta2master_sept_2022 2022-09-16 15:47:49 +02:00
Claudio Atzori ab0efecab4 Merge commit '84598c75356cf580de6c81653a9351e9b8173639' into beta2master_sept_2022 2022-09-16 15:47:05 +02:00
Claudio Atzori 725c3c68d0 Merge commit '844f6eb46533cdd4be3210401b10401322079640' into beta2master_sept_2022 2022-09-16 15:46:40 +02:00
Claudio Atzori 300ae6221c Merge commit '32cee1f619eb30d2e2ac6083435b76b1aba7db09' into beta2master_sept_2022 2022-09-16 15:45:57 +02:00
Claudio Atzori 0ec2eaba35 Merge commit 'c1f2ffc53dc41f1fac3855b2d2df7d6a5ea15e3e' into beta2master_sept_2022 2022-09-16 15:45:27 +02:00
Claudio Atzori a387807d43 Merge commit 'b78889a0ce27a79c7ab2d8da05b118ee4f1bcb36' into beta2master_sept_2022 2022-09-16 15:44:17 +02:00
Claudio Atzori 2abe2bc137 Merge commit '08ce2cadc2d84aa982726e429c280a905536a715' into beta2master_sept_2022 2022-09-16 15:43:49 +02:00
Claudio Atzori a07c876922 Merge commit '27a91841e7fa2a1b615b4d1e161d606db5bead96' into beta2master_sept_2022 2022-09-16 15:43:02 +02:00
Claudio Atzori cbd48bc645 Merge commit 'efd96e7e664e4139321e35e8d172b884ba4b61a1' into beta2master_sept_2022 2022-09-16 15:38:56 +02:00
113 changed files with 3096 additions and 2770 deletions

DbClient.java:

@@ -7,12 +7,12 @@ import java.sql.*;
 import java.util.function.Consumer;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 public class DbClient implements Closeable {

-	private static final Log log = LogFactory.getLog(DbClient.class);
+	private static final Logger log = LoggerFactory.getLogger(DbClient.class);

 	private final Connection connection;
@@ -37,6 +37,8 @@ public class DbClient implements Closeable {
 		try (final Statement stmt = connection.createStatement()) {
 			stmt.setFetchSize(100);
+			log.info("running SQL:\n\n{}\n\n", sql);
+
 			try (final ResultSet rs = stmt.executeQuery(sql)) {
 				while (rs.next()) {
 					consumer.accept(rs);

InputStreamRequestBody.java (deleted):

@@ -1,53 +0,0 @@
package eu.dnetlib.dhp.common.api;
import java.io.IOException;
import java.io.InputStream;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.internal.Util;
import okio.BufferedSink;
import okio.Okio;
import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private final InputStream inputStream;
private final MediaType mediaType;
private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
return new InputStreamRequestBody(inputStream, mediaType, len);
}
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
this.inputStream = inputStream;
this.mediaType = mediaType;
this.lenght = len;
}
@Override
public MediaType contentType() {
return mediaType;
}
@Override
public long contentLength() {
return lenght;
}
@Override
public void writeTo(BufferedSink sink) throws IOException {
Source source = null;
try {
source = Okio.source(inputStream);
sink.writeAll(source);
} finally {
Util.closeQuietly(source);
}
}
}

MissingConceptDoiException.java (deleted):

@@ -1,8 +0,0 @@
package eu.dnetlib.dhp.common.api;
public class MissingConceptDoiException extends Throwable {
public MissingConceptDoiException(String message) {
super(message);
}
}

ZenodoAPIClient.java (deleted):

@@ -1,363 +0,0 @@
package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.jetbrains.annotations.NotNull;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
public class ZenodoAPIClient implements Serializable {
String urlString;
String bucket;
String deposition_id;
String access_token;
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
public String getUrlString() {
return urlString;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public void setDeposition_id(String deposition_id) {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) {
this.urlString = urlString;
this.access_token = access_token;
}
/**
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
*
* @return response code
* @throws IOException
*/
public int newDeposition() throws IOException {
String json = "{}";
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
* Upload files in Zenodo.
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @return the response code
*/
public int uploadIS(InputStream is, String file_name) throws IOException {
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
}
int responseCode = conn.getResponseCode();
if (!checkOKStatus(responseCode)) {
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
try (BufferedReader br = new BufferedReader(
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
* Associates metadata information to the current deposition
*
* @param metadata the metadata
* @return response code
* @throws IOException
*/
public int sendMretadata(String metadata) throws IOException {
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + getBody(conn));
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
if (HttpURLConnection.HTTP_OK != responseCode ||
HttpURLConnection.HTTP_CREATED != responseCode)
return true;
return false;
}
/**
* To publish the current deposition. It works for both new deposition or new version of an old deposition
*
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* for the new version.
*
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1);
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
* To finish uploading a not yet published version or deposition.
* It sets the deposition_id and the bucket to be used.
*
* @param deposition_id the deposition id of the not yet published upload
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
deposition_id = zm.getId();
return;
}
}
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions(String page) throws IOException {
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
private String getBucket(String inputUrl) throws IOException {
URL url = new URL(inputUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
return zenodoModel.getLinks().getBucket();
}
}
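For reference, a minimal usage sketch of the client above (assumptions: a valid sandbox token passed on the command line, an existing concept record id, and a local dump.json.gz; the call sequence mirrors the test class further down, and the driver class itself is hypothetical, not part of the codebase):
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
public class ZenodoUploadExample {
public static void main(String[] args) throws Exception {
ZenodoAPIClient client = new ZenodoAPIClient(
"https://sandbox.zenodo.org/api/deposit/depositions", args[0] /* access token */);
client.newVersion("656930"); // resolves the latest draft, sets deposition_id and bucket
try (InputStream is = new FileInputStream(new File("dump.json.gz"))) {
client.uploadIS(is, "dump.json.gz"); // streams the payload into the bucket
}
client.sendMetadata("{\"metadata\": {\"title\": \"example\"}}"); // PUT on the deposition
client.publish(); // POST /actions/publish, expected to return 202
}
}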

View File

@ -1,14 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Community {
private String identifier;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

View File

@ -1,47 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Creator {
private String affiliation;
private String name;
private String orcid;
public String getAffiliation() {
return affiliation;
}
public void setAffiliation(String affiliation) {
this.affiliation = affiliation;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (name != null) {
c.name = name;
}
if (affiliation != null) {
c.affiliation = affiliation;
}
if (orcid != null) {
c.orcid = orcid;
}
return c;
}
}

View File

@ -1,44 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
public String getChecksum() {
return checksum;
}
public void setChecksum(String checksum) {
this.checksum = checksum;
}
public String getFilename() {
return filename;
}
public void setFilename(String filename) {
this.filename = filename;
}
public long getFilesize() {
return filesize;
}
public void setFilesize(long filesize) {
this.filesize = filesize;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
}

View File

@ -1,23 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Grant implements Serializable {
private String id;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Grant newInstance(String id) {
Grant g = new Grant();
g.id = id;
return g;
}
}

View File

@ -1,92 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Links implements Serializable {
private String bucket;
private String discard;
private String edit;
private String files;
private String html;
private String latest_draft;
private String latest_draft_html;
private String publish;
private String self;
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public String getDiscard() {
return discard;
}
public void setDiscard(String discard) {
this.discard = discard;
}
public String getEdit() {
return edit;
}
public void setEdit(String edit) {
this.edit = edit;
}
public String getFiles() {
return files;
}
public void setFiles(String files) {
this.files = files;
}
public String getHtml() {
return html;
}
public void setHtml(String html) {
this.html = html;
}
public String getLatest_draft() {
return latest_draft;
}
public void setLatest_draft(String latest_draft) {
this.latest_draft = latest_draft;
}
public String getLatest_draft_html() {
return latest_draft_html;
}
public void setLatest_draft_html(String latest_draft_html) {
this.latest_draft_html = latest_draft_html;
}
public String getPublish() {
return publish;
}
public void setPublish(String publish) {
this.publish = publish;
}
public String getSelf() {
return self;
}
public void setSelf(String self) {
this.self = self;
}
}

View File

@ -1,153 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class Metadata implements Serializable {
private String access_right;
private List<Community> communities;
private List<Creator> creators;
private String description;
private String doi;
private List<Grant> grants;
private List<String> keywords;
private String language;
private String license;
private PrereserveDoi prereserve_doi;
private String publication_date;
private List<String> references;
private List<RelatedIdentifier> related_identifiers;
private String title;
private String upload_type;
private String version;
public String getUpload_type() {
return upload_type;
}
public void setUpload_type(String upload_type) {
this.upload_type = upload_type;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public List<Community> getCommunities() {
return communities;
}
public void setCommunities(List<Community> communities) {
this.communities = communities;
}
public List<Creator> getCreators() {
return creators;
}
public void setCreators(List<Creator> creators) {
this.creators = creators;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public List<Grant> getGrants() {
return grants;
}
public void setGrants(List<Grant> grants) {
this.grants = grants;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public String getLicense() {
return license;
}
public void setLicense(String license) {
this.license = license;
}
public PrereserveDoi getPrereserve_doi() {
return prereserve_doi;
}
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
this.prereserve_doi = prereserve_doi;
}
public String getPublication_date() {
return publication_date;
}
public void setPublication_date(String publication_date) {
this.publication_date = publication_date;
}
public List<String> getReferences() {
return references;
}
public void setReferences(List<String> references) {
this.references = references;
}
public List<RelatedIdentifier> getRelated_identifiers() {
return related_identifiers;
}
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
this.related_identifiers = related_identifiers;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class PrereserveDoi implements Serializable {
private String doi;
private String recid;
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getRecid() {
return recid;
}
public void setRecid(String recid) {
this.recid = recid;
}
}

View File

@ -1,43 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class RelatedIdentifier implements Serializable {
private String identifier;
private String relation;
private String resource_type;
private String scheme;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelation() {
return relation;
}
public void setRelation(String relation) {
this.relation = relation;
}
public String getResource_type() {
return resource_type;
}
public void setResource_type(String resource_type) {
this.resource_type = resource_type;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
}

View File

@ -1,118 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class ZenodoModel implements Serializable {
private String conceptrecid;
private String created;
private List<File> files;
private String id;
private Links links;
private Metadata metadata;
private String modified;
private String owner;
private String record_id;
private String state;
private boolean submitted;
private String title;
public String getConceptrecid() {
return conceptrecid;
}
public void setConceptrecid(String conceptrecid) {
this.conceptrecid = conceptrecid;
}
public String getCreated() {
return created;
}
public void setCreated(String created) {
this.created = created;
}
public List<File> getFiles() {
return files;
}
public void setFiles(List<File> files) {
this.files = files;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Links getLinks() {
return links;
}
public void setLinks(Links links) {
this.links = links;
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public String getModified() {
return modified;
}
public void setModified(String modified) {
this.modified = modified;
}
public String getOwner() {
return owner;
}
public void setOwner(String owner) {
this.owner = owner;
}
public String getRecord_id() {
return record_id;
}
public void setRecord_id(String record_id) {
this.record_id = record_id;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public boolean isSubmitted() {
return submitted;
}
public void setSubmitted(boolean submitted) {
this.submitted = submitted;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -1,7 +0,0 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.util.ArrayList;
public class ZenodoModelList extends ArrayList<ZenodoModel> {
}

View File

@ -65,13 +65,7 @@ public class RunSQLSparkJob {
 for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
 log.info("executing: {}", statement);
 long startTime = System.currentTimeMillis();
-try {
-spark.sql(statement).show();
-} catch (Exception e) {
-log.error("Error executing statement: {}", statement, e);
-System.err.println("Error executing statement: " + statement + "\n" + e);
-throw e;
-}
+spark.sql(statement).show();
 log
 .info(
 "executed in {}",

View File

@ -119,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 .getContext()
 .stream()
 .filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
-.collect(Collectors.toList()));
+.collect(Collectors.toCollection(ArrayList::new)));
 }
 return (T) res;
 } else {
@ -1015,4 +1015,41 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 .orElse(null);
 }
+/**
+* Implements bad and ugly things that we should get rid of ASAP.
+*
+* @param value
+* @return
+* @param <T>
+*/
+public static <T extends Oaf> T dedicatedUglyHacks(T value) {
+if (value instanceof OafEntity) {
+if (value instanceof Result) {
+final Result r = (Result) value;
+// Fix for AMS Acta
+Optional
+.ofNullable(r.getInstance())
+.map(
+instance -> instance
+.stream()
+.filter(
+i -> Optional
+.ofNullable(i.getHostedby())
+.map(KeyValue::getKey)
+.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
+.orElse(false)))
+.ifPresent(instance -> instance.forEach(i -> {
+if (Optional
+.ofNullable(i.getPid())
+.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
+.orElse(false)) {
+i.setHostedby(UNKNOWN_REPOSITORY);
+}
+}));
+}
+}
+return value;
+}
 }

View File

@ -433,7 +433,10 @@ public class MergeUtils {
 // merge datainfo for same context id
 merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
-r.getDataInfo().addAll(l.getDataInfo());
+ArrayList<DataInfo> di = new ArrayList<>();
+di.addAll(r.getDataInfo());
+di.addAll(l.getDataInfo());
+r.setDataInfo(di);
 return r;
 }));
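The copy into a fresh ArrayList matters because the left-hand getDataInfo() list may be unmodifiable (for instance one produced by Collectors.toList() or Arrays.asList()), in which case an in-place addAll throws; the GraphCleaningFunctions change above to toCollection(ArrayList::new) addresses the same failure mode. A minimal illustration, not project code:
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class DefensiveMergeExample {
public static void main(String[] args) {
List<String> merge = Arrays.asList("a"); // fixed-size view: merge.addAll(...) would throw UnsupportedOperationException
List<String> enrich = Arrays.asList("b");
List<String> di = new ArrayList<>(merge); // defensive copy, as in the patch above
di.addAll(enrich);
System.out.println(di); // prints [a, b]
}
}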

View File

@ -1,109 +0,0 @@
package eu.dnetlib.dhp.common.api;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = "";
private final String CONCEPT_REC_ID = "657113";
private final String depositionId = "674915";
@Test
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMetadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newDeposition());
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMetadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
}

View File

@ -177,7 +177,7 @@ class OafMapperUtilsTest {
 assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
 assertEquals(
-ModelConstants.DATASET_RESULTTYPE_CLASSID,
+ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
 ((Result) MergeUtils
 .merge(p2, d1))
 .getResulttype()

View File

@ -6,18 +6,7 @@
 <artifactId>dhp-workflows</artifactId>
 <version>1.2.5-SNAPSHOT</version>
 </parent>
 <artifactId>dhp-aggregation</artifactId>
-<properties>
-<affro.release.version>1.0.0</affro.release.version>
-</properties>
-<scm>
-<url>https://code-repo.d4science.org/mkallipo/affRo</url>
-<connection>scm:git:https://code-repo.d4science.org/mkallipo/affRo.git</connection>
-</scm>
 <build>
 <plugins>
 <plugin>
@ -54,32 +43,6 @@
 <scalaVersion>${scala.version}</scalaVersion>
 </configuration>
 </plugin>
-<plugin>
-<groupId>org.apache.maven.plugins</groupId>
-<artifactId>maven-scm-plugin</artifactId>
-<version>1.8.1</version>
-<configuration>
-<connectionType>connection</connectionType>
-<!--
-<scmVersionType>tag</scmVersionType>--><!-- 'branch' can also be provided here -->
-<!-- <scmVersion>${affro.release.version}</scmVersion>--><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
-<scmVersionType>branch</scmVersionType><!-- 'branch' can also be provided here -->
-<scmVersion>openaire-workflow-ready</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
-<checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/affRo</checkoutDirectory>
-</configuration>
-<executions>
-<execution>
-<id>checkout-affro</id>
-<phase>prepare-package</phase>
-<goals>
-<goal>checkout</goal>
-</goals>
-</execution>
-</executions>
-</plugin>
 </plugins>
 </build>

View File

@ -46,9 +46,6 @@ public class GetOpenCitationsRefs implements Serializable {
 final String outputPath = parser.get("outputPath");
 log.info("outputPath {}", outputPath);
-final String backupPath = parser.get("backupPath");
-log.info("backupPath {}", backupPath);
 Configuration conf = new Configuration();
 conf.set("fs.defaultFS", hdfsNameNode);
@ -56,11 +53,11 @@ public class GetOpenCitationsRefs implements Serializable {
 GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
-ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
+ocr.doExtract(inputPath, outputPath, fileSystem);
 }
-private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
+private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
 throws IOException {
 RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
@ -92,7 +89,6 @@ public class GetOpenCitationsRefs implements Serializable {
 }
 }
-fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
 }
 }

View File

@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable {
 final String workingPath = parser.get("inputPath");
 log.info("workingPath {}", workingPath);
+final String backupPath = parser.get("backupPath");
+log.info("backupPath {}", backupPath);
 SparkConf sconf = new SparkConf();
 Configuration conf = new Configuration();
@ -68,12 +71,14 @@ public class ReadCOCI implements Serializable {
 workingPath,
 fileSystem,
 outputPath,
+backupPath,
 delimiter);
 });
 }
 private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
 String outputPath,
+String backupPath,
 String delimiter) throws IOException {
 RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
 .listFiles(
@ -108,7 +113,7 @@ public class ReadCOCI implements Serializable {
 .option("compression", "gzip")
 .json(outputPath);
-fileSystem.delete(fileStatus.getPath());
+fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
 }
 }
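The last change archives each processed dump instead of deleting it, so a failed run can be repeated from the backup location. A sketch of that pattern with the Hadoop FileSystem API (the helper class and paths are illustrative, not project code):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class ArchiveProcessedFileExample {
public static void archive(Configuration conf, Path processed, Path backupDir) throws Exception {
FileSystem fs = FileSystem.get(conf);
if (!fs.exists(backupDir)) {
fs.mkdirs(backupDir); // renaming into a directory requires it to exist
}
fs.rename(processed, backupDir); // moves the file under backupDir instead of deleting it
}
}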

View File

@ -297,7 +297,7 @@ public class ExtractPerson implements Serializable {
 }
 private static Relation getAffiliationRelation(Employment row) {
-String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid());
+String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
 String target = ROR_PREFIX
 + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
 List<KeyValue> properties = new ArrayList<>();

View File

@ -1,45 +0,0 @@
# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties ---
# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
# dhp.hadoop.frontend.user.name=ilias.kanellos
# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
# dhp.hadoop.frontend.port.ssh=22
# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
# jobTracker=yarnRM
# nameNode=hdfs://nameservice1
# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
# maven.executable=mvn
# The above is given differently in an example I found online
oozie.action.sharelib.for.spark=spark2
oozieActionShareLibForSpark2=spark2
spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
spark2EventLogDir=/user/spark/spark2ApplicationHistory
sparkSqlWarehouseDir=/user/hive/warehouse
#hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
# This MAY avoid the no library used error
oozie.use.system.libpath=true
# Some stuff copied from openaire's jobs
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
# The following is needed as a property of a workflow
wfAppPath=${oozieTopWfApplicationPath}
resumeFrom=Crossref
#OpenAlex input/output
#resultFolder=/tmp/affro-results/oalex
#inputFolder=/user/zeppelin/affiliations/raw_aff_string/2024-08
#Crossref input/output
resultFolder=/tmp/affro-results/crossref
inputFolder=/data/doiboost/crossref/crossref_unpack
#
#crossrefInputPath=/data/bip-affiliations/crossref-data.json
#pubmedInputPath=/data/bip-affiliations/pubmed-data.json
#openapcInputPath=/data/bip-affiliations/openapc-data.json
#dataciteInputPath=/data/bip-affiliations/datacite-data.json
#
#outputPath=/tmp/crossref-affiliations-output-v5

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,176 +0,0 @@
<workflow-app name="AffroAffiliations" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resumeFrom"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resumeFrom">
<switch>
<case to="run-affro-on-iisdata">${wf:conf('resumeFrom') eq 'IIS'}</case>
<case to="run-affro-on-crossref">${wf:conf('resumeFrom') eq 'Crossref'}</case>
<default to="run-affro-on-oalexstrings"/>
</switch>
</decision>
<action name="run-affro-on-iisdata">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Affiliations inference (Affro)</name>
<jar>update_records.py</jar>
<spark-opts>
--executor-cores=4
--executor-memory=6G
--driver-memory=15G
--conf spark.executor.memoryOverhead=6G
--conf spark.sql.shuffle.partitions=20000
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
--conf spark.executorEnv.PYSPARK_PYTHON=python3
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/affro_test_example.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
</spark-opts>
<arg>${resultFolder}</arg>
<file>${wfAppPath}/affRo/update_records.py#update_records.py</file>
</spark>
<ok to="End" />
<error to="Kill" />
</action>
<action name="run-affro-on-oalexstrings">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Affiliations inference (Affro)</name>
<jar>strings.py</jar>
<spark-opts>
--executor-cores=4
--executor-memory=6G
--driver-memory=15G
--conf spark.executor.memoryOverhead=6G
--conf spark.sql.shuffle.partitions=20000
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
--conf spark.executorEnv.PYSPARK_PYTHON=python3
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
</spark-opts>
<arg>${inputFolder}</arg>
<arg>${resultFolder}</arg>
<file>${wfAppPath}/affRo/strings.py#strings.py</file>
</spark>
<ok to="End" />
<error to="Kill" />
</action>
<action name="run-affro-on-crossref">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Affiliations inference (Affro)</name>
<jar>crossref.py</jar>
<spark-opts>
--executor-cores=4
--executor-memory=6G
--driver-memory=15G
--conf spark.executor.memoryOverhead=6G
--conf spark.sql.shuffle.partitions=20000
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
--conf spark.executorEnv.PYSPARK_PYTHON=python3
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
</spark-opts>
<arg>${inputFolder}</arg>
<arg>${resultFolder}</arg>
<file>${wfAppPath}/affRo/crossref.py#crossref.py</file>
</spark>
<ok to="End" />
<error to="Kill" />
</action>
<end name="End"/>
</workflow-app>

View File

@ -35,5 +35,6 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json
 pubmedInputPath=/data/bip-affiliations/pubmed-data.json
 openapcInputPath=/data/bip-affiliations/openapc-data.json
 dataciteInputPath=/data/bip-affiliations/datacite-data.json
+webCrawlInputPath=/data/bip-affiliations/webCrawl/
 outputPath=/tmp/crossref-affiliations-output-v5

View File

@ -21,6 +21,10 @@
 <name>webCrawlInputPath</name>
 <description>the path where to find the inferred affiliation relations from webCrawl</description>
 </property>
+<property>
+<name>publisherInputPath</name>
+<description>the path where to find the inferred affiliation relations from publisher websites</description>
+</property>
 <property>
 <name>outputPath</name>
 <description>the path where to store the actionset</description>
@ -117,6 +121,7 @@
 <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
 <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
 <arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
+<arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
 <arg>--outputPath</arg><arg>${outputPath}</arg>
 </spark>
 <ok to="End"/>

View File

@ -16,11 +16,5 @@
 "paramLongName": "hdfsNameNode",
 "paramDescription": "the hdfs name node",
 "paramRequired": true
-},
-{
-"paramName": "bp",
-"paramLongName": "backupPath",
-"paramDescription": "the hdfs path to move the OC data after the extraction",
-"paramRequired": true
 }
 ]

View File

@ -30,6 +30,12 @@
 "paramLongName": "hdfsNameNode",
 "paramDescription": "the hdfs name node",
 "paramRequired": true
+},
+{
+"paramName": "bp",
+"paramLongName": "backupPath",
+"paramDescription": "the hdfs path to move the OC data after the extraction",
+"paramRequired": true
 }
 ]

View File

@ -94,7 +94,17 @@
 <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
 <arg>--inputPath</arg><arg>${inputPath}/Original</arg>
 <arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
-<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
+</java>
+<ok to="read"/>
+<error to="Kill"/>
+</action>
+<action name="extract_correspondence">
+<java>
+<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
+<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+<arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
+<arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
 </java>
 <ok to="read"/>
 <error to="Kill"/>
@ -119,6 +129,7 @@
 </spark-opts>
 <arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
 <arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
+<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
 <arg>--delimiter</arg><arg>${delimiter}</arg>
 <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
 </spark>

View File

@ -16,10 +16,11 @@
 "paramLongName": "isSparkSessionManged",
 "paramDescription": "the hdfs name node",
 "paramRequired": false
-},{
-"paramName": "nn",
-"paramLongName": "nameNode",
-"paramDescription": "the hdfs name node",
-"paramRequired": true
-}
+},
+{
+"paramName": "nn",
+"paramLongName": "nameNode",
+"paramDescription": "the hdfs name node",
+"paramRequired": true
+}
 ]

View File

@ -24,7 +24,7 @@
 <decision name="resume_from">
 <switch>
-<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
+<case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
 <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
 </switch>
 </decision>
@ -33,6 +33,14 @@
 <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 </kill>
+<action name="reset_workingDir">
+<fs>
+<delete path="${workingDir}"/>
+<mkdir path="${workingDir}"/>
+</fs>
+<ok to="download"/>
+<error to="Kill"/>
+</action>
 <action name="download">
 <shell xmlns="uri:oozie:shell-action:0.2">
 <job-tracker>${jobTracker}</job-tracker>

View File

@ -1,4 +1,4 @@
 <workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
 <parameters>
 <property>
 <name>sourcePath</name>
@ -8,19 +8,40 @@
 <name>database</name>
 <description>the PDB Database Working Path</description>
 </property>
 <property>
-<name>targetPath</name>
-<description>the Target Working dir path</description>
+<name>mdStoreOutputId</name>
+<description>the identifier of the cleaned MDStore</description>
+</property>
+<property>
+<name>mdStoreManagerURI</name>
+<description>the path of the cleaned mdstore</description>
 </property>
 </parameters>
-<start to="ConvertDB"/>
+<start to="StartTransaction"/>
 <kill name="Kill">
 <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 </kill>
+<action name="StartTransaction">
+<java>
+<configuration>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+</configuration>
+<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+<arg>--action</arg><arg>NEW_VERSION</arg>
+<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+<capture-output/>
+</java>
+<ok to="ConvertDB"/>
+<error to="RollBack"/>
+</action>
 <action name="ConvertDB">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
@ -41,11 +62,48 @@
 <arg>--master</arg><arg>yarn</arg>
 <arg>--dbPath</arg><arg>${sourcePath}</arg>
 <arg>--database</arg><arg>${database}</arg>
-<arg>--targetPath</arg><arg>${targetPath}</arg>
+<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
 </spark>
-<ok to="End"/>
-<error to="Kill"/>
+<ok to="CommitVersion"/>
+<error to="RollBack"/>
 </action>
-<end name="End"/>
+<action name="CommitVersion">
+<java>
+<configuration>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+</configuration>
+<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+<arg>--action</arg><arg>COMMIT</arg>
+<arg>--namenode</arg><arg>${nameNode}</arg>
+<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+</java>
+<ok to="End"/>
+<error to="Kill"/>
+</action>
+<action name="RollBack">
+<java>
+<configuration>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+</configuration>
+<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+<arg>--action</arg><arg>ROLLBACK</arg>
+<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+</java>
+<ok to="Kill"/>
+<error to="Kill"/>
+</action>
+<end name="End"/>
 </workflow-app>

View File

@ -2,5 +2,5 @@
 {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
 {"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
 {"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
+{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
 ]

View File

@ -1,5 +1,20 @@
 [
-{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
-{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
+{
+"paramName": "mt",
+"paramLongName": "master",
+"paramDescription": "should be local or yarn",
+"paramRequired": true
+},
+{
+"paramName": "s",
+"paramLongName": "sourcePath",
+"paramDescription": "the source Path",
+"paramRequired": true
+},
+{
+"paramName": "mo",
+"paramLongName": "mdstoreOutputVersion",
+"paramDescription": "the oaf path ",
+"paramRequired": true
+}
 ]

View File

@ -9,34 +9,26 @@
 <description>the Working Path</description>
 </property>
 <property>
-<name>targetPath</name>
-<description>the OAF MDStore Path</description>
+<name>mdStoreOutputId</name>
+<description>the identifier of the cleaned MDStore</description>
 </property>
 <property>
-<name>sparkDriverMemory</name>
-<description>memory for driver process</description>
-</property>
-<property>
-<name>sparkExecutorMemory</name>
-<description>memory for individual executor</description>
-</property>
-<property>
-<name>sparkExecutorCores</name>
-<description>number of cores used by single executor</description>
+<name>mdStoreManagerURI</name>
+<description>the path of the cleaned mdstore</description>
 </property>
 <property>
 <name>resumeFrom</name>
-<value>DownloadEBILinks</value>
+<value>CreateEBIDataSet</value>
 <description>node to start</description>
 </property>
 </parameters>
-<start to="resume_from"/>
+<start to="StartTransaction"/>
 <decision name="resume_from">
 <switch>
 <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
-<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
+<case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
 <default to="DownloadEBILinks"/>
 </switch>
 </decision>
@ -77,9 +69,29 @@
 <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
 <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
 </fs>
-<ok to="CreateEBIDataSet"/>
+<ok to="StartTransaction"/>
 <error to="Kill"/>
 </action>
+<action name="StartTransaction">
+<java>
+<configuration>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+</configuration>
+<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+<arg>--action</arg><arg>NEW_VERSION</arg>
+<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+<capture-output/>
+</java>
+<ok to="CreateEBIDataSet"/>
+<error to="RollBack"/>
+</action>
 <action name="CreateEBIDataSet">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn-cluster</master>
@ -95,11 +107,49 @@
 ${sparkExtraOPT}
 </spark-opts>
 <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
-<arg>--targetPath</arg><arg>${targetPath}</arg>
+<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
 <arg>--master</arg><arg>yarn</arg>
 </spark>
 <ok to="End"/>
 <error to="Kill"/>
 </action>
+<action name="CommitVersion">
+<java>
+<configuration>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+</configuration>
+<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+<arg>--action</arg><arg>COMMIT</arg>
+<arg>--namenode</arg><arg>${nameNode}</arg>
+<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+</java>
+<ok to="End"/>
+<error to="Kill"/>
+</action>
+<action name="RollBack">
+<java>
+<configuration>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+</configuration>
+<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+<arg>--action</arg><arg>ROLLBACK</arg>
+<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+</java>
+<ok to="Kill"/>
+<error to="Kill"/>
+</action>
 <end name="End"/>
 </workflow-app>

View File

@ -407,9 +407,10 @@ object DataciteToOAFTransformation {
 )
 }
 if (c.affiliation.isDefined)
-a.setRawAffiliationString(
+a.setAffiliation(
 c.affiliation.get
 .filter(af => af.nonEmpty)
+.map(af => OafMapperUtils.field(af, dataInfo))
 .asJava
 )
 a.setRank(idx + 1)

View File

@ -231,7 +231,7 @@ object BioDBToOAF {
 def uniprotToOAF(input: String): List[Oaf] = {
 implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
 lazy val json = parse(input)
-val pid = (json \ "pid").extract[String]
+val pid = (json \ "pid").extract[String].trim()
 val d = new Dataset

View File

@ -2,12 +2,15 @@ package eu.dnetlib.dhp.sx.bio
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
+import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 
 object SparkTransformBioDatabaseToOAF {
@ -25,8 +28,13 @@ object SparkTransformBioDatabaseToOAF {
 val dbPath: String = parser.get("dbPath")
 log.info("dbPath: {}", database)
-val targetPath: String = parser.get("targetPath")
-log.info("targetPath: {}", database)
+val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+log.info("outputBasePath: {}", outputBasePath)
 val spark: SparkSession =
 SparkSession
@ -43,24 +51,28 @@ object SparkTransformBioDatabaseToOAF {
 case "UNIPROT" =>
 CollectionUtils.saveDataset(
 spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
-targetPath
+s"$outputBasePath/$MDSTORE_DATA_PATH"
 )
 case "PDB" =>
 CollectionUtils.saveDataset(
 spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
-targetPath
+s"$outputBasePath/$MDSTORE_DATA_PATH"
 )
 case "SCHOLIX" =>
 CollectionUtils.saveDataset(
 spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
-targetPath
+s"$outputBasePath/$MDSTORE_DATA_PATH"
 )
 case "CROSSREF_LINKS" =>
 CollectionUtils.saveDataset(
 spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
-targetPath
+s"$outputBasePath/$MDSTORE_DATA_PATH"
 )
 }
+val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
+val mdStoreSize = df.count
+writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
 }
 }
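The pattern above writes the transformed records under the mdstore's MDSTORE_DATA_PATH and then records the row count under MDSTORE_SIZE_PATH, presumably for the mdstore manager to consume at commit time. A minimal sketch of writing such a size file with the Hadoop FileSystem API (the class and the "/size" suffix are illustrative stand-ins, not the project's actual constants):
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class MdStoreSizeWriterExample {
public static void writeSize(Configuration conf, String outputBasePath, long mdStoreSize) throws Exception {
FileSystem fs = FileSystem.get(conf);
// overwrite any previous size file and store the count as plain text
try (FSDataOutputStream out = fs.create(new Path(outputBasePath + "/size"), true)) {
out.write(String.valueOf(mdStoreSize).getBytes(StandardCharsets.UTF_8));
}
}
}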

View File

@ -9,6 +9,9 @@ import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.slf4j.{Logger, LoggerFactory}
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
+import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 
 object SparkEBILinksToOaf {
@ -32,8 +35,13 @@ object SparkEBILinksToOaf {
 import spark.implicits._
 val sourcePath = parser.get("sourcePath")
 log.info(s"sourcePath -> $sourcePath")
-val targetPath = parser.get("targetPath")
-log.info(s"targetPath -> $targetPath")
+val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+log.info("outputBasePath: {}", outputBasePath)
 implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
 val ebLinks: Dataset[EBILinkItem] = spark.read
@ -46,7 +54,10 @@ object SparkEBILinksToOaf {
 .flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
 .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
 .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
-targetPath
+s"$outputBasePath/$MDSTORE_DATA_PATH"
 )
+val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
+val mdStoreSize = df.count
+writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
 }
 }

View File

@ -28,6 +28,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
@ -39,7 +40,8 @@ public class PrepareAffiliationRelationsTest {
 private static Path workingDir;
 private static final String ID_PREFIX = "50|doi_________::";
-private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
+private static final Logger log = LoggerFactory
+.getLogger(PrepareAffiliationRelationsTest.class);
 @BeforeAll
 public static void beforeAll() throws IOException {

View File

@ -77,13 +77,13 @@ public class RemapTest {
 MapOCIdsInPids
 .main(
 new String[] {
-"-isSparkSessionManged",
+"--isSparkSessionManged",
 Boolean.FALSE.toString(),
-"-inputPath",
+"--inputPath",
 inputPath,
-"-outputPath",
+"--outputPath",
 workingDir.toString() + "/out/",
-"-nameNode", "input1;input2;input3;input4;input5"
+"--nameNode", "hdfs://localhost"
 });
 }

View File

@@ -1,15 +1,44 @@
- {"pdb": "1CW0", "title": "crystal structure analysis of very short patch repair (vsr) endonuclease in complex with a duplex dna", "authors": ["S.E.Tsutakawa", "H.Jingami", "K.Morikawa"], "doi": "10.1016/S0092-8674(00)81550-0", "pmid": "10612397"}
{"classification": "Signaling protein", "pdb": "5NM4", "deposition_date": "2017-04-05", "title": "A2a adenosine receptor room-temperature structure determined by serial Femtosecond crystallography", "Keywords": ["Oom-temperature", " serial crystallography", " signaling protein"], "authors": ["T.weinert", "R.cheng", "D.james", "D.gashi", "P.nogly", "K.jaeger", "M.hennig", "", "J.standfuss"], "pmid": "28912485", "doi": "10.1038/S41467-017-00630-4"}
- {"pdb": "2CWW", "title": "crystal structure of thermus thermophilus ttha1280, a putative sam- dependent rna methyltransferase, in complex with s-adenosyl-l- homocysteine", "authors": ["A.A.Pioszak", "K.Murayama", "N.Nakagawa", "A.Ebihara", "S.Kuramitsu", "M.Shirouzu", "S.Yokoyama", "Riken Structural Genomics/proteomics Initiative (Rsgi)"], "doi": "10.1107/S1744309105029842", "pmid": "16511182"}
{"classification": "Oxidoreductase/oxidoreductase inhibitor", "pdb": "4KN3", "deposition_date": "2013-05-08", "title": "Structure of the y34ns91g double mutant of dehaloperoxidase from Amphitrite ornata with 2,4,6-trichlorophenol", "Keywords": ["Lobin", " oxygen storage", " peroxidase", " oxidoreductase", " oxidoreductase-", "Oxidoreductase inhibitor complex"], "authors": ["C.wang", "L.lovelace", "L.lebioda"], "pmid": "23952341", "doi": "10.1021/BI400627W"}
- {"pdb": "6CWE", "title": "structure of alpha-gsa[8,6p] bound by cd1d and in complex with the va14vb8.2 tcr", "authors": ["J.Wang", "D.Zajonc"], "doi": null, "pmid": null}
{"classification": "Transport protein", "pdb": "8HKM", "deposition_date": "2022-11-27", "title": "Ion channel", "Keywords": ["On channel", " transport protein"], "authors": ["D.h.jiang", "J.t.zhang"], "pmid": "37494189", "doi": "10.1016/J.CELREP.2023.112858"}
- {"pdb": "5CWS", "title": "crystal structure of the intact chaetomium thermophilum nsp1-nup49- nup57 channel nucleoporin heterotrimer bound to its nic96 nuclear pore complex attachment site", "authors": ["C.J.Bley", "S.Petrovic", "M.Paduch", "V.Lu", "A.A.Kossiakoff", "A.Hoelz"], "doi": "10.1126/SCIENCE.AAC9176", "pmid": "26316600"}
{"classification": "Signaling protein", "pdb": "6JT1", "deposition_date": "2019-04-08", "title": "Structure of human soluble guanylate cyclase in the heme oxidised State", "Keywords": ["Oluble guanylate cyclase", " signaling protein"], "authors": ["L.chen", "Y.kang", "R.liu", "J.-x.wu"], "pmid": "31514202", "doi": "10.1038/S41586-019-1584-6"}
- {"pdb": "5CWE", "title": "structure of cyp107l2 from streptomyces avermitilis with lauric acid", "authors": ["T.-V.Pham", "S.-H.Han", "J.-H.Kim", "D.-H.Kim", "L.-W.Kang"], "doi": null, "pmid": null}
{"classification": "Immune system", "pdb": "7OW6", "deposition_date": "2021-06-16", "title": "Crystal structure of a tcr in complex with hla-a*11:01 bound to kras G12d peptide (vvvgadgvgk)", "Keywords": ["La", " kras", " tcr", " immune system"], "authors": ["V.karuppiah", "R.a.robinson"], "doi": "10.1038/S41467-022-32811-1"}
- {"pdb": "7CW4", "title": "acetyl-coa acetyltransferase from bacillus cereus atcc 14579", "authors": ["J.Hong", "K.J.Kim"], "doi": "10.1016/J.BBRC.2020.09.048", "pmid": "32972748"}
{"classification": "Biosynthetic protein", "pdb": "5EQ8", "deposition_date": "2015-11-12", "title": "Crystal structure of medicago truncatula histidinol-phosphate Phosphatase (mthpp) in complex with l-histidinol", "Keywords": ["Istidine biosynthesis", " metabolic pathways", " dimer", " plant", "", "Biosynthetic protein"], "authors": ["M.ruszkowski", "Z.dauter"], "pmid": "26994138", "doi": "10.1074/JBC.M115.708727"}
- {"pdb": "2CWP", "title": "crystal structure of metrs related protein from pyrococcus horikoshii", "authors": ["K.Murayama", "M.Kato-Murayama", "M.Shirouzu", "S.Yokoyama", "Riken StructuralGenomics/proteomics Initiative (Rsgi)"], "doi": null, "pmid": null}
{"classification": "De novo protein", "pdb": "8CWA", "deposition_date": "2022-05-18", "title": "Solution nmr structure of 8-residue rosetta-designed cyclic peptide D8.21 in cdcl3 with cis/trans switching (tc conformation, 53%)", "Keywords": ["Yclic peptide", " non natural amino acids", " cis/trans", " switch peptides", "", "De novo design", "Membrane permeability", "De novo protein"], "authors": ["T.a.ramelot", "R.tejero", "G.t.montelione"], "pmid": "36041435", "doi": "10.1016/J.CELL.2022.07.019"}
- {"pdb": "2CW7", "title": "crystal structure of intein homing endonuclease ii", "authors": ["H.Matsumura", "H.Takahashi", "T.Inoue", "H.Hashimoto", "M.Nishioka", "S.Fujiwara", "M.Takagi", "T.Imanaka", "Y.Kai"], "doi": "10.1002/PROT.20858", "pmid": "16493661"}
{"classification": "Hydrolase", "pdb": "3R6M", "deposition_date": "2011-03-21", "title": "Crystal structure of vibrio parahaemolyticus yeaz", "Keywords": ["Ctin/hsp70 nucleotide-binding fold", " bacterial resuscitation", " viable", "But non-culturable state", "Resuscitation promoting factor", "Ygjd", "", "Yjee", "Vibrio parahaemolyticus", "Hydrolase"], "authors": ["A.roujeinikova", "I.aydin"], "pmid": "21858042", "doi": "10.1371/JOURNAL.PONE.0023245"}
- {"pdb": "1CWU", "title": "brassica napus enoyl acp reductase a138g mutant complexed with nad+ and thienodiazaborine", "authors": ["A.Roujeinikova", "J.B.Rafferty", "D.W.Rice"], "doi": "10.1074/JBC.274.43.30811", "pmid": "10521472"}
{"classification": "Hydrolase", "pdb": "2W5J", "deposition_date": "2008-12-10", "title": "Structure of the c14-rotor ring of the proton translocating Chloroplast atp synthase", "Keywords": ["Ydrolase", " chloroplast", " atp synthase", " lipid-binding", " cf(0)", " membrane", "", "Transport", "Formylation", "Energy transduction", "Hydrogen ion transport", "", "Ion transport", "Transmembrane", "Membrane protein"], "authors": ["M.vollmar", "D.schlieper", "M.winn", "C.buechner", "G.groth"], "pmid": "19423706", "doi": "10.1074/JBC.M109.006916"}
- {"pdb": "3CWN", "title": "escherichia coli transaldolase b mutant f178y", "authors": ["T.Sandalova", "G.Schneider", "A.Samland"], "doi": "10.1074/JBC.M803184200", "pmid": "18687684"}
{"classification": "De novo protein", "pdb": "4GLU", "deposition_date": "2012-08-14", "title": "Crystal structure of the mirror image form of vegf-a", "Keywords": ["-protein", " covalent dimer", " cysteine knot protein", " growth factor", " de", "Novo protein"], "authors": ["K.mandal", "M.uppalapati", "D.ault-riche", "J.kenney", "J.lowitz", "S.sidhu", "", "S.b.h.kent"], "pmid": "22927390", "doi": "10.1073/PNAS.1210483109"}
- {"pdb": "1CWL", "title": "human cyclophilin a complexed with 4 4-hydroxy-meleu cyclosporin", "authors": ["V.Mikol", "J.Kallen", "P.Taylor", "M.D.Walkinshaw"], "doi": "10.1006/JMBI.1998.2108", "pmid": "9769216"}
{"classification": "Hydrolase/hydrolase inhibitor", "pdb": "3WYL", "deposition_date": "2014-09-01", "title": "Crystal structure of the catalytic domain of pde10a complexed with 5- Methoxy-3-(1-phenyl-1h-pyrazol-5-yl)-1-(3-(trifluoromethyl)phenyl) Pyridazin-4(1h)-one", "Keywords": ["Ydrolase-hydrolase inhibitor complex"], "authors": ["H.oki", "Y.hayano"], "pmid": "25384088", "doi": "10.1021/JM5013648"}
- {"pdb": "3CW2", "title": "crystal structure of the intact archaeal translation initiation factor 2 from sulfolobus solfataricus .", "authors": ["E.A.Stolboushkina", "S.V.Nikonov", "A.D.Nikulin", "U.Blaesi", "D.J.Manstein", "R.V.Fedorov", "M.B.Garber", "O.S.Nikonov"], "doi": "10.1016/J.JMB.2008.07.039", "pmid": "18675278"}
{"classification": "Isomerase", "pdb": "5BOR", "deposition_date": "2015-05-27", "title": "Structure of acetobacter aceti pure-s57c, sulfonate form", "Keywords": ["Cidophile", " pure", " purine biosynthesis", " isomerase"], "authors": ["K.l.sullivan", "T.j.kappock"]}
- {"pdb": "3CW9", "title": "4-chlorobenzoyl-coa ligase/synthetase in the thioester-forming conformation, bound to 4-chlorophenacyl-coa", "authors": ["A.S.Reger", "J.Cao", "R.Wu", "D.Dunaway-Mariano", "A.M.Gulick"], "doi": "10.1021/BI800696Y", "pmid": "18620418"}
{"classification": "Hydrolase", "pdb": "1X0C", "deposition_date": "2005-03-17", "title": "Improved crystal structure of isopullulanase from aspergillus niger Atcc 9642", "Keywords": ["Ullulan", " glycoside hydrolase family 49", " glycoprotein", " hydrolase"], "authors": ["M.mizuno", "T.tonozuka", "A.yamamura", "Y.miyasaka", "H.akeboshi", "S.kamitori", "", "A.nishikawa", "Y.sakano"], "pmid": "18155243", "doi": "10.1016/J.JMB.2007.11.098"}
- {"pdb": "3CWU", "title": "crystal structure of an alka host/guest complex 2'-fluoro-2'-deoxy-1, n6-ethenoadenine:thymine base pair", "authors": ["B.R.Bowman", "S.Lee", "S.Wang", "G.L.Verdine"], "doi": "10.1016/J.STR.2008.04.012", "pmid": "18682218"}
{"classification": "Oxidoreductase", "pdb": "7CUP", "deposition_date": "2020-08-23", "title": "Structure of 2,5-dihydroxypridine dioxygenase from pseudomonas putida Kt2440", "Keywords": ["On-heme dioxygenase", " oxidoreductase"], "authors": ["G.q.liu", "H.z.tang"]}
- {"pdb": "5CWF", "title": "crystal structure of de novo designed helical repeat protein dhr8", "authors": ["G.Bhabha", "D.C.Ekiert"], "doi": "10.1038/NATURE16162", "pmid": "26675729"}
{"classification": "Ligase", "pdb": "1VCN", "deposition_date": "2004-03-10", "title": "Crystal structure of t.th. hb8 ctp synthetase complex with sulfate Anion", "Keywords": ["Etramer", " riken structural genomics/proteomics initiative", " rsgi", "", "Structural genomics", "Ligase"], "authors": ["M.goto", "Riken structural genomics/proteomics initiative (rsgi)"], "pmid": "15296735", "doi": "10.1016/J.STR.2004.05.013"}
{"classification": "Transferase/transferase inhibitor", "pdb": "6C9V", "deposition_date": "2018-01-28", "title": "Mycobacterium tuberculosis adenosine kinase bound to (2r,3s,4r,5r)-2- (hydroxymethyl)-5-(6-(4-phenylpiperazin-1-yl)-9h-purin-9-yl) Tetrahydrofuran-3,4-diol", "Keywords": ["Ucleoside analog", " complex", " inhibitor", " structural genomics", " psi-2", "", "Protein structure initiative", "Tb structural genomics consortium", "", "Tbsgc", "Transferase-transferase inhibitor complex"], "authors": ["R.a.crespo", "Tb structural genomics consortium (tbsgc)"], "pmid": "31002508", "doi": "10.1021/ACS.JMEDCHEM.9B00020"}
{"classification": "De novo protein", "pdb": "4LPY", "deposition_date": "2013-07-16", "title": "Crystal structure of tencon variant g10", "Keywords": ["Ibronectin type iii fold", " alternate scaffold", " de novo protein"], "authors": ["A.teplyakov", "G.obmolova", "G.l.gilliland"], "pmid": "24375666", "doi": "10.1002/PROT.24502"}
{"classification": "Isomerase", "pdb": "2Y88", "deposition_date": "2011-02-03", "title": "Crystal structure of mycobacterium tuberculosis phosphoribosyl Isomerase (variant d11n) with bound prfar", "Keywords": ["Romatic amino acid biosynthesis", " isomerase", " tim-barrel", " histidine", "Biosynthesis", "Tryptophan biosynthesis"], "authors": ["J.kuper", "A.v.due", "A.geerlof", "M.wilmanns"], "pmid": "21321225", "doi": "10.1073/PNAS.1015996108"}
{"classification": "Unknown function", "pdb": "1SR0", "deposition_date": "2004-03-22", "title": "Crystal structure of signalling protein from sheep(sps-40) at 3.0a Resolution using crystal grown in the presence of polysaccharides", "Keywords": ["Ignalling protein", " involution", " unknown function"], "authors": ["D.b.srivastava", "A.s.ethayathulla", "N.singh", "J.kumar", "S.sharma", "T.p.singh"]}
{"classification": "Dna binding protein", "pdb": "3RH2", "deposition_date": "2011-04-11", "title": "Crystal structure of a tetr-like transcriptional regulator (sama_0099) From shewanella amazonensis sb2b at 2.42 a resolution", "Keywords": ["Na/rna-binding 3-helical bundle", " structural genomics", " joint center", "For structural genomics", "Jcsg", "Protein structure initiative", "Psi-", "Biology", "Dna binding protein"], "authors": ["Joint center for structural genomics (jcsg)"]}
{"classification": "Transferase", "pdb": "2WK5", "deposition_date": "2009-06-05", "title": "Structural features of native human thymidine phosphorylase And in complex with 5-iodouracil", "Keywords": ["Lycosyltransferase", " developmental protein", " angiogenesis", "", "5-iodouracil", "Growth factor", "Enzyme kinetics", "", "Differentiation", "Disease mutation", "Thymidine", "Phosphorylase", "Chemotaxis", "Transferase", "Mutagenesis", "", "Polymorphism"], "authors": ["E.mitsiki", "A.c.papageorgiou", "S.iyer", "N.thiyagarajan", "S.h.prior", "", "D.sleep", "C.finnis", "K.r.acharya"], "pmid": "19555658", "doi": "10.1016/J.BBRC.2009.06.104"}
{"classification": "Hydrolase", "pdb": "3P9Y", "deposition_date": "2010-10-18", "title": "Crystal structure of the drosophila melanogaster ssu72-pctd complex", "Keywords": ["Hosphatase", " cis proline", " lmw ptp-like fold", " rna polymerase ii ctd", "", "Hydrolase"], "authors": ["J.w.werner-allen", "P.zhou"], "pmid": "21159777", "doi": "10.1074/JBC.M110.197129"}
{"classification": "Recombination/dna", "pdb": "6OEO", "deposition_date": "2019-03-27", "title": "Cryo-em structure of mouse rag1/2 nfc complex (dna1)", "Keywords": ["(d)j recombination", " dna transposition", " rag", " scid", " recombination", "", "Recombination-dna complex"], "authors": ["X.chen", "Y.cui", "Z.h.zhou", "W.yang", "M.gellert"], "pmid": "32015552", "doi": "10.1038/S41594-019-0363-2"}
{"classification": "Hydrolase", "pdb": "4ECA", "deposition_date": "1997-02-21", "title": "Asparaginase from e. coli, mutant t89v with covalently bound aspartate", "Keywords": ["Ydrolase", " acyl-enzyme intermediate", " threonine amidohydrolase"], "authors": ["G.j.palm", "J.lubkowski", "A.wlodawer"], "pmid": "8706862", "doi": "10.1016/0014-5793(96)00660-6"}
{"classification": "Transcription/protein binding", "pdb": "3UVX", "deposition_date": "2011-11-30", "title": "Crystal structure of the first bromodomain of human brd4 in complex With a diacetylated histone 4 peptide (h4k12ack16ac)", "Keywords": ["Romodomain", " bromodomain containing protein 4", " cap", " hunk1", " mcap", "", "Mitotic chromosome associated protein", "Peptide complex", "Structural", "Genomics consortium", "Sgc", "Transcription-protein binding complex"], "authors": ["P.filippakopoulos", "S.picaud", "T.keates", "E.ugochukwu", "F.von delft", "", "C.h.arrowsmith", "A.m.edwards", "J.weigelt", "C.bountra", "S.knapp", "Structural", "Genomics consortium (sgc)"], "pmid": "22464331", "doi": "10.1016/J.CELL.2012.02.013"}
{"classification": "Membrane protein", "pdb": "1TLZ", "deposition_date": "2004-06-10", "title": "Tsx structure complexed with uridine", "Keywords": ["Ucleoside transporter", " beta barrel", " uridine", " membrane", "Protein"], "authors": ["J.ye", "B.van den berg"], "pmid": "15272310", "doi": "10.1038/SJ.EMBOJ.7600330"}
{"classification": "Dna binding protein", "pdb": "7AZD", "deposition_date": "2020-11-16", "title": "Dna polymerase sliding clamp from escherichia coli with peptide 20 Bound", "Keywords": ["Ntibacterial drug", " dna binding protein"], "authors": ["C.monsarrat", "G.compain", "C.andre", "I.martiel", "S.engilberge", "V.olieric", "", "P.wolff", "K.brillet", "M.landolfo", "C.silva da veiga", "J.wagner", "G.guichard", "", "D.y.burnouf"], "pmid": "34806883", "doi": "10.1021/ACS.JMEDCHEM.1C00918"}
{"classification": "Transferase", "pdb": "5N3K", "deposition_date": "2017-02-08", "title": "Camp-dependent protein kinase a from cricetulus griseus in complex With fragment like molecule o-guanidino-l-homoserine", "Keywords": ["Ragment", " complex", " transferase", " serine threonine kinase", " camp", "", "Kinase", "Pka"], "authors": ["C.siefker", "A.heine", "G.klebe"]}
{"classification": "Biosynthetic protein", "pdb": "8H52", "deposition_date": "2022-10-11", "title": "Crystal structure of helicobacter pylori carboxyspermidine Dehydrogenase in complex with nadp", "Keywords": ["Arboxyspermidine dehydrogenase", " biosynthetic protein"], "authors": ["K.y.ko", "S.c.park", "S.y.cho", "S.i.yoon"], "pmid": "36283333", "doi": "10.1016/J.BBRC.2022.10.049"}
{"classification": "Metal binding protein", "pdb": "6DYC", "deposition_date": "2018-07-01", "title": "Co(ii)-bound structure of the engineered cyt cb562 variant, ch3", "Keywords": ["Esigned protein", " 4-helix bundle", " electron transport", " metal binding", "Protein"], "authors": ["F.a.tezcan", "J.rittle"], "pmid": "30778140", "doi": "10.1038/S41557-019-0218-9"}
{"classification": "Protein fibril", "pdb": "6A6B", "deposition_date": "2018-06-27", "title": "Cryo-em structure of alpha-synuclein fiber", "Keywords": ["Lpha-syn fiber", " parkinson disease", " protein fibril"], "authors": ["Y.w.li", "C.y.zhao", "F.luo", "Z.liu", "X.gui", "Z.luo", "X.zhang", "D.li", "C.liu", "X.li"], "pmid": "30065316", "doi": "10.1038/S41422-018-0075-X"}
{"classification": "Dna", "pdb": "7D5E", "deposition_date": "2020-09-25", "title": "Left-handed g-quadruplex containing two bulges", "Keywords": ["-quadruplex", " bulge", " dna", " left-handed"], "authors": ["P.das", "A.maity", "K.h.ngo", "F.r.winnerdy", "B.bakalar", "Y.mechulam", "E.schmitt", "", "A.t.phan"], "pmid": "33503265", "doi": "10.1093/NAR/GKAA1259"}
{"classification": "Transferase", "pdb": "3RSY", "deposition_date": "2011-05-02", "title": "Cellobiose phosphorylase from cellulomonas uda in complex with sulfate And glycerol", "Keywords": ["H94", " alpha barrel", " cellobiose phosphorylase", " disaccharide", "Phosphorylase", "Transferase"], "authors": ["A.van hoorebeke", "J.stout", "W.soetaert", "J.van beeumen", "T.desmet", "S.savvides"]}
{"classification": "Oxidoreductase", "pdb": "7MCI", "deposition_date": "2021-04-02", "title": "Mofe protein from azotobacter vinelandii with a sulfur-replenished Cofactor", "Keywords": ["Zotobacter vinelandii", " mofe-protein", " nitrogenase", " oxidoreductase"], "authors": ["W.kang", "C.lee", "Y.hu", "M.w.ribbe"], "doi": "10.1038/S41929-022-00782-7"}
{"classification": "Dna", "pdb": "1XUW", "deposition_date": "2004-10-26", "title": "Structural rationalization of a large difference in rna affinity Despite a small difference in chemistry between two 2'-o-modified Nucleic acid analogs", "Keywords": ["Na mimetic methylcarbamate amide analog", " dna"], "authors": ["R.pattanayek", "L.sethaphong", "C.pan", "M.prhavc", "T.p.prakash", "M.manoharan", "", "M.egli"], "pmid": "15547979", "doi": "10.1021/JA044637K"}
{"classification": "Lyase", "pdb": "7C0D", "deposition_date": "2020-05-01", "title": "Crystal structure of azospirillum brasilense l-2-keto-3-deoxyarabonate Dehydratase (hydroxypyruvate-bound form)", "Keywords": ["-2-keto-3-deoxyarabonate dehydratase", " lyase"], "authors": ["Y.watanabe", "S.watanabe"], "pmid": "32697085", "doi": "10.1021/ACS.BIOCHEM.0C00515"}
{"classification": "Signaling protein", "pdb": "5LYK", "deposition_date": "2016-09-28", "title": "Crystal structure of intracellular b30.2 domain of btn3a1 bound to Citrate", "Keywords": ["30.2", " butyrophilin", " signaling protein"], "authors": ["F.mohammed", "A.t.baker", "M.salim", "B.e.willcox"], "pmid": "28862425", "doi": "10.1021/ACSCHEMBIO.7B00694"}
{"classification": "Toxin", "pdb": "4IZL", "deposition_date": "2013-01-30", "title": "Structure of the n248a mutant of the panton-valentine leucocidin s Component from staphylococcus aureus", "Keywords": ["I-component leucotoxin", " staphylococcus aureus", " s component", "Leucocidin", "Beta-barrel pore forming toxin", "Toxin"], "authors": ["L.maveyraud", "B.j.laventie", "G.prevost", "L.mourey"], "pmid": "24643034", "doi": "10.1371/JOURNAL.PONE.0092094"}
{"classification": "Dna", "pdb": "6F3C", "deposition_date": "2017-11-28", "title": "The cytotoxic [pt(h2bapbpy)] platinum complex interacting with the Cgtacg hexamer", "Keywords": ["Rug-dna complex", " four-way junction", " dna"], "authors": ["M.ferraroni", "C.bazzicalupi", "P.gratteri", "F.papi"], "pmid": "31046177", "doi": "10.1002/ANIE.201814532"}
{"classification": "Signaling protein/inhibitor", "pdb": "4L5M", "deposition_date": "2013-06-11", "title": "Complexe of arno sec7 domain with the protein-protein interaction Inhibitor n-(4-hydroxy-2,6-dimethylphenyl)benzenesulfonamide at ph6.5", "Keywords": ["Ec-7domain", " signaling protein-inhibitor complex"], "authors": ["F.hoh", "J.rouhana"], "pmid": "24112024", "doi": "10.1021/JM4009357"}
{"classification": "Signaling protein", "pdb": "5I6J", "deposition_date": "2016-02-16", "title": "Crystal structure of srgap2 f-barx", "Keywords": ["Rgap2", " f-bar", " fx", " signaling protein"], "authors": ["M.sporny", "J.guez-haddad", "M.n.isupov", "Y.opatowsky"], "pmid": "28333212", "doi": "10.1093/MOLBEV/MSX094"}
{"classification": "Metal binding protein", "pdb": "1Q80", "deposition_date": "2003-08-20", "title": "Solution structure and dynamics of nereis sarcoplasmic calcium binding Protein", "Keywords": ["Ll-alpha", " metal binding protein"], "authors": ["G.rabah", "R.popescu", "J.a.cox", "Y.engelborghs", "C.t.craescu"], "pmid": "15819893", "doi": "10.1111/J.1742-4658.2005.04629.X"}
{"classification": "Transferase", "pdb": "1TW1", "deposition_date": "2004-06-30", "title": "Beta-1,4-galactosyltransferase mutant met344his (m344h-gal-t1) complex With udp-galactose and magnesium", "Keywords": ["Et344his mutation; closed conformation; mn binding", " transferase"], "authors": ["B.ramakrishnan", "E.boeggeman", "P.k.qasba"], "pmid": "15449940", "doi": "10.1021/BI049007+"}
{"classification": "Rna", "pdb": "2PN4", "deposition_date": "2007-04-23", "title": "Crystal structure of hepatitis c virus ires subdomain iia", "Keywords": ["Cv", " ires", " subdoamin iia", " rna", " strontium", " hepatitis"], "authors": ["Q.zhao", "Q.han", "C.r.kissinger", "P.a.thompson"], "pmid": "18391410", "doi": "10.1107/S0907444908002011"}


@@ -1,6 +1,36 @@
- {"pid": "Q6GZX4", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 41."}], "title": "Putative transcription factor 001R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 43"}], "title": "Putative transcription factor 001R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
- {"pid": "Q6GZX3", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 42."}], "title": "Uncharacterized protein 002L;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 45"}], "title": "Uncharacterized protein 002L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
- {"pid": "Q197F8", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 27."}], "title": "Uncharacterized protein 002R;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 29"}], "title": "Uncharacterized protein 002R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
- {"pid": "Q197F7", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 23."}], "title": "Uncharacterized protein 003L;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 23"}], "title": "Uncharacterized protein 003L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
- {"pid": "Q6GZX2", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 36."}], "title": "Uncharacterized protein 3R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 3R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
- {"pid": "Q6GZX1", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 34."}], "title": "Uncharacterized protein 004R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 38"}], "title": "Uncharacterized protein 004R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 005L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZX0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 47"}], "title": "Uncharacterized protein 005R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G88", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-06-28", "date_info": "entry version 53"}], "title": "Putative KilA-N domain-containing protein 006L", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 34"}], "title": "Uncharacterized protein 006R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 32"}], "title": "Uncharacterized protein 007R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F3", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 007R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F2", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 22"}], "title": "Uncharacterized protein 008L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 67"}], "title": "Putative helicase 009L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G85", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 38"}], "title": "Uncharacterized protein 009R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 010R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E9", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 011L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 011R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 012L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 37"}], "title": "Uncharacterized protein IIV3-013L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 30"}], "title": "Uncharacterized protein 013R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 014R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 50"}], "title": "Uncharacterized protein 015R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 017L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV7", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 018L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 87"}], "title": "Putative serine/threonine-protein kinase 019R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 40"}], "title": "Uncharacterized protein 020R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 021L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-12-14", "date_info": "entry version 35"}], "title": "Transmembrane protein 022L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 023R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 25"}], "title": "Uncharacterized protein 023R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 024R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 24"}], "title": "Uncharacterized protein 025R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q91G70", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 026R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZU9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 49"}], "title": "Uncharacterized protein 027R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZU8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 55"}], "title": "Uncharacterized protein 028R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}


@@ -26,7 +26,7 @@ class MAGMappingTest {
@Test
def mappingMagType(): Unit = {
-   checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = false, "Other literature type")
    checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = true, "Other literature type")
    checkResult[Publication](
      MagUtility.createResultFromType(Some("BookChapter"), null),
      invisible = false,
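The flipped expectation above encodes a policy change: a MAG record with no usable document type is still mapped, but flagged as invisible. A hypothetical, self-contained Java sketch of that rule; the names and types are illustrative stand-ins, not MagUtility's API:

public class MagTypeMappingSketch {

    static final class ResultStub {
        String resultType;
        boolean invisible;
    }

    static ResultStub fromMagType(String magType) {
        ResultStub r = new ResultStub();
        if (magType == null || magType.trim().isEmpty()) {
            // no usable MAG document type: keep the record, but hide it
            r.resultType = "Other literature type";
            r.invisible = true;
        } else {
            r.resultType = magType;
            r.invisible = false;
        }
        return r;
    }
}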


@@ -17,45 +17,6 @@ import eu.dnetlib.pace.tree.support.TreeStats;
class DecisionTreeTest {
-   @Test
-   void testJPath() throws IOException {
-       DedupConfig conf = DedupConfig
-           .load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
-       final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
-       Row row = SparkModel.apply(conf).rowFromJson(org);
-       System.out.println("row = " + row);
-       Assertions.assertNotNull(row);
-       Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
-       System.out.println("row = " + row.getAs("countrytitle"));
-   }
-   @Test
-   void jsonToModelTest() throws IOException {
-       DedupConfig conf = DedupConfig
-           .load(
-               IOUtils
-                   .toString(
-                       SparkOpenorgsDedupTest.class
-                           .getResourceAsStream(
-                               "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
-       final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
-       Row row = SparkModel.apply(conf).rowFromJson(org);
-       // to check that the same parsing returns the same row
-       Row row1 = SparkModel.apply(conf).rowFromJson(org);
-       Assertions.assertEquals(row, row1);
-       System.out.println("row = " + row);
-       Assertions.assertNotNull(row);
-       Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
-   }
@Test
void organizationDecisionTreeTest() throws Exception {
    DedupConfig conf = DedupConfig


@@ -452,18 +452,18 @@ public class SparkDedupTest implements Serializable {
    assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
    assertEquals(ModelConstants.DEDUP, r.getSubRelType());
    assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
-   assertTrue(dups.contains(r.getTarget()));
    assertFalse(dups.contains(r.getTarget()));
});
final List<Relation> mergedIn = pubs
    .filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
    .collectAsList();
- assertEquals(3, mergedIn.size());
assertEquals(1, mergedIn.size());
mergedIn.forEach(r -> {
    assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
    assertEquals(ModelConstants.DEDUP, r.getSubRelType());
-   assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
    assertEquals(ModelConstants.MERGES, r.getRelClass());
-   assertTrue(dups.contains(r.getSource()));
    assertFalse(dups.contains(r.getSource()));
});
System.out.println("orgs_mergerel = " + orgs_mergerel);
@@ -473,8 +473,8 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_mergerel = " + orp_mergerel);
if (CHECK_CARDINALITIES) {
-   assertEquals(1268, orgs_mergerel);
    assertEquals(1278, orgs_mergerel);
-   assertEquals(1156, pubs.count());
    assertEquals(1158, pubs.count());
    assertEquals(292, sw_mergerel);
    assertEquals(476, ds_mergerel);
    assertEquals(742, orp_mergerel);


@@ -241,7 +241,6 @@ public class SparkPublicationRootsTest implements Serializable {
    verifyRoot_case_1(roots, pubs);
    verifyRoot_case_2(roots, pubs);
-   verifyRoot_case_3(roots, pubs);
}
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
@@ -322,34 +321,6 @@ public class SparkPublicationRootsTest implements Serializable {
    assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
}
-   private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
-       Publication root = roots
-           .filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
-           .first();
-       assertNotNull(root);
-       Publication pivot_duplicate = pubs
-           .filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
-           .first();
-       assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
-       Set<String> dups_cf = pubs
-           .collectAsList()
-           .stream()
-           .flatMap(p -> p.getCollectedfrom().stream())
-           .map(KeyValue::getValue)
-           .collect(Collectors.toCollection(HashSet::new));
-       Set<String> root_cf = root
-           .getCollectedfrom()
-           .stream()
-           .map(KeyValue::getValue)
-           .collect(Collectors.toCollection(HashSet::new));
-       assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
-   }
@Test
@Order(6)
void updateEntityTest() throws Exception {


@@ -143,7 +143,9 @@ public class SparkPublicationRootsTest2 implements Serializable {
    "--graphBasePath", graphInputPath,
    "--actionSetId", testActionSetId,
    "--isLookUpUrl", "lookupurl",
-   "--workingPath", workingPath
    "--workingPath", workingPath,
    "--hiveMetastoreUris", "none",
    "--pivotHistoryDatabase", ""
}), spark)
    .run(isLookUpService);
@@ -153,7 +155,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
    .as(Encoders.bean(Relation.class));
assertEquals(
-   3, merges
    4, merges
        .filter("relclass == 'isMergedIn'")
        .map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
        .distinct()
@@ -178,7 +180,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
    .textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
    .map(asEntity(Publication.class), Encoders.bean(Publication.class));
- assertEquals(3, roots.count());
assertEquals(4, roots.count());
final Dataset<Publication> pubs = spark
    .read()
@@ -195,7 +197,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
    .collectAsList()
    .get(0);
- assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
assertEquals("2022-01-01", root.getDateofacceptance().getValue());
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());


@@ -168,7 +168,7 @@ public class SparkStatsTest implements Serializable {
    .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
    .count();
- assertEquals(414, orgs_blocks);
assertEquals(412, orgs_blocks);
assertEquals(221, pubs_blocks);
assertEquals(134, sw_blocks);
assertEquals(196, ds_blocks);


@@ -73,12 +73,6 @@
    "name": "Irish Nephrology Society",
    "synonym": []
},
-   {
-       "id": "100011062",
-       "uri": "http://dx.doi.org/10.13039/100011062",
-       "name": "Asian Spinal Cord Network",
-       "synonym": []
-   },
{
    "id": "100011096",
    "uri": "http://dx.doi.org/10.13039/100011096",
@@ -223,12 +217,6 @@
    "name": "Global Brain Health Institute",
    "synonym": []
},
-   {
-       "id": "100015776",
-       "uri": "http://dx.doi.org/10.13039/100015776",
-       "name": "Health and Social Care Board",
-       "synonym": []
-   },
{
    "id": "100015992",
    "uri": "http://dx.doi.org/10.13039/100015992",
@@ -403,18 +391,6 @@
    "name": "Irish Hospice Foundation",
    "synonym": []
},
-   {
-       "id": "501100001596",
-       "uri": "http://dx.doi.org/10.13039/501100001596",
-       "name": "Irish Research Council for Science, Engineering and Technology",
-       "synonym": []
-   },
-   {
-       "id": "501100001597",
-       "uri": "http://dx.doi.org/10.13039/501100001597",
-       "name": "Irish Research Council for the Humanities and Social Sciences",
-       "synonym": []
-   },
{
    "id": "501100001598",
    "uri": "http://dx.doi.org/10.13039/501100001598",
@@ -515,7 +491,7 @@
    "id": "501100002081",
    "uri": "http://dx.doi.org/10.13039/501100002081",
    "name": "Irish Research Council",
-   "synonym": []
    "synonym": ["501100001596", "501100001597"]
},
{
    "id": "501100002736",


@@ -560,7 +560,15 @@ case object Crossref2Oaf {
    "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
    "10.13039/501100013589" | "10.13039/501100000271" =>
      generateSimpleRelationFromAward(funder, "ukri________", a => a)
    //HFRI
    case "10.13039/501100013209" =>
      generateSimpleRelationFromAward(funder, "hfri________", a => a)
      val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
      queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
      queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
    //ERASMUS+
    case "10.13039/501100010790" =>
      generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
    case _ => logger.debug("no match for " + funder.DOI.get)
}
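Besides the per-award links produced by generateSimpleRelationFromAward, the HFRI branch above also attaches each result to a funder-level project node through a symmetric pair of relations. A stripped-down Java sketch of that pairing; the Rel type and relation-class strings are stand-ins for the project's Relation class and ModelConstants values:

import java.util.ArrayList;
import java.util.List;

public class FunderRelationSketch {

    static final class Rel {
        final String source, target, relClass;

        Rel(String source, String target, String relClass) {
            this.source = source;
            this.target = target;
            this.relClass = relClass;
        }
    }

    // emit the symmetric pair linking a result to a funder-level project node
    static List<Rel> link(String resultId, String projectId) {
        List<Rel> queue = new ArrayList<>();
        queue.add(new Rel(resultId, projectId, "isProducedBy"));
        queue.add(new Rel(projectId, resultId, "produces"));
        return queue;
    }
}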


@@ -313,7 +313,7 @@ case object ConversionUtil {
    if (f.author.DisplayName.isDefined)
      a.setFullname(f.author.DisplayName.get)
    if (f.affiliation != null)
-     a.setRawAffiliationString(List(f.affiliation).asJava)
      a.setAffiliation(List(asField(f.affiliation)).asJava)
    a.setPid(
      List(
        createSP(
@@ -386,7 +386,7 @@ case object ConversionUtil {
    a.setFullname(f.author.DisplayName.get)
    if (f.affiliation != null)
-     a.setRawAffiliationString(List(f.affiliation).asJava)
      a.setAffiliation(List(asField(f.affiliation)).asJava)
    a.setPid(
      List(

View File

@@ -13,13 +13,13 @@ public class CommunityContentprovider {
 	private String openaireId;
 	private SelectionConstraints selectioncriteria;
-	private String enabled;
+	private Boolean enabled;

-	public String getEnabled() {
+	public Boolean getEnabled() {
 		return enabled;
 	}

-	public void setEnabled(String enabled) {
+	public void setEnabled(Boolean enabled) {
 		this.enabled = enabled;
 	}
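
Typing the enabled flag as Boolean instead of String lets the JSON literals true/false bind directly during deserialization. A self-contained sketch of the effect, assuming Jackson databind is the mapper in play (the codebase uses ObjectMapper elsewhere in this compare); Provider is a cut-down stand-in for CommunityContentprovider:

import com.fasterxml.jackson.databind.ObjectMapper;

public class EnabledFlagDemo {
	// Cut-down stand-in: with a Boolean field, Jackson maps the JSON literal
	// true/false directly, where a String field forced "enabled": "true" style values.
	public static class Provider {
		private Boolean enabled;
		public Boolean getEnabled() { return enabled; }
		public void setEnabled(Boolean enabled) { this.enabled = enabled; }
	}

	public static void main(String[] args) throws Exception {
		Provider p = new ObjectMapper().readValue("{\"enabled\": true}", Provider.class);
		System.out.println(p.getEnabled()); // prints: true
	}
}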

View File

@@ -53,6 +53,8 @@ public class Constraints implements Serializable {
 		for (Constraint sc : constraint) {
 			boolean verified = false;
+			if (!param.containsKey(sc.getField()))
+				return false;
 			for (String value : param.get(sc.getField())) {
 				if (sc.verifyCriteria(value.trim())) {
 					verified = true;
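
The added containsKey guard makes a constraint on an absent field fail the whole verification instead of throwing a NullPointerException: without it, param.get(sc.getField()) would return null and the inner for-each would crash. A self-contained sketch of that behaviour, with a simple string comparison standing in for the real Constraint.verifyCriteria:

import java.util.List;
import java.util.Map;

public class ConstraintGuardDemo {
	static boolean verify(Map<String, List<String>> param, String field, String expected) {
		if (!param.containsKey(field))
			return false; // absent field: the constraint cannot be satisfied
		for (String value : param.get(field)) {
			if (expected.equals(value.trim()))
				return true;
		}
		return false;
	}

	public static void main(String[] args) {
		Map<String, List<String>> param = Map.of("subject", List.of("energy"));
		System.out.println(verify(param, "subject", "energy")); // true
		System.out.println(verify(param, "missing", "energy")); // false, no NPE
	}
}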

View File

@@ -14,6 +14,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
@@ -84,19 +85,26 @@ public class SparkCountryPropagationJob {
 		Dataset<R> res = readPath(spark, sourcePath, resultClazz);

 		log.info("Reading prepared info: {}", preparedInfoPath);
-		Dataset<ResultCountrySet> prepared = spark
+		final Dataset<Row> preparedInfoRaw = spark
 			.read()
-			.json(preparedInfoPath)
-			.as(Encoders.bean(ResultCountrySet.class));
+			.json(preparedInfoPath);

-		res
-			.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
-			.map(getCountryMergeFn(), Encoders.bean(resultClazz))
-			.write()
-			.option("compression", "gzip")
-			.mode(SaveMode.Overwrite)
-			.json(outputPath);
+		if (!preparedInfoRaw.isEmpty()) {
+			final Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
+			res
+				.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
+				.map(getCountryMergeFn(), Encoders.bean(resultClazz))
+				.write()
+				.option("compression", "gzip")
+				.mode(SaveMode.Overwrite)
+				.json(outputPath);
+		} else {
+			res
+				.write()
+				.option("compression", "gzip")
+				.mode(SaveMode.Overwrite)
+				.json(outputPath);
+		}
 	}

 	private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
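
The refactor above defers the bean cast until the input is known to be non-empty: a JSON read over an empty prepared-info path infers no schema, so projecting it onto ResultCountrySet would fail, and in that case the source results are written through unchanged. A minimal sketch of the guard (local Spark session, hypothetical path, and a cut-down ResultCountrySet stand-in):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class EmptyInputGuardDemo {
	// Cut-down stand-in for the job's ResultCountrySet bean.
	public static class ResultCountrySet {
		private String resultId;
		public String getResultId() { return resultId; }
		public void setResultId(String resultId) { this.resultId = resultId; }
	}

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();
		// Hypothetical path: only cast to the bean encoding when rows exist.
		Dataset<Row> raw = spark.read().json("/tmp/prepared-info");
		if (!raw.isEmpty()) {
			Dataset<ResultCountrySet> typed = raw.as(Encoders.bean(ResultCountrySet.class));
			typed.show();
		} else {
			System.out.println("no prepared info, passing input through unchanged");
		}
		spark.stop();
	}
}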

View File

@@ -147,6 +147,7 @@ public class CleanGraphSparkJob {
 				.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
 				.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
 				.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
+				.map((MapFunction<T, T>) GraphCleaningFunctions::dedicatedUglyHacks, Encoders.bean(clazz))
 				.filter((FilterFunction<T>) GraphCleaningFunctions::filter);

 		// read the master-duplicate tuples

View File

@@ -9,7 +9,10 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -22,6 +25,8 @@ public class GraphHiveTableImporterJob {
 	private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);

+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
 	public static void main(String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -69,12 +74,7 @@ public class GraphHiveTableImporterJob {
 	private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
 		Class<T> clazz, int numPartitions) {
-		final Encoder<T> clazzEncoder = Encoders.bean(clazz);
-		Dataset<Row> dataset = spark
-			.read()
-			.schema(clazzEncoder.schema())
-			.json(inputPath);
+		Dataset<String> dataset = spark.read().textFile(inputPath);

 		if (numPartitions > 0) {
 			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@@ -82,6 +82,7 @@ public class GraphHiveTableImporterJob {
 		}

 		dataset
+			.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.saveAsTable(tableIdentifier(hiveDbName, clazz));
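
The importer now reads the graph dump as plain text and deserializes each line with Jackson, rather than letting Spark project the JSON onto the bean schema at read time. A self-contained sketch of the read-then-map pattern (Pojo stands in for the Oaf model classes; the input path is illustrative):

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class TextFileImportDemo {
	public static class Pojo {
		private String id;
		public String getId() { return id; }
		public void setId(String id) { this.id = id; }
	}

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();
		// Each line is parsed with Jackson instead of Spark's schema projection.
		Dataset<Pojo> ds = spark
			.read()
			.textFile("/tmp/graph/publication") // hypothetical input path
			.map((MapFunction<String, Pojo>) s -> OBJECT_MAPPER.readValue(s, Pojo.class),
				Encoders.bean(Pojo.class));
		ds.printSchema();
		spark.stop();
	}
}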

View File

@@ -94,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 			author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
 		}
-		author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
+		author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info));
 		author.setPid(preparePids(n, info));
 		author.setRank(pos++);
 		res.add(author);

View File

@@ -85,7 +85,7 @@
 		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>

 	<fork name="fork_downloads_csv">
-		<path start="download_gold"/>
 		<path start="download_doaj_json"/>
 	</fork>
@@ -223,11 +223,13 @@
 			--executor-memory=${sparkExecutorMemory}
 			--executor-cores=${sparkExecutorCores}
 			--driver-memory=${sparkDriverMemory}
+			--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 			--conf spark.extraListeners=${spark2ExtraListeners}
 			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+			--conf spark.sql.shuffle.partitions=15000
 		</spark-opts>
 		<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
 		<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
@@ -253,11 +255,13 @@
 			--executor-memory=${sparkExecutorMemory}
 			--executor-cores=${sparkExecutorCores}
 			--driver-memory=${sparkDriverMemory}
+			--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 			--conf spark.extraListeners=${spark2ExtraListeners}
 			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+			--conf spark.sql.shuffle.partitions=15000
 		</spark-opts>
 		<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
 		<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
@@ -278,6 +282,7 @@
 			--executor-memory=${sparkExecutorMemory}
 			--executor-cores=${sparkExecutorCores}
 			--driver-memory=${sparkDriverMemory}
+			--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 			--conf spark.extraListeners=${spark2ExtraListeners}
 			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@@ -73,10 +73,14 @@ public class GraphHiveImporterJobTest {
 		GraphHiveImporterJob
 			.main(
 				new String[] {
-					"--isSparkSessionManaged", Boolean.FALSE.toString(),
-					"--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
-					"--hiveMetastoreUris", "",
-					"--hiveDbName", dbName
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
+					"-hiveMetastoreUris",
+					"",
+					"-hiveDbName",
+					dbName
 				});

 		ModelSupport.oafTypes

View File

@@ -406,15 +406,15 @@ class MappersTest {
 		assertEquals("Baracchini", author.get().getSurname());
 		assertEquals("Theo", author.get().getName());

-		assertEquals(1, author.get().getRawAffiliationString().size());
-		final Optional<String> opAff = author
+		assertEquals(1, author.get().getAffiliation().size());
+		final Optional<Field<String>> opAff = author
 			.get()
-			.getRawAffiliationString()
+			.getAffiliation()
 			.stream()
 			.findFirst();
 		assertTrue(opAff.isPresent());
-		final String affiliation = opAff.get();
-		assertEquals("ISTI-CNR", affiliation);
+		final Field<String> affiliation = opAff.get();
+		assertEquals("ISTI-CNR", affiliation.getValue());

 		assertFalse(d.getSubject().isEmpty());
 		assertFalse(d.getInstance().isEmpty());

View File

@@ -31,11 +31,5 @@ class ORCIDAuthorMatchersTest {
     assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
     // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
   }
-
-  @Test def testDocumentationNames(): Unit = {
-    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
-  }
-
-  @Test def testDocumentationNames2(): Unit = {
-    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller Jones"))
-  }
 }

View File

@@ -69,7 +69,7 @@
 		</configuration>
 	</global>

-	<start to="oaiphm_provision"/>
+	<start to="irish_oaiphm_provision"/>

 	<kill name="Kill">
 		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>

View File

@@ -67,7 +67,7 @@ public class PrepareRelationsJobTest {
 	@Test
 	void testRunPrepareRelationsJob(@TempDir Path testPath) throws Exception {
-		final int maxRelations = 20;
+		final int maxRelations = 5;
 		PrepareRelationsJob
 			.main(
 				new String[] {
@@ -86,7 +86,7 @@ public class PrepareRelationsJobTest {
 			.as(Encoders.bean(Relation.class))
 			.cache();

-		assertEquals(maxRelations, out.count());
+		assertEquals(44, out.count());

 		Dataset<Row> freq = out
 			.toDF()
@@ -101,12 +101,8 @@ public class PrepareRelationsJobTest {
 		long affiliation = getRows(freq, AFFILIATION).get(0).getAs("count");

 		assertEquals(outcome, participation);
-		assertTrue(outcome > affiliation);
-		assertTrue(participation > affiliation);
-		assertEquals(7, outcome);
-		assertEquals(7, participation);
-		assertEquals(6, affiliation);
+		assertEquals(outcome, affiliation);
+		assertEquals(4, affiliation);
 	}

 	protected List<Row> getRows(Dataset<Row> freq, String col) {

View File

@@ -91,9 +91,6 @@ class SolrRecordDumpJobTest {
 	public void prepareMocks() throws ISLookUpException, IOException {
 		isLookupClient.setIsLookup(isLookUpService);

-		Mockito
-			.when(isLookupClient.getDsId(Mockito.anyString()))
-			.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
 		Mockito
 			.when(isLookupClient.getLayoutSource(Mockito.anyString()))
 			.thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));

View File

@@ -48,16 +48,25 @@
 			<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
 			<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
 			<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
-			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
+			<case to="clear-working-dir">${wf:conf('resume') eq "start"}</case>

 			<!-- Aggregation of impact scores on the project level -->
 			<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
 			<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>
-			<default to="create-openaire-ranking-graph" />
+			<default to="clear-working-dir" />
 		</switch>
 	</decision>

+	<action name="clear-working-dir">
+		<fs>
+			<delete path="${workingDir}"/>
+			<mkdir path="${workingDir}"/>
+		</fs>
+		<ok to="create-openaire-ranking-graph"/>
+		<error to="clear-working-dir-fail"/>
+	</action>
+
 	<!-- initial step: create citation network -->
 	<action name="create-openaire-ranking-graph">
 		<spark xmlns="uri:oozie:spark-action:0.2">
@@ -618,6 +627,10 @@
 		<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>

+	<kill name="clear-working-dir-fail">
+		<message>Re-create working dir failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+	</kill>
+
 	<!-- Define ending node -->
 	<end name="end" />
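
The new clear-working-dir action resets ${workingDir} before the ranking graph is rebuilt, so a resumed run cannot pick up stale intermediate data. The same delete-then-recreate step expressed with the Hadoop FileSystem API, as a sketch with an illustrative path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ClearWorkingDirDemo {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		Path workingDir = new Path("/user/oozie/impact-indicators/working"); // hypothetical
		fs.delete(workingDir, true); // recursive delete, like <delete path=.../>
		fs.mkdirs(workingDir);       // like <mkdir path=.../>
	}
}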

View File

@@ -32,7 +32,7 @@ select distinct * from (
 	from SOURCE.result r
 	join SOURCE.result_projects rp on rp.id=r.id
 	join SOURCE.project p on p.id=rp.project
-	join TARGET.irish_funders irf on irf.funder=p.funder
+	join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
 	union all
 	select r.*
 	from SOURCE.result r

View File

@@ -1,3 +1,79 @@
+--drop database if exists TARGET cascade;
+--create database if not exists TARGET;
+--
+--create view if not exists TARGET.category as select * from SOURCE.category;
+--create view if not exists TARGET.concept as select * from SOURCE.concept;
+--create view if not exists TARGET.context as select * from SOURCE.context;
+--create view if not exists TARGET.country as select * from SOURCE.country;
+--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
+--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
+--create view if not exists TARGET.funder as select * from SOURCE.funder;
+--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
+--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
+--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
+--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
+--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
+--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
+--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
+--
+--create table TARGET.result stored as parquet as
+--    select distinct * from (
+--        select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
+--        union all
+--        select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
+--        union all
+--        select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
+--            'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
+--            'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
+--            'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
+--            'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
+--            'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
+--            'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
+--            'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
+--            'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
+--            'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
+--            'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
+--            -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
+--            'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
+--            'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
+--            'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
+--            'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
+--            'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
+--            'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
+--            'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
+--            'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
+--            'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
+--            'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
+--            'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
+--            'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
+--            'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
+--            'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
+--            'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
+--            'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
+--            'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
+--            'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
+--            'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
+--            'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
+--            'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
+--            'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
+--            'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
+--            'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
+--            'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
+--            'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
+--            'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
+--            'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
+--            'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
+--            'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
+--            'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
+--            'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
+--            'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
+--            'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
+--            'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
+--            'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
+--    ) )) foo;
+--
+--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
+
 create view if not exists TARGET.category as select * from SOURCE.category;
 create view if not exists TARGET.concept as select * from SOURCE.concept;
 create view if not exists TARGET.context as select * from SOURCE.context;

View File

@@ -81,17 +81,7 @@ create table TARGET.result stored as parquet as
 	'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
 	'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
 	'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
-	'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
-	'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
-	'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
-	'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
-	'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
-	'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
-	'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
-	'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
-	'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
-	'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
-	'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
+	'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
 	))) foo;

 --ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@@ -61,17 +61,7 @@ create table TARGET.result stored as parquet as
 	'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
 	'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
 	'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
-	'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
-	'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
-	'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
-	'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
-	'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
-	'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
-	'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
-	'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
-	'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
-	'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
-	'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
+	'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
 	))) foo;

 --ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@@ -1,18 +0,0 @@
-# Install the whole "dnet-hadoop" project.
-
-# Delete this module's previous build-files in order to avoid any conflicts.
-rm -rf target/ ||
-
-# Go to the root directory of this project.
-cd ../../
-
-# Select the build profile.
-DEFAULT_PROFILE='' # It's the empty profile.
-NEWER_VERSIONS_PROFILE='-Pscala-2.12'
-CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
-
-# Install the project.
-mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
-
-# We skip tests for all modules, since the take a big amount of time and some of them fail.
-# Any test added to this module, will be executed in the "runOozieWorkflow.sh" script.

View File

@@ -1,20 +0,0 @@
-# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file.
-
-# Select the build profile.
-DEFAULT_PROFILE='' # It's the empty profile.
-NEWER_VERSIONS_PROFILE='-Pscala-2.12'
-CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
-
-# Build and deploy this module.
-mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
-	-Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
-
-# Show the Oozie-job-ID.
-echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
-cat ./target/extract-and-run-on-remote-host.log
-
-# Check oozie workflow status
-# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
-
-# Get the <job-ID> from the previous output and check the logs:
-# yarn logs -applicationId application_<job-ID>

View File

@@ -1,10 +1,8 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 --------------------------------------------------------------
 --------------------------------------------------------------
 -- Stats database creation
 --------------------------------------------------------------
 --------------------------------------------------------------

-DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
-CREATE database ${stats_db_name}; /*EOS*/
+DROP database IF EXISTS ${stats_db_name} CASCADE;
+CREATE database ${stats_db_name};
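
The /*EOS*/ markers on the left-hand side of these stats scripts look like end-of-statement separators, letting a runner split a multi-statement Hive script and submit each statement individually; that reading is an inference from the marker's name, not code visible in this compare. A self-contained Java sketch of such splitting:

import java.util.ArrayList;
import java.util.List;

public class EosSplitDemo {
	// Hypothetical splitter: break a multi-statement script on /*EOS*/ markers
	// so each piece can be submitted on its own (e.g. through spark.sql(...)).
	static List<String> splitStatements(String script) {
		List<String> statements = new ArrayList<>();
		for (String part : script.split("/\\*EOS\\*/")) {
			String stmt = part.trim();
			if (!stmt.isEmpty())
				statements.add(stmt);
		}
		return statements;
	}

	public static void main(String[] args) {
		String script = "DROP database IF EXISTS demo CASCADE; /*EOS*/\n"
			+ "CREATE database demo; /*EOS*/";
		splitStatements(script).forEach(System.out::println);
	}
}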

View File

@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 -- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture)
@@ -7,27 +5,27 @@
 ------------------------------------------------------------------------------------------------
 CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
 SELECT *
-FROM ${external_stats_db_name}.fundref; /*EOS*/
+FROM ${external_stats_db_name}.fundref;

 CREATE OR REPLACE VIEW ${stats_db_name}.country AS
 SELECT *
-FROM ${external_stats_db_name}.country; /*EOS*/
+FROM ${external_stats_db_name}.country;

 CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
 SELECT *
-FROM ${external_stats_db_name}.countrygdp; /*EOS*/
+FROM ${external_stats_db_name}.countrygdp;

 CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
 SELECT *
-FROM ${external_stats_db_name}.roarmap; /*EOS*/
+FROM ${external_stats_db_name}.roarmap;

 CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
 SELECT *
-FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
+FROM ${external_stats_db_name}.rndexpediture;

 CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
 SELECT *
-FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
+FROM ${external_stats_db_name}.licenses_normalized;

 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
@@ -35,23 +33,23 @@ FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 create or replace view ${stats_db_name}.usage_stats as
-select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
+select * from openaire_prod_usage_stats.usage_stats;

 create or replace view ${stats_db_name}.downloads_stats as
-select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
+select * from openaire_prod_usage_stats.downloads_stats;

 create or replace view ${stats_db_name}.pageviews_stats as
-select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
+select * from openaire_prod_usage_stats.pageviews_stats;

 create or replace view ${stats_db_name}.views_stats as
-select * from openaire_prod_usage_stats.views_stats; /*EOS*/
+select * from openaire_prod_usage_stats.views_stats;

 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 -- Creation date of the database
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge;

 create table ${stats_db_name}.creation_date STORED AS PARQUET as
-select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/
+select date_format(current_date(), 'dd-MM-yyyy') as date;

View File

@@ -1,11 +1,110 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 ----------------------------------------------------------------
 ----------------------------------------------------------------
 -- Post processing - Updates on main tables
 ----------------------------------------------------------------
 ----------------------------------------------------------------

+--Datasource temporary table updates
+UPDATE ${stats_db_name}.datasource_tmp
+SET harvested='true'
+WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
+                            FROM ${stats_db_name}.datasource_tmp d,
+                                 ${stats_db_name}.result_datasources rd
+                            WHERE d.id = rd.datasource);
+
+-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
+UPDATE ${stats_db_name}.project_tmp
+SET haspubs='yes'
+WHERE project_tmp.id IN (SELECT pr.id
+                         FROM ${stats_db_name}.project_results pr,
+                              ${stats_db_name}.result r
+                         WHERE pr.result = r.id
+                           AND r.type = 'publication');
+
+DROP TABLE IF EXISTS ${stats_db_name}.stored purge;
+
+CREATE TABLE ${stats_db_name}.project stored as parquet as
+SELECT p.id,
+       p.acronym,
+       p.title,
+       p.funder,
+       p.funding_lvl0,
+       p.funding_lvl1,
+       p.funding_lvl2,
+       p.ec39,
+       p.type,
+       p.startdate,
+       p.enddate,
+       p.start_year,
+       p.end_year,
+       p.duration,
+       CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
+       CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
+       CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
+       CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
+       p.callidentifier,
+       p.code,
+       p.totalcost,
+       p.fundedamount,
+       p.currency
+FROM ${stats_db_name}.project_tmp p
+       LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
+                  FROM ${stats_db_name}.project_results pr
+                         INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
+                  WHERE r.type = 'publication'
+                  GROUP BY pr.id) AS prr1 on prr1.id = p.id
+       LEFT JOIN (SELECT pp.id,
+                         max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
+                         count(distinct r.id) AS dp
+                  FROM ${stats_db_name}.project_tmp pp,
+                       ${stats_db_name}.project_results pr,
+                       ${stats_db_name}.result r
+                  WHERE pp.id = pr.id
+                    AND pr.result = r.id
+                    AND r.type = 'publication'
+                    AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
+                  GROUP BY pp.id) AS prr2
+                 ON prr2.id = p.id;
+
+UPDATE ${stats_db_name}.publication_tmp
+SET delayed = 'yes'
+WHERE publication_tmp.id IN (SELECT distinct r.id
+                             FROM ${stats_db_name}.result r,
+                                  ${stats_db_name}.project_results pr,
+                                  ${stats_db_name}.project_tmp p
+                             WHERE r.id = pr.result
+                               AND pr.id = p.id
+                               AND to_date(r.date) - to_date(p.enddate) > 0);
+
+UPDATE ${stats_db_name}.dataset_tmp
+SET delayed = 'yes'
+WHERE dataset_tmp.id IN (SELECT distinct r.id
+                         FROM ${stats_db_name}.result r,
+                              ${stats_db_name}.project_results pr,
+                              ${stats_db_name}.project_tmp p
+                         WHERE r.id = pr.result
+                           AND pr.id = p.id
+                           AND to_date(r.date) - to_date(p.enddate) > 0);
+
+UPDATE ${stats_db_name}.software_tmp
+SET delayed = 'yes'
+WHERE software_tmp.id IN (SELECT distinct r.id
+                          FROM ${stats_db_name}.result r,
+                               ${stats_db_name}.project_results pr,
+                               ${stats_db_name}.project_tmp p
+                          WHERE r.id = pr.result
+                            AND pr.id = p.id
+                            AND to_date(r.date) - to_date(p.enddate) > 0);
+
+UPDATE ${stats_db_name}.otherresearchproduct_tmp
+SET delayed = 'yes'
+WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
+                                      FROM ${stats_db_name}.result r,
+                                           ${stats_db_name}.project_results pr,
+                                           ${stats_db_name}.project_tmp p
+                                      WHERE r.id = pr.result
+                                        AND pr.id = p.id
+                                        AND to_date(r.date) - to_date(p.enddate) > 0);
+
 CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
 SELECT result_projects.id AS result,
        result_projects.project AS project_results,
@@ -17,4 +116,4 @@ FROM ${stats_db_name}.result_projects,
      ${stats_db_name}.project
 WHERE result_projects.id = result.id
   AND result.type = 'publication'
-  AND project.id = result_projects.project; /*EOS*/
+  AND project.id = result_projects.project;

View File

@@ -1,4 +1,42 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+------------------------------------------------------------------------------------------------------
+-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
+------------------------------------------------------------------------------------------------------
+DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
+
+CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.datasource_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
+
+CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.publication_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
+
+CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.dataset_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.software purge;
+
+CREATE TABLE ${stats_db_name}.software stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.software_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
+
+CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.otherresearchproduct_tmp;
+
+DROP TABLE ${stats_db_name}.project_tmp;
+DROP TABLE ${stats_db_name}.datasource_tmp;
+DROP TABLE ${stats_db_name}.publication_tmp;
+DROP TABLE ${stats_db_name}.dataset_tmp;
+DROP TABLE ${stats_db_name}.software_tmp;
+DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;

 ----------------------------------------------
 -- Re-creating views from final parquet tables
@@ -16,4 +54,4 @@ SELECT *, bestlicence AS access_mode
 FROM ${stats_db_name}.dataset
 UNION ALL
 SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
+FROM ${stats_db_name}.otherresearchproduct;

View File

@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 ------------------------------------------------------
 ------------------------------------------------------
 -- Additional relations
@@ -7,10 +5,10 @@
 -- Sources related tables/views
 ------------------------------------------------------
 ------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 	SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 	from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -18,12 +16,12 @@ LEFT OUTER JOIN
 (
 	SELECT substr(d.id, 4) id
 	from ${openaire_db_name}.datasource d
-	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 	SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 	from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -31,12 +29,12 @@ LEFT OUTER JOIN
 (
 	SELECT substr(d.id, 4) id
 	from ${openaire_db_name}.datasource d
-	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 	SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 	from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -44,12 +42,12 @@ LEFT OUTER JOIN
 (
 	SELECT substr(d.id, 4) id
 	from ${openaire_db_name}.datasource d
-	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 	SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 	from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -57,7 +55,7 @@ LEFT OUTER JOIN
 (
 	SELECT substr(d.id, 4) id
 	from ${openaire_db_name}.datasource d
-	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+	WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
 SELECT * FROM ${stats_db_name}.publication_sources
@@ -66,24 +64,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources
 UNION ALL
 SELECT * FROM ${stats_db_name}.software_sources
 UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
+SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;

-DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
-select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
+select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
 from (
 	SELECT substr(res.id, 4) as id, auth_pid.value as orcid
 	FROM ${openaire_db_name}.result res
 	LATERAL VIEW explode(author) a as auth
 	LATERAL VIEW explode(auth.pid) ap as auth_pid
 	LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
-	WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
+	WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;

-DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_result purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
-select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
+select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
@@ -93,12 +91,12 @@ where reltype='resultResult'
 and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
 and r1.resulttype.classname != 'other'
 and r2.resulttype.classname != 'other'
-and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
+and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;

-DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
-select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations
+select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
@@ -110,12 +108,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
 and r1.resulttype.classname != 'other'
 and r2.resulttype.classname != 'other'
 and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
-group by substr(target, 4); /*EOS*/
+group by substr(target, 4);

-DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
-select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references
+select substr(source, 4) as id, count(distinct substr(target, 4)) as references
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
@@ -127,4 +125,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
 and r1.resulttype.classname != 'other'
 and r2.resulttype.classname != 'other'
 and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
-group by substr(source, 4); /*EOS*/
+group by substr(source, 4);

View File

@ -1,5 +1,4 @@
set mapred.job.queue.name=analytics; /*EOS*/ set mapred.job.queue.name=analytics;
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Additional relations -- Additional relations
@ -7,33 +6,33 @@ set mapred.job.queue.name=analytics; /*EOS*/
-- Licences related tables/views -- Licences related tables/views
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses
@@ -42,29 +41,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
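-- Illustrative usage (editor's example, not part of the original script):
-- SELECT type, count(*) AS n FROM ${stats_db_name}.result_licenses GROUP BY type ORDER BY n DESC;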
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource
FROM (
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
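-- NOTE (editor): the LEFT OUTER JOIN above keeps every collectedfrom entry
-- and maps any datasource id that is not present (or not visible) in the
-- datasource table to the catch-all value 'other'.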
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/
@@ -1,4 +1,4 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------
------------------------------------------------------
@@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------
------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -18,15 +18,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; /*EOS*/
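-- NOTE (editor): the pattern above gives the peerReviewed classification
-- precedence: the anti-join (LEFT JOIN ... WHERE peer_reviewed.id IS NULL)
-- keeps a nonPeerReviewed row only for ids with no peerReviewed instance.
-- The same construction is repeated for datasets, software and other
-- research products below.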
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -36,15 +36,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -54,15 +54,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -72,13 +72,13 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed
@@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed
union all
select * from ${stats_db_name}.software_refereed
union all
select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
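-- NOTE (editor): each measure carries its numeric score in unit.value[0]
-- and its class label in unit.value[1]; 'views' and 'downloads' are
-- excluded here, presumably because usage counts are covered by the
-- dedicated usage-stats tables further below.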
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
cast(rel.properties[0].value as double) apc_amount,
rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source
join ${openaire_db_name}.result r on r.id=rel.target
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/
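-- NOTE (editor): this assumes a fixed ordering of the relation properties,
-- i.e. properties[0] holds the APC amount and properties[1] the currency;
-- the size(rel.properties)>0 guard skips affiliation relations that carry
-- no APC data at all.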
@@ -1,27 +1,27 @@
set mapred.job.queue.name=analytics; /*EOS*/
-------------------------------------------
--- Extra tables, mostly used by indicators
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as (
@@ -30,39 +30,39 @@ with rcount as (
left outer join ${stats_db_name}.result_projects rp on rp.project=p.id
left outer join ${stats_db_name}.result r on r.id=rp.id
group by r.type, p.id )
select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets,
sum(case when rcount.type='software' then rcount.count else 0 end) as software,
sum(case when rcount.type='other' then rcount.count else 0 end) as other
from rcount
group by rcount.pid; /*EOS*/
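-- NOTE (editor): the sum(case when ...) construction pivots the per-type
-- counts of rcount into one row per project with a column per result type.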
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
create table if not exists ${stats_db_name}.result_instance stored as parquet as
select /*+ COALESCE(100) */ distinct r.*
from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
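-- NOTE (editor): "lateral view outer explode(inst.pid)" keeps instances
-- that have no pid (pidtype/pid become NULL), where a plain explode would
-- drop them; the join against ${stats_db_name}.result restricts the table
-- to results that made it into the stats database.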
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency
from (
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null; /*EOS*/
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/
@@ -1,7 +1,7 @@
-- Sprint 1 ----
drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as green_oa
@@ -12,7 +12,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as grey_lit
@@ -23,7 +23,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from ${stats_db_name}.publication p
left outer join (
select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
@@ -33,7 +33,7 @@ left outer join (
-- Sprint 2 ----
drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from ${stats_db_name}.result r
left outer join (
select r.id, license.type as lic from ${stats_db_name}.result r
@@ -42,7 +42,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from ${stats_db_name}.result r
left outer join (
select r.id, lower(parse_url(license.type, "HOST")) as lic_host
@@ -52,12 +52,12 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
from ${stats_db_name}.publication; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid
from ${stats_db_name}.result r
left outer join (
select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
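-- NOTE (editor): most boolean indicators in this file follow the same
-- recipe: LEFT OUTER JOIN the base table to a subquery that emits 1 for the
-- ids satisfying the condition, then coalesce(flag, 0) turns the missing
-- matches into an explicit 0.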
@@ -66,7 +66,7 @@ left outer join (
---- Sprint 3 ----
drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref
from ${stats_db_name}.project_results r
left outer join (
select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
@@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par
SELECT ro.organization organization, ro.id, o.name
from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
from tmp as o1
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
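-- NOTE (editor): the self-join of tmp ranges over all organization pairs
-- that share a result, so each collaborating pair appears in both orders
-- (org1/org2 and org2/org1); the same holds for the country-level
-- collaboration tables below.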
@@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store
from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null)
select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
from tmp as o1 join tmp as o2 on o1.id=o2.id
where o1.id=o2.id and o1.country!=o2.country
group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
@@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa
select o.id organization, o.name, ro.project as project
from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.name<>o2.name
@@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor
select o.id organization, o.name, o.country , ro.project as project
from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country
@@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as
join ${stats_db_name}.organization o on o.id=op.id
join ${stats_db_name}.project p on p.id=op.project
where country <> 'UNKNOWN')
select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1
join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country
@@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
select distinct country, ro.id as result from ${stats_db_name}.organization o
join ${stats_db_name}.result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null)
select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country
@@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
---- Sprint 4 ----
drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from ${stats_db_name}.publication_datasources pd
left outer join (
select pd.id, 1 as in_diamond_journal
@@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a
drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from ${stats_db_name}.publication pd
left outer join (
select pd.id, 1 as is_transformative
@@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as
drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
from ${stats_db_name}.result_instance ri
left outer join (
select ri.id, 1 as pub_closed_other_open
@@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as
---- Sprint 5 ----
drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
select /*+ COALESCE(100) */ id, count(id) as number_of_copies
from ${stats_db_name}.result_instance
group by id; /*EOS*/
---- Sprint 6 ----
drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id
where downloads>0
@@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id
where downloads>0
@@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored
drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id where downloads>0
GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id
where downloads>0
@@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
UNION ALL
select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo
)
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM ${stats_db_name}.publication pd
left outer join (
select pd.id, 1 as is_gold
@@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
FROM ${stats_db_name}.datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
FROM ${stats_db_name}.publication_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
@@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as is_hybrid
@@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet
where cast(year as int)>2003
group by ro.organization)
--return results_fair/all_results
select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization; /*EOS*/
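-- NOTE (editor): org_fairness is the ratio of an organization's "FAIR"
-- results to all of its results (restricted above to years after 2003); the
-- *_year, funder and RI variants below compute the same ratio per year, per
-- funder and per research initiative.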
@@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar
join result_fair rf on rf.organization=ar.organization; /*EOS*/
@@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
@@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar join result_fair rf
on rf.organization=ar.organization; /*EOS*/
@@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
@@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
@@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
@@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
select /*+ COALESCE(100) */ allpubsshare.organization,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
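-- NOTE (editor): the expression above averages only the open-access shares
-- that are actually available: p (publications) is always present, while s
-- (software) and d (datasets) each add 1 to the denominator only when the
-- organization has such results, so a missing share does not drag the score
-- down.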
@@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
@@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint
from ${stats_db_name}.publication_classifications p
left outer join (
select p.id, 1 as has_preprint
@@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription
from ${stats_db_name}.publication p
left outer join(
select p.id, 1 as is_subscription from ${stats_db_name}.publication p
@@ -640,7 +640,7 @@ from ${stats_db_name}.publication p
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from ${stats_db_name}.result p
left outer join (
select p.id, 1 as result_with_pid
@@ -654,7 +654,7 @@ group by rf.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0)
as is_interdisciplinary
from pub_fos_totals p
left outer join (
@@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/
create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as is_bronze_oa
@@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as
select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id,
coalesce(is_project_result_after, 0) as is_project_result_after
from project_year_result_year pry
left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after
@@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/
drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as
select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
from ${stats_db_name}.funder f
left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder
join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp
@@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu
join ${stats_db_name}.project p on p.id=rp.project
where cast(year as int)>2003
group by p.funder)
select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
from allresults
join result_fair on result_fair.funder=allresults.funder; /*EOS*/
@@ -745,7 +745,7 @@ allresults as
join ${stats_db_name}.result r on r.id=rc.id
where cast(year as int)>2003
group by rc.ri_initiative)
select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
from allresults
join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/
@@ -817,14 +817,16 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware
drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as
select /*+ COALESCE(100) */ allpubsshare.funder,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) funder_openess
FROM allpubsshare
left outer join (select funder,d from alldatasetssshare) tmp1
on tmp1.funder=allpubsshare.funder
left outer join (select funder,s from allsoftwaresshare) tmp2
on tmp2.funder=allpubsshare.funder; /*EOS*/
DROP VIEW pubs_oa; /*EOS*/
DROP VIEW datasets_oa; /*EOS*/
@ -903,7 +905,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso
drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as
select /*+ COALESCE(100) */ allpubsshare.ri_initiative, select allpubsshare.ri_initiative,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end))
ris_openess FROM allpubsshare ris_openess FROM allpubsshare
@ -941,7 +943,7 @@ with result_findable as
join ${stats_db_name}.project p on p.id=rp.project join ${stats_db_name}.project p on p.id=rp.project
where cast(year as int)>2003 where cast(year as int)>2003
group by p.funder) group by p.funder)
select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
from allresults from allresults
join result_findable on result_findable.funder=allresults.funder; /*EOS*/ join result_findable on result_findable.funder=allresults.funder; /*EOS*/
@ -950,43 +952,41 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as
with result_contexts as with result_contexts as
(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc (select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc
join ${stats_db_name}.concept on concept.id=rc.concept join ${stats_db_name}.concept on concept.id=rc.concept
join ${stats_db_name}.category on category.id=concept.category join ${stats_db_name}.category on category.id=concept.category
join ${stats_db_name}.context on context.id=category.context), join ${stats_db_name}.context on context.id=category.context),
result_findable as result_findable as
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc
join ${stats_db_name}.result r on r.id=rc.id join ${stats_db_name}.result r on r.id=rc.id
join ${stats_db_name}.result_pids rp on rp.id=r.id join ${stats_db_name}.result_pids rp on rp.id=r.id
where cast(r.year as int)>2003 where cast(r.year as int)>2003
group by rc.ri_initiative), group by rc.ri_initiative),
allresults as allresults as
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc
join ${stats_db_name}.result r on r.id=rc.id join ${stats_db_name}.result r on r.id=rc.id
where cast(r.year as int)>2003 where cast(r.year as int)>2003
group by rc.ri_initiative) group by rc.ri_initiative)
select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
from allresults from allresults
join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/ join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
with org_names_pids as with org_names_pids as
(select org.id,name, pid from ${stats_db_name}.organization org (select org.id,name, pid from ${stats_db_name}.organization org
join ${stats_db_name}.organization_pids op on org.id=op.id), join ${stats_db_name}.organization_pids op on org.id=op.id),
publicly_funded_orgs as publicly_funded_orgs as
(select distinct name from (select distinct name from
(select pf.name from stats_ext.insitutions_for_publicly_funded pf (select pf.name from stats_ext.insitutions_for_publicly_funded pf
join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government' join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
union all union all
select pf.name from stats_ext.insitutions_for_publicly_funded pf select pf.name from stats_ext.insitutions_for_publicly_funded pf
join ${stats_db_name}.project p on p.funder=pf.name join ${stats_db_name}.project p on p.funder=pf.name
union all union all
select op.name from stats_ext.insitutions_for_publicly_funded pf select op.name from stats_ext.insitutions_for_publicly_funded pf
join org_names_pids op on (op.name=pf.name or op.pid=pf.ror) join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
and pf.publicly_funded='yes') foo) and pf.publicly_funded='yes') foo)
select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as
select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license select distinct p.id, coalesce(green_with_license, 0) as green_with_license
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p
@ -1006,7 +1006,7 @@ left outer join (
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
create table ${stats_db_name}.result_country stored as parquet as create table ${stats_db_name}.result_country stored as parquet as
select /*+ COALESCE(100) */ distinct id, country select distinct id, country
from ( from (
select ro.id, o.country select ro.id, o.country
from ${stats_db_name}.result_organization ro from ${stats_db_name}.result_organization ro
@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license select distinct r.id, coalesce(oa_with_license,0) as oa_with_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r
join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/
@ -1029,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open
drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as
with without_license as with without_license as
(select distinct id from ${stats_db_name}.indi_result_oa_with_license (select distinct id from ${stats_db_name}.indi_result_oa_with_license
where oa_with_license=0) where oa_with_license=0)
select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license select distinct r.id, coalesce(oa_without_license,0) as oa_without_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select distinct r.id, 1 as oa_without_license left outer join (select distinct r.id, 1 as oa_without_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /*
create table ${stats_db_name}.indi_result_under_transformative stored as parquet as create table ${stats_db_name}.indi_result_under_transformative stored as parquet as
with transformative_dois as ( with transformative_dois as (
select distinct doi from stats_ext.transformative_facts) select distinct doi from stats_ext.transformative_facts)
select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative select distinct r.id, coalesce(under_transformative,0) as under_transformative
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select distinct rp.id, 1 as under_transformative select distinct rp.id, 1 as under_transformative
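
Every indicator table in the hunks above follows the same flag pattern: the base table is left-outer-joined to a subquery that emits 1 for the matching ids, and coalesce(flag, 0) turns the missing matches into an explicit 0. A minimal sketch of the pattern, using hypothetical demo.* table names rather than the real stats tables:

create table if not exists demo.indi_example stored as parquet as
select /*+ COALESCE(100) */ distinct r.id,
       coalesce(tmp.flag, 0) as flag
from demo.result r
left outer join (select distinct rl.id, 1 as flag
                 from demo.result_licenses rl) tmp on r.id = tmp.id;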

View File

@@ -1,30 +1,30 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;
 ----------------------------------------------------
 -- Shortcuts for various definitions in stats db ---
 ----------------------------------------------------
 -- Peer reviewed:
-drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
+drop table if exists ${stats_db_name}.result_peerreviewed purge;
 create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
+select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
 from ${stats_db_name}.result r
 left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
-left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
+left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
 -- Green OA:
-drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
+drop table if exists ${stats_db_name}.result_greenoa purge;
 create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green
+select r.id, case when green.green_oa=1 then true else false end as green
 from ${stats_db_name}.result r
-left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
+left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
 -- GOLD OA:
-drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
+drop table if exists ${stats_db_name}.result_gold purge;
 create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold
+select r.id, case when gold.is_gold=1 then true else false end as gold
 from ${stats_db_name}.result r
-left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/
+left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
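
Two markers distinguish the sides of this file. /*+ COALESCE(100) */ is a Spark SQL partitioning hint: it asks the planner to coalesce the output of the SELECT into 100 partitions, so each CTAS writes on the order of 100 files rather than one file per shuffle partition. /*EOS*/ appears to be an end-of-statement marker used to split the multi-statement script before submission; being an ordinary SQL comment, it is inert for the statements themselves. An illustrative use of the hint (the demo.* names are hypothetical):

create table demo.t stored as parquet as
select /*+ COALESCE(100) */ id, count(*) as n
from demo.events
group by id; /*EOS*/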

View File

@@ -1,26 +1,58 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;
--- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
+-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
 -- peer reviewed)
-drop view if exists ${stats_db_name}.result; /*EOS*/
-drop table if exists ${stats_db_name}.result; /*EOS*/
-CREATE TABLE ${stats_db_name}.result stored as parquet as
-SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM (
-(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.publication)
-UNION ALL
-(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.dataset)
-UNION ALL
-(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.software)
-UNION ALL
-(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.otherresearchproduct)
-) r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
+drop table if exists ${stats_db_name}.result_tmp;
+CREATE TABLE ${stats_db_name}.result_tmp (
+id STRING,
+title STRING,
+publisher STRING,
+journal STRING,
+`date` STRING,
+`year` INT,
+bestlicence STRING,
+access_mode STRING,
+embargo_end_date STRING,
+delayed BOOLEAN,
+authors INT,
+source STRING,
+abstract BOOLEAN,
+type STRING ,
+peer_reviewed BOOLEAN,
+green BOOLEAN,
+gold BOOLEAN)
+clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.publication r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.dataset r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.software r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.otherresearchproduct r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+drop table if exists ${stats_db_name}.result;
+drop view if exists ${stats_db_name}.result;
+create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
+drop table ${stats_db_name}.result_tmp;
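
The two sides of this hunk build the same result table in different ways: one side stages rows in a transactional ORC table (result_tmp) filled with one INSERT per result type and then copies it into parquet, while the other computes result directly with a single parquet CTAS over a UNION ALL of the four result types. The staging DDL is shaped by Hive ACID constraints: a transactional table must be stored as ORC and carry tblproperties('transactional'='true'), and older Hive versions additionally required it to be bucketed, hence the clustered by (id) into 100 buckets clause. A minimal sketch of that staging idiom, with illustrative demo.* names:

create table demo.stage_tmp (
  id string,
  flag boolean)
clustered by (id) into 100 buckets
stored as orc tblproperties ('transactional' = 'true');

-- rows are accumulated with INSERTs, then frozen into a plain parquet copy
create table demo.stage stored as parquet as select * from demo.stage_tmp;
drop table demo.stage_tmp;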

View File

@@ -1,4 +1,4 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;
 --------------------------------------------------------------
 --------------------------------------------------------------
@@ -7,65 +7,65 @@ set mapred.job.queue.name=analytics; /*EOS*/
 --------------------------------------------------------------
 -- Publication temporary table
-DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
-CREATE TABLE ${stats_db_name}.publication stored as parquet as
-with pub_pr as (
-select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
-from ${openaire_db_name}.publication pub
-join ${openaire_db_name}.relation rel
-on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
-and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
-where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
-),
-pub_delayed as (
-select pub_id, max(delayed) as delayed
-from pub_pr
-group by pub_id
-)
-select /*+ COALESCE(100) */
-substr(pub.id, 4) as id,
-pub.title[0].value as title,
-pub.publisher.value as publisher,
-pub.journal.name as journal,
-pub.dateofacceptance.value as date,
-date_format(pub.dateofacceptance.value, 'yyyy') as year,
-pub.bestaccessright.classname as bestlicence,
-pub.embargoenddate.value as embargo_end_date,
-coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
-size(pub.author) as authors,
-concat_ws('\u003B', pub.source.value) as source,
-case when size(pub.description) > 0 then true else false end as abstract,
-'publication' as type
-from ${openaire_db_name}.publication pub
-left outer join pub_delayed on pub.id=pub_delayed.pub_id
-where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge;
+CREATE TABLE ${stats_db_name}.publication_tmp
+(
+id STRING,
+title STRING,
+publisher STRING,
+journal STRING,
+date STRING,
+year STRING,
+bestlicence STRING,
+embargo_end_date STRING,
+delayed BOOLEAN,
+authors INT,
+source STRING,
+abstract BOOLEAN,
+type STRING
+)
+clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
+INSERT INTO ${stats_db_name}.publication_tmp
+SELECT substr(p.id, 4) as id,
+p.title[0].value as title,
+p.publisher.value as publisher,
+p.journal.name as journal,
+p.dateofacceptance.value as date,
+date_format(p.dateofacceptance.value, 'yyyy') as year,
+p.bestaccessright.classname as bestlicence,
+p.embargoenddate.value as embargo_end_date,
+false as delayed,
+size(p.author) as authors,
+concat_ws('\u003B', p.source.value) as source,
+case when size(p.description) > 0 then true else false end as abstract,
+'publication' as type
+from ${openaire_db_name}.publication p
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge;
 CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type
+SELECT substr(p.id, 4) as id, instancetype.classname as type
 from ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge;
 CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
+SELECT substr(p.id, 4) as id, case
 when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
 when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
 when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 from ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.context) contexts as context
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge;
 CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
 from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
@@ -73,44 +73,44 @@ FROM (
 LEFT OUTER JOIN (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge;
 CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
-select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language
+select substr(p.id, 4) as id, p.language.classname as language
 FROM ${openaire_db_name}.publication p
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge;
 CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
+SELECT substr(p.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.originalid) oids AS ids
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge;
 CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
+SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
 FROM ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.pid) pids AS ppid
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge;
 CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
-select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
+select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
 FROM ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.subject) subjects AS subject
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
-DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge;
 CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
+SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.publication p
 lateral view explode(p.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
-and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
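
One side of this file also computes a delayed flag for publications: a publication counts as delayed when its acceptance date falls after the end date of at least one project it was produced by. The pub_pr CTE yields one boolean row per (publication, project) pair, and pub_delayed collapses those rows with max(delayed), which acts as a logical OR since false sorts below true. A toy illustration of that aggregation (Spark SQL, hypothetical values):

with pub_pr as (
  select 'pub1' as pub_id, true as delayed
  union all
  select 'pub1', false
  union all
  select 'pub2', false)
select pub_id, max(delayed) as delayed
from pub_pr
group by pub_id;
-- pub1 -> true (late for at least one of its projects), pub2 -> false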

View File

@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
 create view if not exists TARGET.category as select * from SOURCE.category;
 create view if not exists TARGET.concept as select * from SOURCE.concept;
 create view if not exists TARGET.context as select * from SOURCE.context;

View File

@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
 drop database if exists TARGET cascade;
 create database if not exists TARGET;
@@ -83,17 +81,11 @@ create table TARGET.result stored as parquet as
 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
-'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
-'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
-'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
-'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
-'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
-'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
-'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
+'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
 ) )) foo;
 create view if not exists TARGET.category as select * from SOURCE.category;
@@ -264,6 +256,7 @@ create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * f
 create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
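
Each copy in this monitor script is shaped the same way: the TARGET table keeps only the SOURCE rows whose result id survived the organization filter that built TARGET.result, expressed as a correlated EXISTS, which is effectively a left semi-join. In miniature, with tiny_table as an illustrative name and TARGET/SOURCE being the script's own placeholders:

create table TARGET.tiny_table stored as parquet as
select *
from SOURCE.tiny_table orig
where exists (select 1 from TARGET.result r where r.id = orig.id);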

View File

@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics;
 drop database if exists TARGET cascade;
 create database if not exists TARGET;

Some files were not shown because too many files have changed in this diff.