Compare commits

...

258 Commits

Author SHA1 Message Date
Myrto Kallipoliti 1d946234f8 Delete txt_files/city_names_small.txt 2025-07-23 05:29:32 +02:00
Myrto Kallipoliti 3e896bbfe2 Upload files to "txt_files" 2025-07-23 05:27:32 +02:00
Myrto Kallipoliti a95d7cd024 Upload files to "txt_files" 2025-07-23 05:27:04 +02:00
Myrto Kallipoliti a8c301de36 Upload files to "dictionaries" 2025-07-23 05:25:38 +02:00
Myrto Kallipoliti 5fbb024a20 Upload files to "dictionaries" 2025-07-23 05:25:08 +02:00
Myrto Kallipoliti 52290e8787 Upload files to "dictionaries" 2025-07-23 05:23:52 +02:00
Myrto Kallipoliti f03f70e831 Upload files to "/" 2025-07-23 05:21:36 +02:00
Miriam Baglioni 6859f12cd2 Update schemas.py
Added new schema to the set
2025-07-08 17:49:30 +02:00
Miriam Baglioni 33cab47ac8 Add dataset.py
Added code to run affro on the single dataset produced by putting together all the input sources available in OpenAIRE
2025-07-08 17:23:43 +02:00
Myrto Kallipoliti 9bf6fe36b8 Upload files to "/" 2025-06-27 10:10:08 +02:00
Miriam Baglioni 6b11f34a89 Update oalex.py
removed field no longer present in the dataset
2025-06-26 12:46:57 +02:00
Miriam Baglioni 4660c63e60 Update oalex.py
fixed typo in attribute name
2025-06-24 17:39:22 +02:00
Miriam Baglioni 8434c94f21 Update oalex.py
fixed syntax mistake
2025-06-24 16:28:26 +02:00
Miriam Baglioni 98ba852129 Update oalex.py
extended step of handling for the input data
2025-06-24 14:59:04 +02:00
Miriam Baglioni b53569908d Update oalex.py
removed attribute not present in oalex schema
2025-06-24 14:18:50 +02:00
Miriam Baglioni f5f10da1ea Update oalex.py
changed the code to read directly from the data snapshot
2025-06-24 10:16:23 +02:00
Miriam Baglioni 72a32faefc Update schemas.py
Adding source schema for other sources
2025-06-24 10:01:26 +02:00
Myrto Kallipoliti 0962586053 Upload files to "dictionaries" 2025-06-23 11:23:13 +02:00
Myrto Kallipoliti 34fd7a7a28 Upload files to "dictionaries" 2025-06-23 11:22:55 +02:00
Myrto Kallipoliti e1377d61fc Upload files to "dictionaries" 2025-06-23 11:22:35 +02:00
Myrto Kallipoliti c75f4f38ae Upload files to "dictionaries" 2025-06-23 11:21:50 +02:00
Miriam Baglioni 3553849645 Update graph.py
changed the first join order, since the left part of the join should be the bigger file
2025-05-27 15:28:54 +02:00
Miriam Baglioni 96cb07353e Update graph.py
Added serialization steps
2025-05-27 14:02:57 +02:00
Miriam Baglioni 620e3d6691 Update graph.py
verify if the matching was already performed for the current execution; if so, skip the search for matchings (it would have searched all the strings with no match found)
2025-05-27 12:40:53 +02:00
Miriam Baglioni dc4dc3c38c Update utils.py
Fix to toAuthorModelGraph: in the schema for the graph author model there is no entry for name or familyname
2025-05-27 12:13:06 +02:00
Miriam Baglioni c82ecb1426 Update graph.py
added the union to the paths to be considered by affro. Removed serialization steps
2025-05-22 09:51:27 +02:00
Miriam Baglioni 9fe4517a06 Update graph.py
added serialization step
2025-05-22 09:38:39 +02:00
Miriam Baglioni 355a11f24d Update graph.py
added a serialization point
2025-05-21 18:29:43 +02:00
Miriam Baglioni 29bf85307d Update schemas.py
fixed name of field in author affiliation
2025-05-21 18:18:19 +02:00
Miriam Baglioni 2728017f2b Update graph.py
another query fix
2025-05-21 17:27:34 +02:00
Miriam Baglioni ed7966f6f5 Update graph.py
Fixed issue in query
2025-05-21 16:01:27 +02:00
Miriam Baglioni f258d79f6f Update schemas.py
Fixed typo
2025-05-21 15:06:21 +02:00
Miriam Baglioni 2a8745a787 Update schemas.py
added schemas for the result entry in the graph
2025-05-21 13:58:14 +02:00
Miriam Baglioni 89ac8f5274 Update graph.py
update to provide the schema of the input results
2025-05-21 13:57:37 +02:00
Miriam Baglioni dc50bb9757 Update graph.py
fixed error in parameter name
2025-05-20 18:42:41 +02:00
Miriam Baglioni ac635da607 Update graph.py
swap commands in condition
2025-05-20 18:33:08 +02:00
Miriam Baglioni 8b99f38175 Update graph.py
Changes join
2025-05-20 16:30:30 +02:00
Miriam Baglioni 7ae5609fbd Update graph.py
make the none path become a string
2025-05-20 12:26:34 +02:00
Miriam Baglioni 7018e58015 Update utils.py
Updated with two more def for the graph application
2025-05-20 11:43:17 +02:00
Miriam Baglioni 5c66ce9719 Update graph.py
Fixed issue for non-empty previous set
2025-05-20 11:39:28 +02:00
Miriam Baglioni d5d18145ec Update graph.py
the wrong file was pushed
2025-05-20 11:36:34 +02:00
Miriam Baglioni f2bf93e6af Update graph.py
Refactored and also changed to store the previous run of affro in case the algorithm is not updated from one run to the next
2025-05-20 11:34:19 +02:00
Myrto Kallipoliti 5302fa5f8d Upload files to "/" 2025-05-15 19:58:13 +02:00
Myrto Kallipoliti 7567d9af6c Upload files to "dictionaries" 2025-05-14 11:58:34 +02:00
Myrto Kallipoliti e6c3672434 Upload files to "dictionaries" 2025-05-14 11:58:15 +02:00
Myrto Kallipoliti e66c149b38 Upload files to "dictionaries" 2025-05-14 11:57:36 +02:00
Myrto Kallipoliti 3657ed1b7a Upload files to "txt_files" 2025-05-14 11:56:26 +02:00
Myrto Kallipoliti ad7df5ba49 Upload files to "/" 2025-05-14 11:55:47 +02:00
Myrto Kallipoliti d419d7c137 Upload files to "dictionaries" 2025-05-12 10:00:44 +02:00
Myrto Kallipoliti f50c93b527 Upload files to "txt_files" 2025-05-12 07:16:22 +02:00
Myrto Kallipoliti 631409dbc4 Upload files to "/" 2025-05-12 07:15:38 +02:00
Myrto Kallipoliti 520ee36a9a Upload files to "/" 2025-05-05 11:45:21 +02:00
Miriam Baglioni c0e1241ce6 Update oalex.py
Removed non-existent global variable. Modifying it made the code break, and the matching set returned was always empty because of 
    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return ()
2025-04-26 11:51:55 +02:00
Myrto Kallipoliti bde5bfeee0 Upload files to "dictionaries" 2025-04-23 16:16:26 +02:00
Miriam Baglioni 6b5170f945 [wfready] adding serialization steps in oalex execution 2025-04-23 14:16:07 +02:00
Miriam Baglioni 4550216303 Merge remote-tracking branch 'origin/openaire-workflow-ready' into openaire-workflow-ready 2025-04-23 13:49:52 +02:00
Myrto Kallipoliti 0d87e4f0ae Upload files to "txt_files" 2025-04-23 11:11:33 +02:00
Myrto Kallipoliti a82b862e85 Upload files to "dictionaries" 2025-04-23 10:16:33 +02:00
Myrto Kallipoliti 4c5d791c7b Upload files to "dictionaries" 2025-04-23 10:15:56 +02:00
Myrto Kallipoliti f2049819a6 Upload files to "dictionaries" 2025-04-23 10:14:47 +02:00
Myrto Kallipoliti fc7ca0e7d8 Upload files to "dictionaries" 2025-04-23 10:09:37 +02:00
Myrto Kallipoliti 8d04dc5550 Upload files to "/" 2025-04-23 10:07:29 +02:00
Miriam Baglioni e010726b45 Update functions_cluster.py 2025-04-16 09:43:58 +02:00
Miriam Baglioni 8cdfa270a7 Update functions_cluster.py 2025-04-15 17:06:11 +02:00
Myrto Kallipoliti 3853e69079 Upload files to "/" 2025-04-15 16:12:27 +02:00
Myrto Kallipoliti 255c43665e Upload files to "/" 2025-04-15 16:11:54 +02:00
Myrto Kallipoliti 7af622b2bd Upload files to "dictionaries" 2025-04-15 15:20:02 +02:00
Myrto Kallipoliti 8a8a8af732 Upload files to "dictionaries" 2025-04-15 15:18:50 +02:00
Miriam Baglioni 8b2352c919 Update dictionaries/replacements.json 2025-04-15 15:13:56 +02:00
Miriam Baglioni 0d74625e8b Update dictionaries/replacements_all.json 2025-04-15 15:11:48 +02:00
Miriam Baglioni 3e7cd72cd8 Merge remote-tracking branch 'origin/openaire-workflow-ready' into openaire-workflow-ready 2025-04-15 14:12:48 +02:00
Miriam Baglioni 592630adf4 Update oalex.py 2025-04-15 14:12:35 +02:00
Miriam Baglioni 7a9df9b72b Add oalex.py 2025-04-15 14:12:15 +02:00
Myrto Kallipoliti f5dae58a47 Update dictionaries/replacements.json 2025-04-14 15:23:46 +02:00
Myrto Kallipoliti dd89e31614 Upload files to "dictionaries" 2025-04-14 15:23:16 +02:00
Myrto Kallipoliti 0e11fdc9fb Delete dictionaries/dix_country_apr25.json 2025-04-14 15:22:42 +02:00
Myrto Kallipoliti 6e08f677ae Update dictionaries/dix_mult.json 2025-04-14 15:21:09 +02:00
Myrto Kallipoliti d3c23584b6 Update dictionaries/dix_status.json 2025-04-14 15:20:41 +02:00
Myrto Kallipoliti aa6a5699aa Update dictionaries/dix_mult_oaire.json 2025-04-14 15:20:12 +02:00
Myrto Kallipoliti 690d7995f4 Update dictionaries/dix_id_country_oaire.json 2025-04-14 15:19:59 +02:00
Myrto Kallipoliti 70ae671862 Update dictionaries/dix_country_oaire.json 2025-04-14 15:19:44 +02:00
Myrto Kallipoliti 5615e4d31b Update dictionaries/dix_city_oaire.json 2025-04-14 15:19:30 +02:00
Myrto Kallipoliti ab51386c85 Delete dictionaries/dix_city_apr25.json 2025-04-14 15:19:18 +02:00
Myrto Kallipoliti 79c5c2d402 Update dictionaries/dix_acad_oaire.json 2025-04-14 15:18:40 +02:00
Myrto Kallipoliti 27b7164b94 Update dictionaries/dix_acad.json 2025-04-14 15:18:12 +02:00
Myrto Kallipoliti 828d02f980 Upload files to "/" 2025-04-14 15:17:14 +02:00
Miriam Baglioni 38ef05dce9 [jan25]rebased 2025-04-14 15:04:46 +02:00
Myrto Kallipoliti 9909c49f66 Upload files to "txt_files" 2025-04-14 12:44:10 +02:00
Myrto Kallipoliti 8e37938f50 Delete dictionaries/dix_status.json 2025-04-14 12:43:33 +02:00
Myrto Kallipoliti 6a5dcde3d9 Delete dictionaries/dix_mult_oaire.json 2025-04-14 12:43:21 +02:00
Myrto Kallipoliti 1f5616758d Delete dictionaries/dix_mult.json 2025-04-14 12:43:07 +02:00
Myrto Kallipoliti e60699229c Delete dictionaries/dix_country_oaire.json 2025-04-14 12:42:52 +02:00
Myrto Kallipoliti 8980be55e7 Delete dictionaries/dix_acad_oaire.json 2025-04-14 12:42:33 +02:00
Myrto Kallipoliti 18fb501f68 Delete dictionaries/dix_country.json 2025-04-14 12:42:12 +02:00
Myrto Kallipoliti 7c9c7635a0 Delete dictionaries/dix_city.json 2025-04-14 12:42:03 +02:00
Myrto Kallipoliti 622e166a4a Delete dictionaries/dix_acad.json 2025-04-14 12:41:51 +02:00
Myrto Kallipoliti abc9fe937c Upload files to "dictionaries" 2025-04-14 12:41:14 +02:00
Myrto Kallipoliti b40c6c3499 Upload files to "dictionaries" 2025-04-14 12:40:54 +02:00
Myrto Kallipoliti ee4130b8f3 Upload files to "dictionaries" 2025-04-14 12:39:14 +02:00
Myrto Kallipoliti 7701f696fb Upload files to "/" 2025-04-14 12:37:00 +02:00
Myrto Kallipoliti a9d00653e6 Upload files to "txt_files" 2025-04-03 12:21:47 +02:00
Myrto Kallipoliti 2518046eb1 Upload files to "dictionaries" 2025-04-03 12:20:20 +02:00
Myrto Kallipoliti bdae908bb2 Upload files to "dictionaries" 2025-04-03 12:19:24 +02:00
Myrto Kallipoliti 9c871dd923 Upload files to "/" 2025-04-03 12:17:43 +02:00
Miriam Baglioni 4889f6482c [adding resources] adding dictionary and other minor changes 2025-04-03 11:54:32 +02:00
Miriam Baglioni 25a60409f8 [graph] adjusting pubmed to run on the graph horizontally 2025-04-03 10:43:55 +02:00
Miriam Baglioni 58d2bd0603 [crossrefANDdatacite] removed double array in matchings. renamed id as DOI in datacite 2025-03-24 14:27:39 +01:00
Miriam Baglioni a8b2b0beb6 [pubmed] fixed parameter type to withColumn 2025-03-18 10:32:38 +01:00
Miriam Baglioni cbb57a2343 [pubmed] modified to take the doi PID among the alternate identifiers 2025-03-18 10:21:10 +01:00
Miriam Baglioni 5267b632f1 [datacite] - 2025-03-17 14:16:34 +01:00
Miriam Baglioni f522ed55af [datacite] added serialization step 2025-03-17 12:51:08 +01:00
Miriam Baglioni 130989c969 [datacite] fixed typo 2025-03-17 12:01:04 +01:00
Miriam Baglioni e3c5632314 [datacite] fixed issue in schema definition 2025-03-17 11:50:22 +01:00
Miriam Baglioni 8becc886a7 [datacite] adding schemas definition 2025-03-17 11:26:12 +01:00
Miriam Baglioni 5458d3e4ab [datacite] adding parameters to resume the execution and to decide whether to extend to authors 2025-03-17 10:53:02 +01:00
Miriam Baglioni d4e02d93b5 [datacite] adding parameters to resume the execution and to decide whether to extend to authors 2025-03-17 10:50:22 +01:00
Miriam Baglioni 7ccaf6e413 [datacite] adding parameters to resume the execution and to decide whether to extend to authors 2025-03-14 19:09:47 +01:00
Miriam Baglioni 0ae91fc373 [datacite] adding parameters to resume the execution and to decide whether to extend to authors 2025-03-14 18:32:03 +01:00
Miriam Baglioni fe2cfc5b91 [datacite] adding parameters to resume the execution and to decide whether to extend to authors 2025-03-14 14:20:23 +01:00
Miriam Baglioni 29498eab83 [datacite] changed not to extend affiliation to the authors 2025-03-14 10:19:05 +01:00
Miriam Baglioni 4777ab4a40 [datacite] added serialization points 2025-03-12 14:00:02 +01:00
Miriam Baglioni 2cf614ec84 [datacite] removed not needed print 2025-03-10 16:04:07 +01:00
Miriam Baglioni ab6d437e1e [datacite] change the implementation 2025-03-10 16:02:38 +01:00
Myrto Kallipoliti 07ada4db55 Upload files to "/" 2025-03-10 10:52:54 +01:00
Myrto Kallipoliti 00adbb401f Upload files to "dictionaries" 2025-03-08 07:41:05 +01:00
Myrto Kallipoliti 5e30f5028d Upload files to "dictionaries" 2025-03-08 07:40:31 +01:00
Myrto Kallipoliti c379fffe21 Upload files to "dictionaries" 2025-03-08 07:40:05 +01:00
Myrto Kallipoliti dca233b205 Upload files to "txt_files" 2025-03-08 07:38:02 +01:00
Myrto Kallipoliti 38d8c23dbc Upload files to "/" 2025-03-08 07:37:42 +01:00
Myrto Kallipoliti 8624c0f0e4 Upload files to "txt_files" 2025-03-05 14:27:36 +01:00
Myrto Kallipoliti 9dccaa0ae4 Upload files to "/" 2025-03-05 14:26:11 +01:00
Myrto Kallipoliti c3ff71a7fd Upload files to "/" 2025-03-05 10:46:53 +01:00
Myrto Kallipoliti 4310051cb5 Upload files to "/" 2025-03-05 10:33:24 +01:00
Miriam Baglioni 4aca6faef8 [wfready_]removed scripts not for production 2025-03-04 16:15:21 +01:00
Myrto Kallipoliti 9e2c056d98 Merge pull request 'updates stop_words' (#18) from version-jan25 into openaire-workflow-ready
Reviewed-on: #18
2025-03-04 14:17:53 +01:00
mkallipo 31ee6c3460 updates stop_words 2025-03-04 14:13:55 +01:00
Myrto Kallipoliti f2dc62e481 Update txt_files/remove_list.txt 2025-03-03 19:16:29 +01:00
Myrto Kallipoliti 298c9a8b64 Update affro_cluster.py 2025-03-03 14:55:31 +01:00
Miriam Baglioni 6a92f5b1de resolved conflicts 2025-03-03 14:47:59 +01:00
Miriam Baglioni f21e6244ac Merge remote-tracking branch 'origin/version-jan25' into version-jan25 2025-03-03 14:33:22 +01:00
mkallipo e717bf36d8 no paths 2025-03-03 13:25:46 +01:00
Miriam Baglioni 1673c5e099 Merge remote-tracking branch 'origin/version-jan25' into version-jan25 2025-03-03 13:00:47 +01:00
mkallipo 5cd3229e52 no paths 2025-03-03 12:39:40 +01:00
Miriam Baglioni 77c2caad29 Merge remote-tracking branch 'origin/version-jan25' into version-jan25 2025-03-03 12:07:49 +01:00
mkallipo f7929ecc2f updates 2025-03-03 11:17:08 +01:00
mkallipo dbcd97d9be updates 2025-03-03 11:12:49 +01:00
mkallipo 10fcb7cca9 updates 2025-02-27 23:36:31 +01:00
mkallipo 129799ecd0 updates 2025-02-26 20:02:33 +01:00
Miriam Baglioni 414f07607a Merge remote-tracking branch 'origin/version-jan25' into version-jan25 2025-02-24 10:56:18 +01:00
mkallipo 7d26fc23ac updates 2025-02-24 09:58:33 +01:00
mkallipo 41605c2de0 updates 2025-02-23 17:18:18 +01:00
mkallipo 24aa99176b updates 2025-02-22 12:33:03 +01:00
mkallipo fdbdb19d75 updates 2025-02-22 12:25:17 +01:00
mkallipo 276b23b4fd corrections 2025-02-20 06:18:53 +01:00
mkallipo f365d415c5 updates 2025-02-19 11:07:48 +01:00
mkallipo cb782e95f1 test 2025-02-14 17:57:26 +01:00
mkallipo 157920ed02 test 2025-02-14 17:11:49 +01:00
Miriam Baglioni 6d7a2bfa97 [step 1]added examples 2025-02-14 16:51:14 +01:00
mkallipo a0dc9a3166 corrections 2025-02-13 15:22:04 +01:00
mkallipo 4c84c36f53 corrections 2025-02-13 15:20:29 +01:00
mkallipo 8c516fb558 corrections 2025-02-13 15:19:49 +01:00
mkallipo c9231e7064 corrections 2025-02-13 15:18:40 +01:00
Miriam Baglioni 418ba54def restored to previous version 2025-02-11 17:35:54 +01:00
Miriam Baglioni 1222241c53 changed ost with get 2025-02-11 17:10:18 +01:00
Miriam Baglioni 30f0c17348 changed the argument number 2025-02-11 16:57:10 +01:00
Miriam Baglioni bcc0a4a144 lock and unlock mdstore if run only on one datasource (datacite/pubmed) 2025-02-11 16:11:18 +01:00
Miriam Baglioni f7ce9637c6 Removing not needed import 2025-02-10 19:29:59 +01:00
mkallipo 4f8941cc22 update 2025-02-10 19:17:52 +01:00
Myrto Kallipoliti efa33eba22 Upload files to "dictionaries" 2025-02-10 16:54:12 +01:00
Miriam Baglioni 37703c6111 Remove monitor thread 2025-02-10 16:37:12 +01:00
Miriam Baglioni 6ca711d61c Delete .idea/vcs.xml 2025-02-10 16:35:18 +01:00
Miriam Baglioni 9ed2510be9 Delete .idea/modules.xml 2025-02-10 16:35:10 +01:00
Miriam Baglioni b2d4b1d2ac Delete .idea/misc.xml 2025-02-10 16:35:04 +01:00
Miriam Baglioni 57bae53c19 Delete .idea/affRo.iml 2025-02-10 16:34:58 +01:00
Miriam Baglioni d65bc6c385 Delete .idea/codeStyles/codeStyleConfig.xml 2025-02-10 16:34:51 +01:00
Miriam Baglioni f1ef61bf39 Delete .idea/codeStyles/Project.xml 2025-02-10 16:34:43 +01:00
Miriam Baglioni 730090670d Delete .idea/.gitignore 2025-02-10 16:34:31 +01:00
Miriam Baglioni c48f2e787f align with branch jan2025 2025-02-10 16:24:50 +01:00
Miriam Baglioni 0874760a8b align with branch jan2025 2025-02-10 16:22:17 +01:00
Miriam Baglioni d207df7563 align with branch jan2025 2025-02-10 15:47:35 +01:00
mkallipo ada983a780 corrections 2025-02-10 13:55:43 +01:00
mkallipo 8d4d57b33d corrections 2025-02-10 13:39:55 +01:00
mkallipo 8cc00a0e41 corrections 2025-02-06 10:59:23 +01:00
mkallipo 1890b44591 corrections 2025-02-06 00:02:18 +01:00
mkallipo ffa66e3354 corrections 2025-02-05 23:57:37 +01:00
mkallipo afe0c6e1a5 corrections 2025-02-05 23:23:24 +01:00
mkallipo b526a1f391 corrections 2025-02-05 17:11:20 +01:00
mkallipo 028f027f31 corrections 2025-02-05 15:33:40 +01:00
mkallipo f8d65d5ab1 corrections 2025-02-05 15:25:13 +01:00
mkallipo 6b78f2344b corrections 2025-02-05 14:56:03 +01:00
Miriam Baglioni 9146fe892f alignment with version-june25 2025-02-03 16:12:24 +01:00
Miriam Baglioni 3ba978a636 added oalex file to compute affiliation links 2025-02-03 15:49:46 +01:00
Miriam Baglioni baa6e3b4d9 added run on the rawaffiliationstrings by oalex to find the matches 2025-02-03 14:44:26 +01:00
Miriam Baglioni d35a9563b4 added two new files and align old version with branch version-jan2025 2025-02-03 14:39:29 +01:00
Miriam Baglioni 58f46697bf added country_names.txt 2025-02-03 14:32:50 +01:00
mkallipo cbfca1922a corrections 2025-02-03 11:55:35 +01:00
mkallipo 29ec06a7de corrections 2025-02-03 11:42:53 +01:00
mkallipo 7037355f85 updates 2025-02-03 07:07:46 +01:00
mkallipo b70bcebb12 new version 2025-01-31 21:12:55 +01:00
mkallipo c0ee7428cf new version 2025-01-31 21:09:21 +01:00
mkallipo f4c44175ac corrected lower-upper case func 2025-01-23 11:08:54 +01:00
mkallipo f9432b9a2a oalex new method 2025-01-21 14:05:38 +01:00
mkallipo e5b9780075 oalex new method 2025-01-21 14:03:03 +01:00
mkallipo cbf99acebf oalex new method 2025-01-21 14:01:26 +01:00
Miriam Baglioni c3f8acf85e Add run1
Added new script to create a matchings dataset also in this branch
2025-01-20 09:53:36 +01:00
mkallipo bc0f33973e corrections 2025-01-17 17:04:05 +01:00
mkallipo fc241e1c81 update 2025-01-16 21:01:35 +01:00
mkallipo 076c46c0ff update 2025-01-16 20:47:35 +01:00
mkallipo 1bf09651c1 update 2025-01-16 19:07:23 +01:00
mkallipo e16048b50a author model for pubmed 2025-01-15 14:29:30 +01:00
mkallipo 4824f5dd92 author model for pubmed 2025-01-15 12:34:44 +01:00
mkallipo 5644d5adc4 distinct crossref 2025-01-10 10:12:15 +01:00
Miriam Baglioni a2d9463e81 Align branch with modifications done on master 2025-01-09 11:54:26 +01:00
mkallipo 0332c85b40 matchings_affro, crossref 2025-01-09 11:41:23 +01:00
mkallipo dd316ceb4e matchings_affro, crossref 2025-01-09 10:55:43 +01:00
Miriam Baglioni 3eadd45d1a Align branch with modifications done on master 2025-01-08 14:19:05 +01:00
mkallipo fd32a8dd90 new crossref, oalex 2025-01-07 12:04:43 +01:00
mkallipo 41c7fa661a new crossref, oalex 2025-01-07 12:04:23 +01:00
mkallipo 84a47f6fdc dictionaries, new crossref 2025-01-07 11:52:04 +01:00
Myrto Kallipoliti 44f0f9987f Merge pull request 'Oalex' (#13) from openaire-workflow-ready_2 into openaire-workflow-ready
Reviewed-on: #13
2024-12-09 18:51:22 +01:00
Miriam Baglioni ad691c28c2 [oalex] change to add a thread to monitor the number of operations done by affro up to a certain point 2024-12-06 10:19:53 +01:00
Miriam Baglioni 2806511e02 [oalex] change collect_list to collect_set so that the same match appears only once 2024-12-05 21:26:08 +01:00
Miriam Baglioni 0043e4051f [oalex] renaming 2024-12-05 18:44:06 +01:00
Miriam Baglioni a59d0ce9fc [oalex] avoid redefinition of explode function 2024-12-05 18:41:16 +01:00
Miriam Baglioni e2f8007433 [oalex] added fix 2024-12-05 16:50:10 +01:00
Miriam Baglioni f8479083f2 [oalex] passing the schema to avoid changes in confidence type 2024-12-05 16:44:17 +01:00
Miriam Baglioni 9440f863c9 [oalex] changed implementation passing through rdd to avoid calling udf function 2024-12-05 16:36:38 +01:00
Miriam Baglioni f78456288c [oalex] fix issue 2024-12-05 12:54:10 +01:00
Miriam Baglioni 997f2e492f [oalex] change the call of the function in the dataframe 2024-12-05 12:35:59 +01:00
Miriam Baglioni 982a1b0b9f [oalex] change the call of the function in the dataframe 2024-12-05 12:21:21 +01:00
Miriam Baglioni 4fe3d31ed5 [oalex] register the UDF oalex_affro and the schema of the output to be used in the dataframe by pyspark 2024-12-05 12:18:45 +01:00
Miriam Baglioni efa4db4e52 [oalex] execute affRo on distinct affiliation_strings 2024-12-05 12:02:40 +01:00
Miriam Baglioni ea2e27a9f4 [oalex] fix python syntax errors 2024-12-05 11:22:10 +01:00
Miriam Baglioni e33bf4ef14 [oalex] proposal to increase the parallelization 2024-12-05 10:39:00 +01:00
Miriam Baglioni f4704aef4d [oalex] proposal to increase the parallelization 2024-12-05 10:27:32 +01:00
Miriam Baglioni 0500fc586f Added input/output path as parameters 2024-12-04 15:14:58 +01:00
Miriam Baglioni 5568aa92ec Remove from path 2024-12-03 16:54:47 +01:00
Miriam Baglioni 600ddf8087 Remove directory name
Change to make the file discoverable on the cluster
2024-12-03 16:45:57 +01:00
mkallipo 03dc19fd3b add gitignore 2024-12-01 20:04:32 +01:00
mkallipo d9dbc679e3 updates 2024-12-01 20:00:49 +01:00
mkallipo 413ec3773e updates -datacite 2024-11-21 13:32:50 +01:00
mkallipo ba98a16bcb updates -openorgs 2024-11-21 12:39:26 +01:00
mkallipo 415b45e3ca updates 2024-10-28 11:13:55 +01:00
mkallipo 8c6f6a5a9a crossref 2024-10-24 09:32:08 +02:00
mkallipo b4f79adc56 path 2024-10-18 13:19:41 +02:00
mkallipo 90426a6d29 path 2024-10-18 13:12:00 +02:00
mkallipo ad656121ed arguments 2024-10-18 10:48:18 +02:00
mkallipo ca6e8ad3b9 . 2024-10-16 13:29:39 +02:00
mkallipo 8325c94e56 strings.py 2024-10-16 12:42:51 +02:00
mkallipo 5795ec6493 general, affiliated stopwords 2024-10-07 11:45:41 +02:00
mkallipo 57569fbb3b dix_acad, zu stopword 2024-10-07 11:39:21 +02:00
mkallipo 968ecf9680 multi 2024-10-07 11:35:15 +02:00
mkallipo 2c6e7b7a70 multi 2024-10-07 11:25:16 +02:00
mkallipo 9473c30a09 dictionaries 2024-10-06 22:09:42 +02:00
mkallipo bace694d21 updates 2024-09-19 21:37:28 +02:00
mkallipo a7b703b67d updates german terms, / 2024-09-17 12:06:29 +02:00
mkallipo b38be012a0 updates abbr 2024-09-16 12:20:37 +02:00
mkallipo fbf55b3d5d redirection of non active ror ids 2024-09-12 15:56:26 +02:00
51 changed files with 36146 additions and 1520 deletions

7
.gitignore vendored Normal file

@@ -0,0 +1,7 @@
# Ignore macOS system files
.DS_Store
# Ignore Python cache files
__pycache__/
.idea

affro_cluster.py

@@ -5,27 +5,396 @@ from matching_cluster import *
from create_input_cluster import *
import json
dix_org = load_json('dictionaries/dix_acad.json')
dix_mult = load_json('dictionaries/dix_mult.json')
dix_city = load_json('dictionaries/dix_city.json')
dix_country = load_json('dictionaries/dix_country.json')
#path_dict = "dictionaries/"
path_dict = ""
prefix = ""
dix_org_ror = load_json(path_dict + 'dix_acad'+prefix+'.json')
dix_mult_ror = load_json(path_dict + 'dix_mult'+prefix+'.json')
dix_city_ror = load_json(path_dict + 'dix_city'+prefix+'.json')
dix_country_ror = load_json(path_dict + 'dix_country'+prefix+'.json')
dix_org_oaire = load_json(path_dict + 'dix_acad_oaire.json')
dix_mult_oaire = load_json(path_dict + 'dix_mult_oaire.json')
dix_city_oaire = load_json(path_dict + 'dix_city_oaire.json')
dix_country_oaire = load_json(path_dict + 'dix_country_oaire.json')
dix_status = load_json(path_dict + 'dix_status'+prefix+'.json')
#dix_grids = load_json('dictionaries/dix_grids_rors.json')
dix_id_country_ror = load_json(path_dict + 'dix_id_country'+prefix+'.json')
dix_id_country_oaire = load_json(path_dict + 'dix_id_country_oaire.json')
dix_country_legalnames = load_json(path_dict + 'dix_country_legalnames'+prefix+'.json')
dix_org = dict(dix_org_ror)
dix_org.update(dix_org_oaire)
dix_mult = dict(dix_mult_ror)
dix_mult.update(dix_mult_oaire)
dix_city = dict(dix_city_ror)
dix_city.update(dix_city_oaire)
for x in dix_city_oaire:
    if x in dix_city_ror:
        if type(dix_city_ror[x][0]) == list:
            if type(dix_city_oaire[x][0]) == str:
                dix_city[x] = dix_city_ror[x] + [dix_city_oaire[x]]
            else:
                dix_city[x] = dix_city_ror[x] + dix_city_oaire[x]
        else:
            if type(dix_city_oaire[x][0]) == str:
                dix_city[x] = [dix_city_ror[x], dix_city_oaire[x]]
            else:
                dix_city[x] = dix_city_oaire[x] + [dix_city_ror[x]]
dix_country = dict(dix_country_ror)
dix_country.update(dix_country_oaire)
for x in dix_country_oaire:
    if x in dix_country_ror:
        if type(dix_country_ror[x][0]) == list:
            if type(dix_country_oaire[x][0]) == str:
                dix_country[x] = dix_country_ror[x] + [dix_country_oaire[x]]
            else:
                dix_country[x] = dix_country_ror[x] + dix_country_oaire[x]
        else:
            if type(dix_country_oaire[x][0]) == str:
                dix_country[x] = [dix_country_ror[x], dix_country_oaire[x]]
            else:
                dix_country[x] = dix_country_oaire[x] + [dix_country_ror[x]]
dix_id_country_nc = dict(dix_id_country_ror)
dix_id_country_nc.update(dix_id_country_oaire)
# dix_mult = dix_mult_ror | dix_mult_oaire
# dix_country = dix_country_ror | dix_country_oaire
# dix_city = dix_city_ror | dix_city_oaire
# dix_id_country_nc = dix_id_country_ror | dix_id_country_oaire
dix_id_country = {x:remove_stop_words(replace_double_consonants(dix_id_country_nc[x])) for x in list(dix_id_country_nc.keys())}
# print(dix_id_country_ror['https://ror.org/03vba6259'])
# print(dix_id_country['https://ror.org/03vba6259'])
#dix_org1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_org[x] for x in dix_org}
# dix_mult1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_mult[x] for x in dix_mult}
# dix_city1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_city[x] for x in dix_city}
# dix_country1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_country[x] for x in dix_country}
dix_status_new = {k :[dix_status[k][0], dix_status[k][1].split(', ')] for k in dix_status}
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "connecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "illinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "massachusetts", "michigan", "minnesota", "mississippi", "missouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina",
    "south dakota", "tennessee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming"
]

def contains_us_state(text):
    text = text.lower()
    return any(state in text for state in us_states)
def find_ror_new1(input, simU, simG, limit):
    light_aff = input[0]
    result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
    # print('res', result)
    results_upd = []
    for r in result:
        if "openorgs" in r[2]:
            results_upd.append([r[1], 'OpenOrgs', r[2], 'active', dix_id_country[r[2]]])
        else:
            if dix_status_new[r[2]][0] == 'active':
                results_upd.append([r[1], 'ROR', r[2], 'active', dix_id_country[r[2]]])
            else:
                if dix_status_new[r[2]][1][0] == '':
                    results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
                # elif len(dix_status[r[2]][1]) == 1:
                #     results_upd.append([r[1], 'ROR', r[2], dix_status[r[2]][0]])
                #     results_upd.append([r[1], 'ROR', dix_status[r[2]][1][0], 'active'])
                else:
                    results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
                    for link in (dix_status_new[r[2]][1]):
                        results_upd.append([r[1], 'ROR', link, 'active', dix_id_country[r[2]], dix_id_country[link]])
    # print('results_upd', results_upd)
    # print('len(set(description(light_aff)[1]))', len(set(description(light_aff)[1])))
    if len(results_upd) > len(set(description(light_aff)[1])):
        final_matching = []
        light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
        # print('light_aff_tokens', light_aff_tokens)
        for id_ in results_upd:
            country = dix_id_country[id_[2]]
            # print(id_, country)
            if country == 'united states':
                # print('united states')
                if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
                    # print('found')
                    final_matching.append(id_)
            elif country == 'united kingdom':
                if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
                    final_matching.append(id_)
            elif 'korea' in country:
                if 'korea' in light_aff_tokens:
                    final_matching.append(id_)
            elif country in light_aff:
                final_matching.append(id_)
        if len(final_matching) > 0:
            return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in final_matching]
        else:
            return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
    elif len(results_upd) == len(set(description(light_aff)[1])) == 1:
        light_aff_tokens = [clean_string_ror(x) for x in set(re.split(r'[ ,]+', light_aff))]
        # print('light_aff_tokens', light_aff_tokens)
        country = dix_id_country[results_upd[0][2]]
        if country == 'united states':
            # print('united states')
            if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
                # print('found')
                return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
        elif country == 'united kingdom':
            if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
                return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
        elif 'korea' in country:
            if 'korea' in light_aff_tokens:
                return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
        elif country in light_aff:
            return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
        else:
            return []
    elif len(results_upd) > 0:
        return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
    else:
        return []
def find_ror_new(input, simU, simG, limit):
    light_aff = input[0]
    result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
    # print('res', result)
    results_upd = []
    for r in result:
        if "openorgs" in r[2]:
            results_upd.append([r[1], 'OpenOrgs', r[2], 'active', dix_id_country[r[2]]])
        else:
            if dix_status_new[r[2]][0] == 'active':
                results_upd.append([r[1], 'ROR', r[2], 'active', dix_id_country[r[2]]])
            else:
                if dix_status_new[r[2]][1][0] == '':
                    results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
                # elif len(dix_status[r[2]][1]) == 1:
                #     results_upd.append([r[1], 'ROR', r[2], dix_status[r[2]][0]])
                #     results_upd.append([r[1], 'ROR', dix_status[r[2]][1][0], 'active'])
                else:
                    results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
                    for link in (dix_status_new[r[2]][1]):
                        results_upd.append([r[1], 'ROR', link, 'active', dix_id_country[r[2]], dix_id_country[link]])
    # print('results_upd', results_upd)
    # print('len(set(description(light_aff)[1]))', len(set(description(light_aff)[1])))
    if len(results_upd) > len(set(description(light_aff)[1])):
        final_matching = []
        light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
        # print('light_aff_tokens', light_aff_tokens)
        for id_ in results_upd:
            country = dix_id_country[id_[2]]
            # print(id_, country)
            if country == 'united states':
                # print('united states')
                if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
                    # print('found')
                    final_matching.append(id_)
            elif country == 'united kingdom':
                if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
                    final_matching.append(id_)
            elif 'korea' in country:
                if 'korea' in light_aff_tokens:
                    final_matching.append(id_)
            elif country in light_aff:
                final_matching.append(id_)
        if len(final_matching) > 0:
            result_dict = [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in final_matching]
            return result_dict
        else:
            return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
    elif len(results_upd) > 0:
        return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
    else:
        result_dict = []
        return result_dict
def affro(raw_aff_string):
    lucky_guess = clean_string_ror(raw_aff_string)
    try:
        result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.65, 0.82)
        if len(result) > 0:
            result_dict = [json.dumps({'Origin': 'affRo', 'RORid':x[2], 'Confidence':x[1]}) for x in result]
        if lucky_guess in dix_org:
            # print('lucky')
            if dix_mult[lucky_guess] == "unique":
                # print('unique')
                if 'openorgs' in dix_org[lucky_guess]:
                    return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
                else:
                    if dix_status_new[dix_org[lucky_guess]][0] == 'active':
                        # print('lucky, active')
                        return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
                    elif dix_status_new[dix_org[lucky_guess]][1][0] == '':
                        # print('lucky not active, not succesor')
                        return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0], 'Country':dix_id_country[dix_org[lucky_guess]]}]
                    else:
                        # print('lucky not active, succesor')
                        res = [{'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0], 'Country':dix_id_country[dix_org[lucky_guess]]}]
                        # print('res', res)
                        for successor in dix_status_new[dix_org[lucky_guess]][1]:
                            res.append({'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': successor, 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]})
                        return res
            else:
                cand_ids = [x[1] for x in dix_city_ror[lucky_guess] if dix_status_new[x[1]][0] == 'active']
                # print('cand_ids', cand_ids)
                if len(cand_ids) == 1:
                    if 'OpenOrgs' in dix_org[lucky_guess]:
                        return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
                    else:
                        return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
                else:
                    return []
        else:
            result_dict = []
            return result_dict
            # print("not lucky")
            result = find_ror_new(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
            return result
    except Exception as e:
        # Return some indication of an error, or log the row
        print(f"Error: {str(e)}")
        print(raw_aff_string)
        pass
#raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
def affro_config(raw_aff_string, rad_u, sim_u, sim_g, limit):
    lucky_guess = clean_string_ror(raw_aff_string)
    try:
        if lucky_guess in dix_org:
            if dix_mult[lucky_guess] == "unique":
                if 'openorgs' in dix_org[lucky_guess]:
                    return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
                else:
                    if dix_status_new[dix_org[lucky_guess]][0] == 'active':
                        return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
                    elif dix_status_new[dix_org[lucky_guess]][1] == '':
                        return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0]}]
                    else:
                        res = [{'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0]}]
                        for successor in dix_status_new[dix_org[lucky_guess]][1]:
                            res.append({'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': successor, 'Confidence': 1, 'Status': 'active'})
                        return res
            else:
                cand_ids = [x[1] for x in dix_city_ror[lucky_guess] if dix_status_new[x[1]][0] == 'active']
                # print('cand_ids', cand_ids)
                if len(cand_ids) == 1:
                    if 'OpenOrgs' in dix_org[lucky_guess]:
                        return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
                    else:
                        return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
                else:
                    return []
        else:
            result = find_ror_new(create_df_algorithm(raw_aff_string, rad_u), sim_u, sim_g, limit)
            return result
    except Exception as e:
        # Return some indication of an error, or log the row
        print(f"Error: {str(e)}")
        print(raw_aff_string)
        pass
def matchings_affro(aff_string):
    # global operation_counter
    try:
        matchings = affro(aff_string)
        # operation_counter += 1
        # Ensure matchings is a list, even if affro returns a single dict
        if not isinstance(matchings, list):
            matchings = [matchings]
        # Create the result as a tuple that matches matchings_schema
        result = []
        for matching in matchings:
            # Assuming 'matching' is a dictionary that contains 'Provenance', 'PID', 'Value', 'Confidence', 'Status'
            result.append((
                matching.get("Provenance", None),
                matching.get("PID", None),
                matching.get("Value", None),
                float(matching.get("Confidence", None)),
                matching.get("Status", None)
            ))
        if len(result) > 0:
            return result
    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return ()
if __name__ == "__main__":

@@ -38,3 +407,5 @@ if __name__ == "__main__":
    # float_arg2 = float(sys.argv[3])
    print(affro(string_arg))
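
For orientation, a minimal smoke test of the affro entry point defined above, assuming the module name affro_cluster.py shown in this diff and the dictionaries available on the working directory; the affiliation string and the printed fields are illustrative, not taken from a real run:

from affro_cluster import affro

# Hypothetical input; any raw affiliation string works.
matches = affro("Department of Physics, University of Athens, Greece")
# When a match is found, each element has the shape built above:
# {'Provenance': 'AffRo', 'PID': 'ROR' or 'OpenOrgs', 'Value': <org id>,
#  'Confidence': <float>, 'Status': <ror status>, 'Country': <country>}
for m in matches or []:
    print(m["PID"], m["Value"], m["Confidence"])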


@@ -1,28 +0,0 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import sys
from affro_cluster import *
# Initialize SparkSession
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()
# Register the function as a UDF
affro_udf = udf(affro, StringType())
# Input list of strings
input_data = ["university of athens", "university of vienna", "UCLA"]
# # Convert the list to a Spark DataFrame
df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")
# # Apply your custom UDF to the DataFrame
df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))
df_with_custom_value.show(truncate=False)
# Stop the SparkSession
spark.stop()

163
authors_model.py Normal file

@@ -0,0 +1,163 @@
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
import sys
from threading import Thread
from affro_cluster import *
from schemas import *
spark = SparkSession.builder.appName("AffRo - Author model").getOrCreate()
input_file = sys.argv[1]
author_file = sys.argv[2]
matching_array_schema = ArrayType(
    StructType([
        StructField("Provenance", StringType(), nullable=False),
        StructField("PID", StringType(), nullable=False),
        StructField("Value", StringType(), nullable=False),
        StructField("Confidence", DoubleType(), nullable=False),
        StructField("Status", StringType(), nullable=False)
    ])
)

affro_udf = udf(matchings_affro, matching_array_schema)

exploded = spark.read.json(input_file) \
    .filter(col("id").isNotNull()) \
    .select(
        col("id"),
        explode(col("author")).alias("author")  # this allows to split all the raw_aff_string and to parallelize better
    )

# Explode the "author.rawAffiliationString" column into separate rows
affiliations_exploded = exploded.withColumn(
    "Affiliation",
    explode(col("author.rawAffiliationString"))
)

matching_df = (
    affiliations_exploded
    .filter(col("Affiliation").isNotNull())  # Keep rows with valid "Affiliation"
    .select("Affiliation")  # Select only the "Affiliation" column
    .distinct()  # Remove duplicate rows based on "Affiliation"
    .select(
        col("Affiliation"),
        affro_udf(col("Affiliation")).alias("Matchings")  # Apply UDF to "Affiliation"
    )
    .filter(col("Matchings").isNotNull())  # Exclude rows with null "Matchings"
)
# x: id, author, rawaffiliationstring, matchings
def toAuthorModel(x):
    name = {}
    name['Full'] = x['author']['fullname']
    name['First'] = x['author']['name']
    name['Last'] = x['author']['surname']
    orcid = None
    if 'pid' in x['author']:
        try:
            for p in x['author']['pid']:
                if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
                    orcid = p['value']
        except:
            pass
    name['orcid'] = orcid
    ret = {'key': x['id']+x['author']['fullname'], 'id': x['id'], 'author': name, 'Raw_affiliation': x['Affiliation'], 'Matchings': getMatchings(x['Matchings'])}
    return ret

def getMatchings(matches):
    matchings = []
    for m in matches:
        matchings.append({"Provenance":m['Provenance'], "PID":m["PID"], "Value":m["Value"], "Confidence":m["Confidence"], "Status":m["Status"]})
    return matchings

def regroupAndSelectDistinctMatch(x):
    ret = []
    dic = {}
    for m in x:
        for e in m:
            if e['Status'] == 'active':
                if not e['Value'] in dic:
                    dic[e['Value']] = 0
                if dic[e['Value']] < float(e['Confidence']):
                    dic[e['Value']] = float(e['Confidence'])
    for e in dic:
        ret.append({"Provenance":"AffRo", "PID":"ROR", "Value":e, "Confidence":dic[e], "Status":"active"})
    return ret

def aggregateAuthor(group):
    affiliations = []
    matchings = []
    for e in group:
        affiliations.append(e['Raw_affiliation'])
        matchings.append(e['Matchings'])
    return {"id": e['id'], "author": e["author"], "Raw_affiliation": affiliations, "Matchings": regroupAndSelectDistinctMatch(matchings)}

def aggregateResult(group):
    authors = []
    matchings = []
    for e in group:
        amatch = getMatchings(e["Matchings"])
        authors.append({
            "Name": {"First": e["author"]['First'], "Last": e["author"]["Last"], "Full": e["author"]["Full"], "orcid": e["author"]["orcid"]},
            "Corresponding": None,
            "Contributor_roles": None,
            "Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
            "Matchings": amatch
        })
        matchings.append(amatch)
    ret = {"id": group[0]["id"], "Authors": authors, "Organizations": regroupAndSelectDistinctMatch(matchings)}
    return ret
applyMatch_udf = udf(toAuthorModel, match_author_schema)
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
aggregateResult_udf = udf(aggregateResult, author_schema)
#spark = SparkSession.builder.appName("AffRo-Matchings").getOrCreate()
#matching_df = spark.read.json(maching_file)
# exploded = spark.read.schema(result_schema).json(input_file) \
# .select(
# col("id"),
# explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
# )
# affiliations_exploded = exploded.withColumn(
# "Affiliation",
# explode(col("author.rawAffiliationString"))
# )
extend = (
    matching_df
    .join(affiliations_exploded, on="Affiliation")
    .filter(col("Matchings").isNotNull())
)

applyMatchDf = extend \
    .withColumn("application", applyMatch_udf(struct("*"))) \
    .select("application.*")

groupedAuthorDf = applyMatchDf \
    .groupBy("key").agg(collect_list(struct("*")).alias("group")) \
    .withColumn("aggresult", aggregate_udf("group")) \
    .select("aggresult.*")

groupedResultDf = groupedAuthorDf \
    .groupBy("id").agg(collect_list(struct("*")).alias("group")) \
    .withColumn("result", aggregateResult_udf("group")) \
    .select("result.*")
groupedResultDf.write.mode("overwrite").json(author_file, compression="gzip")
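
As a reading aid, regroupAndSelectDistinctMatch above deduplicates the matchings collected for a group: it keeps only entries with Status 'active' and, per Value, the highest Confidence. A self-contained sketch of that selection logic, with invented sample data:

# Two groups of matchings as collect_list would deliver them (data made up).
groups = [
    [{"Value": "https://ror.org/one", "Confidence": 0.7, "Status": "active"}],
    [{"Value": "https://ror.org/one", "Confidence": 0.9, "Status": "active"},
     {"Value": "https://ror.org/two", "Confidence": 0.8, "Status": "inactive"}],
]
best = {}
for group in groups:
    for e in group:
        if e["Status"] == "active":
            # keep the maximum confidence seen for each Value
            best[e["Value"]] = max(best.get(e["Value"], 0.0), float(e["Confidence"]))
print(best)  # {'https://ror.org/one': 0.9}; the inactive match is dropped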

create_input_cluster.py

@@ -1,77 +1,91 @@
from functions_cluster import *
def create_df_algorithm(raw_aff_string):
    aff_no_symbols_d = substrings_dict(clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string))))
    substring_list = list(aff_no_symbols_d.values())

    def valueToCategory(value):
        flag = 0
        for k in categ_dicts:
            if k in value and categ_dicts[k] in categ_string.split('|'):
                flag = 1
        return flag

    # tokenization

protect = ['national univer ireland',
           'univer',
           'univer california',
           'univer colege hospital',
           'univer colege',
           'univer hospital',
           'imperial colege',
           'city univer',
           'univer medical school',
           'california state univer',
           'national techn univer',
           'techn univer',
           'islamic azad univer',
           'univer nevada',
           'univer maryland',
           'state univer',
           'rijksuniver',
           'rijks univer',
           'univer medical center'
           ]
def create_df_algorithm(raw_aff_string, radius_u):
    clean_aff = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
    #print(0, clean_aff)
    countries_list = description(clean_aff)[1]
    aff_no_symbols_d = substrings_dict(reduce(clean_aff))
    #print(0.5, aff_no_symbols_d)
    substring_list = [replace_abbr_univ(x) for x in list(aff_no_symbols_d.values())]
    #print(1, substring_list)
    i = 0
    while i < len(substring_list) - 1:
        if is_contained('progr', substring_list[i]) and is_contained('dep', substring_list[i+1]):
            substring_list.pop(i)
        elif (is_contained('assistant', substring_list[i]) or is_contained('researcher', substring_list[i]) or is_contained('phd', substring_list[i]) or is_contained('student', substring_list[i]) or is_contained('section', substring_list[i]) or is_contained('prof', substring_list[i]) or is_contained('director', substring_list[i])) and (not is_contained('school', substring_list[i+1]) or is_contained('univ', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('lab', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
            substring_list.pop(i)
        elif (is_contained('engineer', substring_list[i]) or is_contained('progr', substring_list[i]) or is_contained('unit', substring_list[i]) or is_contained('lab', substring_list[i]) or is_contained('dep', substring_list[i]) or is_contained('school', substring_list[i]) or is_contained('inst', substring_list[i])  #or is_contained('hosp', substring_list[i])
              or is_contained('fac', substring_list[i])) and is_contained('univ', substring_list[i+1]):
            if not is_contained('univ', substring_list[i]):
                if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names):  #substring_list[i+1] in city_names:
                    substring_list[i] = substring_list[i] + ' ' + substring_list[i+1]
                    i = i+2
                    continue
        elif ('assistant' in substring_list[i] or 'researcher' in substring_list[i] or 'phd' in substring_list[i] or 'student' in substring_list[i] or 'section' in substring_list[i] or 'prof' in substring_list[i] or 'director' in substring_list[i]) and (not 'school' in substring_list[i+1] or 'univ' in substring_list[i+1] or 'inst' in substring_list[i+1] or 'lab' in substring_list[i+1] or 'fac' in substring_list[i+1]):
            if not 'univ' in substring_list[i]:
                substring_list.pop(i)
            else:
                i = i+1
        elif ('engineer' in substring_list[i] or 'progr' in substring_list[i] or 'unit' in substring_list[i] or 'dep' in substring_list[i] or 'school' in substring_list[i]  #or 'lab' in substring_list[i] # or 'inst' in substring_list[i] #or is_contained('hosp', substring_list[i])
              or 'fac' in substring_list[i]) and 'univ' in substring_list[i+1]:
            if not 'univ' in substring_list[i]:
                substring_list.pop(i)
            else:
                i = i+1
            continue
        elif is_contained('lab', substring_list[i]) and (is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('school', substring_list[i+1])):
            if not is_contained('univ', substring_list[i]):
                substring_list.pop(i)
            else:
                i = i+1
            continue
        elif is_contained('dep', substring_list[i]) and (is_contained('tech', substring_list[i+1]) or is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('school', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
            if not is_contained('univ', substring_list[i]):
                substring_list.pop(i)
            else:
                i = i+1
            continue
        elif is_contained('inst', substring_list[i]) and (is_contained('school', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('acad', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('clin', substring_list[i+1]) or is_contained('klin', substring_list[i+1]) or is_contained('fak', substring_list[i+1]) or is_contained('fac', substring_list[i+1]) or is_contained('cent', substring_list[i+1]) or is_contained('div', substring_list[i+1])):
            if not is_contained('univ', substring_list[i]):
                substring_list.pop(i)
            else:
                i = i+1
            continue
        elif is_contained('school', substring_list[i]) and is_contained('colege', substring_list[i+1]):
            if not is_contained('univ', substring_list[i]):
                substring_list.pop(i)
            else:
                i = i+1
            continue
        # elif 'lab' in substring_list[i] and ('colege' in substring_list[i+1] or 'dep' in substring_list[i+1] or 'school' in substring_list[i+1]):
        #     if not 'univ' in substring_list[i]:  #'inst' in substring_list[i+1] or
        #         substring_list.pop(i)
        #     else:
        #         i = i+1
        #     continue
        else:
            i += 1
    # print(1.4, substring_list)
    light_aff = (', '.join((substring_list)))
    for x in substring_list:
        if x in city_names+remove_list:
            substring_list.remove(x)
    substring_list = [shorten_keywords_spark([x])[0] for x in substring_list]
    # print(1.5, light_aff)
    substring_list = [x for x in substring_list if x.replace(' gmbh','') not in city_names+remove_list]
    # print(1.7, substring_list)
    substring_list0 = [shorten_keywords([x], radius_u) for x in substring_list if len(shorten_keywords([x], radius_u)) > 0]
    # print(2, substring_list0)

    def valueToCategory(value):
        flag = 0
        for k in categ_dicts:
            if k in value:
                flag = 1
        return flag

    substring_list1 = [inner for outer in substring_list0 for inner in outer]
    # print(3, substring_list1)

    aff_list = [{"index": i, "keywords": substring_list[i], "category": valueToCategory(substring_list[i])} for i in range(len(substring_list))]
    aff_list = [{"index": i, "keywords": substring_list1[i], "category": valueToCategory(substring_list1[i])} for i in range(len(substring_list1))]
    filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
    return [light_aff, filtered_list]
    return [clean_aff, light_aff, filtered_list, countries_list]
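
Given the import "from create_input_cluster import *" at the top of affro_cluster.py, this is the module that supplies create_df_algorithm. A hedged call sketch of the new signature; the input string is invented and radius_u=3 mirrors the value affro() passes above:

from create_input_cluster import create_df_algorithm

# The new version returns four values instead of two.
clean_aff, light_aff, keywords, countries = create_df_algorithm(
    "1 Dept. of Chemistry, Leiden University, Netherlands", 3)
# keywords is a list of {"index": ..., "keywords": ..., "category": 1} entries
# that Aff_Ids then matches against the ROR/OpenOrgs dictionaries.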

62
crossref.py Normal file

@@ -0,0 +1,62 @@
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
from utils import *
from affro_cluster import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size
import sys
spark = SparkSession.builder.appName("AffRo-Crossref").getOrCreate()
folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
matchings_schema = ArrayType(
    StructType([
        StructField("Provenance", StringType(), nullable=True),
        StructField("PID", StringType(), nullable=True),
        StructField("Value", StringType(), nullable=True),
        StructField("Confidence", DoubleType(), nullable=True),
        StructField("Status", StringType(), nullable=True)
    ])
)

affro_udf = udf(matchings_affro, matchings_schema)
distinct_matchings_udf = udf(regroupAndSelectDistinctMatch, matchings_schema)

exploded = spark.read.json(folder_path) \
    .filter(col("DOI").isNotNull()) \
    .select(
        col("DOI").alias("DOI"),
        explode(col("author")).alias("author")  # this allows to split all the raw_aff_string and to parallelize better
    )

affiliations_exploded = exploded.withColumn("affiliation", explode(col("author.affiliation")))

result = (
    affiliations_exploded
    .filter(col("affiliation.name").isNotNull())
    .select(
        col("DOI"),
        col("affiliation.name").alias("raw_affiliation_string")
    )
    .withColumn("raw_affiliation_string", affro_udf(col("raw_affiliation_string")))
    .groupBy("DOI")
    .agg(collect_set("raw_affiliation_string").alias("group"))
    .withColumn("Matchings", aggregate_udf("group"))  # Use collect_set for unique values
    .filter((col("Matchings").isNotNull()) & (size(col("Matchings")) > 0))
)

result.write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json(hdfs_output_path)
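
A likely invocation, inferred from the two positional arguments read above; the paths are placeholders, not taken from a real deployment:

# spark-submit crossref.py /path/to/crossref/dump /path/to/output/matchings

Note that the grouping by DOI uses collect_set rather than collect_list, so identical matchings for the same DOI are kept only once, which is what the "change collec_list to collect_set" commit in the log above refers to.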

104
datacite.py Normal file

@@ -0,0 +1,104 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
from affro_cluster import *
from schemas import *
from utils import *
import sys
folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
startFrom = sys.argv[3]
affiliationToAuthor = False
# Initialize Spark session
spark = SparkSession.builder.appName("AffRo-DataCite").getOrCreate()
affro_udf = udf(matchings_affro, matching_array_schema)
applyMatch_udf = udf(toAuthorModel, match_author_schema)
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
aggregateResult_udf = udf(aggregateResult, author_schema)
aggregateResultNoAuthor_udf = udf(aggregateResultNoAuthor, author_schema)
if(startFrom == 'Matchings'):
df = spark.read.option("mode", "PERMISSIVE").parquet(folder_path)
df_parsed = df.withColumn("json_parsed", from_json(col("json"), json_schema))
exploded = df_parsed.select(
col("json_parsed.attributes.doi").alias("id"), # Mapping "id" to "doi"
explode(col("json_parsed.attributes.creators")).alias("author") # Exploding creators instead of attributes
).filter(col("id").isNotNull())
exploded.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/exploded", compression="gzip")
affiliations_exploded = exploded.withColumn(
"Affiliation",
explode(col("author.affiliation").alias("Affiliation"))
)
affiliations_exploded.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/affiliations_exploded", compression="gzip")
matching_df = (
affiliations_exploded
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
.select("Affiliation") # Select only the "Affiliation" column
.distinct() # Remove duplicate rows based on "Affiliation"
.select(
col("Affiliation"),
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
)
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
)
matching_df.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/matching_df", compression="gzip")
matching_df = spark.read.schema(affiliation_schema).json("/tmp/miriam/affroOnDatacite/matching_df")
affiliations_exploded = spark.read.schema(affiliation_exploded_schema).json("/tmp/miriam/affroOnDatacite/affiliations_exploded")
if(not affiliationToAuthor):
aff_per_paper = (affiliations_exploded
.select("id","Affiliation")
.distinct())
aff_per_paper.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/aff_per_paper", compression="gzip")
(matching_df
.join(aff_per_paper, on="Affiliation")
.filter(col("Matchings").isNotNull())
.groupBy("id").agg(collect_list(struct("*")).alias("group"))
.withColumn("result", aggregateResultNoAuthor_udf("group"))
.select("result.*")
.write.mode("overwrite").json(hdfs_output_path, compression="gzip")
)
else:
extend = (
matching_df
.join(affiliations_exploded, on="Affiliation")
.filter(col("Matchings").isNotNull())
)
extend.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/extend", compression="gzip")
applyMatchDf = extend \
.withColumn("application", applyMatch_udf(struct("*"))) \
.select("application.*")
applyMatchDf.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/applyMatchDf", compression="gzip")
groupedAuthorDf = applyMatchDf \
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
.withColumn("aggresult", aggregate_udf("group")) \
.select("aggresult.*")
groupedAuthorDf.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/groupedAuthorDf", compression="gzip")
groupedResultDf = groupedAuthorDf \
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
.withColumn("result", aggregateResult_udf("group")) \
.select("result.*")
groupedResultDf.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/groupedResultDf", compression="gzip")
groupedResultDf.write.mode("overwrite").json(hdfs_output_path, compression="gzip")

30
dataset.py Normal file

@@ -0,0 +1,30 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
import sys
from affro_cluster import *
from schemas import *
from utils import *
spark = SparkSession.builder.appName("AffRo - Dataset").getOrCreate()
# The path of the dataset to be mapped. It contains all the affiliation strings to be processed.
# The outcome will be appended to the output path.
input_path = sys.argv[1]
#the output file
output_path = sys.argv[2]
affro_udf = udf(matchings_affro, matching_array_schema)
input_dataset = spark.read.schema(affiliation_string_schema).json(input_path)
( input_dataset
.select(col("raw_affiliation_string").alias("Affiliation"),
affro_udf(col("raw_affiliation_string")).alias("Matchings") )
.filter(col("Matchings").isNotNull())
.write
.mode("append")
.json(output_path)
)
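A hedged invocation sketch (placeholder paths): per affiliation_string_schema, each input JSON line is expected to carry at least a raw_affiliation_string field.

spark-submit dataset.py /path/to/affiliation_strings /path/to/output
# example input line (illustrative): {"raw_affiliation_string": "Leiden Institute of Chemistry, Leiden University"}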

BIN
dictionaries/.DS_Store vendored Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

@@ -0,0 +1,60 @@
{"copernicus institu sustainable development" : "openorgs____::0000102848",
"copernicus institu sustainable development energy resources" : "openorgs____::0000102849",
"copernicus institu sustainable development environmental governance" : "openorgs____::0000102850",
"copernicus institu sustainable development environmental scien" : "openorgs____::0000102851",
"copernicus institu sustainable development inovation studies" : "openorgs____::0000102852",
"leiden institu advanced computer scien" : "openorgs____::0000102908",
"gorlaeus labora" : "openorgs____::0000102887",
"leiden institu chemistry gorlaeus labora" : "openorgs____::0000102887",
"institu biology leiden" : "openorgs____::0000102854",
"leiden institu chemistry" : "openorgs____::0000102880",
"leiden malaria research group": "openorgs____::0000102937",
"sylvius labora" : "openorgs____::0000102938",
"leiden mathematical institu" : "openorgs____::0000102939",
"leiden faculty archaeology" : "openorgs____::0000102940",
"erevnitiko idrima pl" : "openorgs____::0000103020",
"frederick research center" : "openorgs____::0000103018",
"cyens center excelence" : "openorgs____::0000103019",
"space systems solutions ltd" : "openorgs____::0000103021",
"eratosthenes center excelence" : "openorgs____::0000103022",
"cyric cyprus research and innovation center ltd" : "openorgs____::0000103023",
"danaos shiping company limited" : "openorgs____::0000103024",
"cyprus space exploration organization" : "openorgs____::0000103025",
"cyprus museum" : "openorgs____::0000103027",
"larnaca general hospital" : "openorgs____::0000103028",
"agricultural research institu" : "openorgs____::0000103029",
"research education institu child health" : "openorgs____::0000103030",
"german oncology center" : "openorgs____::0000103031",
"cyprus neuroscien techn institu" : "openorgs____::0000103032",
"salzgiter manesman forschung" : "openorgs____::0000103035",
"aesculap ag" : "openorgs____::0000103036",
"telekom inovation labora" : "openorgs____::0000103037",
"dlr institu vernetzte energiesysteme" : "openorgs____::0000103038",
"akademie ofentliches gesundheitswesen duseldorf" : "openorgs____::0000103039",
"ibe rd institu lung health" : "openorgs____::0000103040",
"herzentrum leipzig" : "openorgs____::0000103041",
"bundesforschungsanstalt fischerei" : "openorgs____::0000098305",
"osteuropa institu" : "openorgs____::0000103042",
"hochschule politik Munchen" : "openorgs____::0000103043",
"qualcom cdma techn" : "openorgs____::0000103046",
"kompetenzentrum obstbau bodensee" : "openorgs____::0000103047",
"institu angewandte qualitatsforderung forschung im gesundheitswesen" : "openorgs____::0000103048",
"dresearch digital media systems":"openorgs____::0000103049",
"eemagine medical imaging solutions": "openorgs____::0000103050",
"forschungszentrum energietechnologie":"openorgs____::0000103051",
"european radiation dosimetry group" : "openorgs____::0000103052",
"ge healthcare":"openorgs____::0000103053",
"global energy interconection research institu europe":"openorgs____::0000103054",
"proteros biostructures": "openorgs____::0000103055",
"frankfurter algemeine zeitung" : "openorgs____::0000103056",
"stiftung neanderthal museum": "openorgs____::0000103057",
"nvision imaging techn" : "openorgs____::0000103058",
"institu scien networking oldenburg" :"openorgs____::0000103059",
"zentrum internationale bildungsvergleichstudien" : "openorgs____::0000103060",
"evangelisches klinikum bethel": "openorgs____::0000103061",
"internationale hochschule liebenzel": "openorgs____::0000103062",
"walter schotky institu": "openorgs____::0000103063",
"institu phytopathologie": "openorgs____::0000103065",
"helios klinikum wupertal": "openorgs____::0000103066",
"univer herzentrum hamburg": "openorgs____::0000103067",
"zentrum graphische datenverarbeitung" : "openorgs____::0000017671"}

@@ -1 +1 @@
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
{"research": "Academia", "centro": "Academia", "recherche": "Academia", "uniwersytet": "Academia", "investigacions": "Academia", "institu": "Academia", "istitut": "Academia", "univ": "Academia", "col": "Academia", "center": "Academia", "polytechnic": "Academia", "tech": "Academia", "politecnico": "Academia", "polutekhneio": "Academia", "prifysgol": "Academia", "centre": "Academia", "kentro": "Academia", "politechnika": "Academia", "szkola": "Academia", "faculty": "Academia", "school": "Academia", "academ": "Academia", "akadem": "Academia", "schule": "Academia", "ecole": "Academia", "observ": "Academia", "escuela": "Academia", "escola": "Academia", "unibertsi": "Academia", "yliopisto": "Academia", "instytut": "Academia", "zentrum": "Academia", "panepist": "Academia", "lab": "Academia", "scien": "Academia", "cientific": "Academia", "engingeering": "Academia", "egyetem": "Academia", "sterewacht": "Academia", "tecnol": "Academia", "data": "Academia", "hospital": "Hospitals", "clinic": "Hospitals", "hopital": "Hospitals", "klinik": "Hospitals", "oncol": "Hospitals", "medical": "Hospitals", "health": "Hospitals", "medicin": "Hospitals", "nosokomei": "Hospitals", "krankenhaus": "Hospitals", "ziekenhuis": "Hospitals", "spital": "Hospitals", "cancer": "Hospitals", "pharma": "Hospitals", "therapeutics": "Hospitals", "ospedale": "Hospitals", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "ltd": "Company", "foundation": "Foundations", "asociation": "Foundations", "organization": "Foundations", "society": "Foundations", "group": "Foundations", "royal": "Foundations", "ofice": "Foundations", "trust": "Foundations", "survey": "Foundations", "museum": "Foundations", "library": "Foundations", "bank": "Foundations", "comision": "Foundations", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "squib": "Specific", "intel": "Specific", "motorola": "Specific", "samsung": "Specific", "hitachi": "Specific", "roche": "Specific", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "novartis": "Specific", "microsoft": "Specific", "ibm": "Specific", "alergan": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific", "boehringer ingelheim": "Specific", "demokritos": "Specific", "siemens": "Specific", "forth": "Specific", "eli lily": "Specific", "boeing": "Specific", "alphabet": "Specific", "johnson johnson": "Specific", "moderna": "Specific", "bayer": "Specific", "huawei": "Specific", "amazon": "Specific", "sinopec": "Specific", "novo nordisk": "Specific", "sanofi": "Specific", "amgen": "Specific", "abvie": "Specific", "basf": "Specific", "toyota": "Specific", "gilead": "Specific", "teagasc": "Specific", "apc microbiome": "Specific", "nestle": "Specific", "perkinelmer": "Specific", "shel": "Specific", "visteon": "Specific", "hewlet packard": "Specific", "fujitsu": "Specific", "toshiba": "Specific", "eth zurich": "Acronyms", "riken": "Acronyms", "ucd": "Acronyms", "tcd": "Acronyms", "nasa": "Acronyms", "bgi": "Acronyms", "ntt": "Acronyms", "dow": "Acronyms", "ustc": "Acronyms", "zju": "Acronyms", "pku": "Acronyms", "ucas": "Acronyms", "nju": "Acronyms", "scu": "Acronyms", "mit": "Acronyms", "hust": "Acronyms", "ucla": "Acronyms", "uc san diego": "Acronyms", "uc berkeley": "Acronyms", "jhu": 
"Acronyms", "uchicago": "Acronyms", "caltech": "Acronyms", "usp": "Acronyms", "conicet": "Acronyms", "unicamp": "Acronyms", "cnrs": "Acronyms", "ucl": "Acronyms", "csic": "Acronyms", "epfl": "Acronyms", "ucph": "Acronyms", "tum": "Acronyms", "lmu": "Acronyms", "ku leuven": "Acronyms", "ircs": "Acronyms", "nhs": "Acronyms", "discovery programe": "Acronyms", "ncsr demokritos": "Acronyms", "nui galway": "Acronyms", "nui maynooth": "Acronyms", "tu wien": "Acronyms", "tu dublin": "Acronyms", "athena": "Acronyms", "openaire": "Acronyms", "erasmus": "Acronyms", "ist austria": "Acronyms", "nit": "Acronyms", "cern": "Acronyms", "lg": "Acronyms", "chu": "Acronyms", "jst": "Acronyms", "lumc": "Acronyms"}

File diff suppressed because one or more lines are too long

@@ -0,0 +1,66 @@
{"copernicus institu sustainable development" : ["utrecht" , "openorgs____::0000102848"],
"copernicus institu sustainable development energy resources" : ["utrecht" , "openorgs____::0000102849"],
"copernicus institu sustainable development environmental governance": ["utrecht", "openorgs____::0000102850"],
"copernicus institu sustainable development environmental scien" : ["utrecht","openorgs____::0000102851"],
"copernicus institu sustainable development inovation studies" : ["utrecht", "openorgs____::0000102852"],
"leiden institu advanced computer scien" : ["leiden", "openorgs____::0000102908"],
"gorlaeus labora" : ["leiden", "openorgs____::0000102887"],
"leiden institu chemistry gorlaeus labora" : ["leiden", "openorgs____::0000102887"],
"institu biology leiden" : ["leiden", "openorgs____::0000102854"],
"leiden institu chemistry" : ["leiden", "openorgs____::0000102880"],
"leiden malaria research group": ["leiden", "openorgs____::0000102937"],
"sylvius labora" : ["leiden", "openorgs____::0000102938"],
"leiden mathematical institu" : ["leiden", "openorgs____::0000102939"],
"leiden faculty archaeology" : ["leiden", "openorgs____::0000102940"],
"erevnitiko idrima pl":["nicosia","openorgs____::0000103020"],
"frederick research center":["nicosia","openorgs____::0000103018"],
"cyens center excelence":["nicosia","openorgs____::0000103019"],
"space systems solutions ltd":["nicosia","openorgs____::0000103021"],
"eratosthenes center excelence":["limasol","openorgs____::0000103022"],
"cyric cyprus research and innovation center ltd":["nicosia","openorgs____::0000103023"],
"danaos shiping company limited":["limasol","openorgs____::0000103024"],
"cyprus space exploration organization":["nicosia","openorgs____::0000103025"],
"cyprus museum":["nicosia","openorgs____::0000103027"],
"larnaca general hospital":["larnaca","openorgs____::0000103028"],
"agricultural research institu":["nicosia","openorgs____::0000103029"],
"research education institu child health":["nicosia","openorgs____::0000103030"],
"german oncology center":["limasol","openorgs____::0000103031"],
"cyprus neuroscien techno institu":["nicosia","openorgs____::0000103032"],
"salzgiter manesman forschung":["salzgiter","openorgs____::0000103035"],
"aesculap ag" : ["tutlingen","openorgs____::0000103036"],
"telekom inovation labora" : ["berlin","openorgs____::0000103037"],
"dlr institu vernetzte energiesysteme" : ["oldenburg","openorgs____::0000103038"],
"akademie ofentliches gesundheitswesen duseldorf" : ["duseldorf","openorgs____::0000103039"],
"ibe rd institu lung health" : ["munster","openorgs____::0000103040"],
"herzentrum leipzig" : ["leipzig","openorgs____::0000103041"],
"bundesforschungsanstalt fischerei" : ["hamburg","openorgs____::0000098305"],
"osteuropa institu" : ["berlin","openorgs____::0000103042"],
"hochschule politik Munchen" : ["munich","openorgs____::0000103043"],
"qualcom cdma techn" : ["munich","openorgs____::0000103046"],
"kompetenzentrum obstbau bodensee" : ["ravensburg","openorgs____::0000103047"],
"institu angewandte qualitatsforderung forschung im gesundheitswesen" : ["gotingen","openorgs____::0000103048"],
"dresearch digital media systems": ["berlin","openorgs____::0000103049"],
"eemagine medical imaging solutions": ["berlin","openorgs____::0000103050"],
"forschungszentrum energietechnologie": ["oldenburg,","openorgs____::0000103051"],
"european radiation dosimetry group" : ["neuherberg","openorgs____::0000103052"],
"ge healthcare": ["duseldorf","openorgs____::0000103053"],
"global energy interconection research institu europe": ["berlin","openorgs____::0000103054"],
"proteros biostructures": ["martinsried,","openorgs____::0000103055"],
"frankfurter algemeine zeitung" : ["martinsried","openorgs____::0000103056"],
"stiftung neanderthal museum": ["metman","openorgs____::0000103057"],
"nvision imaging techn" : ["ulm","openorgs____::0000103058"],
"institu scien networking oldenburg" : ["oldenburg","openorgs____::0000103059"],
"zentrum internationale bildungsvergleichstudien" : ["munich","openorgs____::0000103060"],
"evangelisches klinikum bethel": ["bielefeld","openorgs____::0000103061"],
"internationale hochschule liebenzel": ["bad liebenzel,","openorgs____::0000103062"],
"walter schotky institu": ["munich","openorgs____::0000103063"],
"institu phytopathologie": ["kiel","openorgs____::0000103065"],
"helios klinikum wupertal": ["wupertal","openorgs____::0000103066"],
"univer herzentrum hamburg": ["hamburg","openorgs____::0000103067"],
"zentrum graphische datenverarbeitung" : ["darmstadt","openorgs____::0000017671"]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -0,0 +1,65 @@
{
"copernicus institu sustainable development" : ["netherlands","openorgs____::0000102848"],
"copernicus institu sustainable development energy resources" : ["netherlands", "openorgs____::0000102849"],
"copernicus institu sustainable development environmental governance" : ["netherlands", "openorgs____::0000102850"],
"copernicus institu sustainable development environmental scien" : ["netherlands", "openorgs____::0000102851"],
"copernicus institu sustainable development inovation studies": ["netherlands", "openorgs____::0000102852"],
"leiden institu advanced computer scien" : ["netherlands", "openorgs____::0000102908"],
"gorlaeus labora" : ["netherlands", "openorgs____::0000102887"],
"leiden institu chemistry gorlaeus labora" : ["netherlands", "openorgs____::0000102887"],
"institu biology leiden" : ["netherlands", "openorgs____::0000102854"],
"Leiden malaria research group": ["netherlands", "openorgs____::0000102937"],
"sylvius labora" : ["netherlands", "openorgs____::0000102938"],
"leiden mathematical institu" : ["netherlands", "openorgs____::0000102939"],
"leiden faculty archaeology" : ["netherlands", "openorgs____::0000102940"],
"erevnitiko idrima pl":["cyprus","openorgs____::0000103020"],
"frederick research center":["cyprus","openorgs____::0000103018"],
"cyens center excelence":["cyprus","openorgs____::0000103019"],
"space systems solutions ltd":["cyprus","openorgs____::0000103021"],
"eratosthenes center excelence":["cyprus","openorgs____::0000103022"],
"cyric cyprus research and innovation center ltd":["cyprus","openorgs____::0000103023"],
"danaos shiping company limited":["cyprus","openorgs____::0000103024"],
"cyprus space exploration organization":["cyprus","openorgs____::0000103025"],
"cyprus museum":["cyprus","openorgs____::0000103027"],
"larnaca general hospital":["cyprus","openorgs____::0000103028"],
"agricultural research institu":["cyprus","openorgs____::0000103029"],
"research education institu child health":["cyprus","openorgs____::0000103030"],
"german oncology center":["cyprus","openorgs____::0000103031"],
"cyprus neuroscien techno institu":["cyprus","openorgs____::0000103032"],
"salzgiter manesman forschung":["germany","openorgs____::0000103035"],
"aesculap ag" : ["germany","openorgs____::0000103036"],
"telekom inovation labora" : ["germany","openorgs____::0000103037"],
"dlr institu vernetzte energiesysteme" : ["germany","openorgs____::0000103038"],
"akademie ofentliches gesundheitswesen duseldorf" : ["germany","openorgs____::0000103039"],
"ibe rd institu lung health" : ["germany","openorgs____::0000103040"],
"herzentrum leipzig" : ["germany","openorgs____::0000103041"],
"bundesforschungsanstalt fischerei" : ["germany","openorgs____::0000098305"],
"osteuropa institu" : ["germany","openorgs____::0000103042"],
"hochschule politik Munchen" : ["germany","openorgs____::0000103043"],
"qualcom cdma techn" : ["germany","openorgs____::0000103046"],
"kompetenzentrum obstbau bodensee" : ["germany","openorgs____::0000103047"],
"institu angewandte qualitatsforderung forschung im gesundheitswesen" : ["germany","openorgs____::0000103048"],
"dresearch digital media systems": ["germany","openorgs____::0000103049"],
"eemagine medical imaging solutions": ["germany","openorgs____::0000103050"],
"forschungszentrum energietechnologie": ["germany","openorgs____::0000103051"],
"european radiation dosimetry group" : ["germany","openorgs____::0000103052"],
"ge healthcare": ["germany","openorgs____::0000103053"],
"global energy interconection research institu europe": ["germany","openorgs____::0000103054"],
"proteros biostructures": ["germany","openorgs____::0000103055"],
"frankfurter algemeine zeitung" : ["germany","openorgs____::0000103056"],
"stiftung neanderthal museum": ["germany","openorgs____::0000103057"],
"nvision imaging techn" : ["germany","openorgs____::0000103058"],
"institu scien networking oldenburg" : ["germany","openorgs____::0000103059"],
"zentrum internationale bildungsvergleichstudien" : ["germany","openorgs____::0000103060"],
"evangelisches klinikum bethel": ["germany","openorgs____::0000103061"],
"internationale hochschule liebenzel": ["germany","openorgs____::0000103062"],
"walter schotky institu": ["germany","openorgs____::0000103063"],
"institu phytopathologie": ["germany","openorgs____::0000103065"],
"helios klinikum wupertal": ["germany","openorgs____::0000103066"],
"univer herzentrum hamburg": ["germany","openorgs____::0000103067"],
"zentrum graphische datenverarbeitung" : ["germany","openorgs____::0000017671"]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -0,0 +1,62 @@
{"openorgs____::0000102848" : "netherlands",
"openorgs____::0000102849" : "netherlands",
"openorgs____::0000102850" : "netherlands",
"openorgs____::0000102851" : "netherlands",
"openorgs____::0000102852" : "netherlands",
"openorgs____::0000102908" : "netherlands",
"openorgs____::0000102887" : "netherlands",
"openorgs____::0000102854" : "netherlands",
"openorgs____::0000102880" : "netherlands",
"openorgs____::0000102937" : "netherlands",
"openorgs____::0000102938" : "netherlands",
"openorgs____::0000102939" : "netherlands",
"openorgs____::0000102940" : "netherlands",
"openorgs____::0000103018" : "cyprus",
"openorgs____::0000103019" : "cyprus",
"openorgs____::0000103020" : "cyprus",
"openorgs____::0000103021" : "cyprus",
"openorgs____::0000103022" : "cyprus",
"openorgs____::0000103023" : "cyprus",
"openorgs____::0000103024" : "cyprus",
"openorgs____::0000103025" : "cyprus",
"openorgs____::0000103027" : "cyprus",
"openorgs____::0000103028" : "cyprus",
"openorgs____::0000103029" : "cyprus",
"openorgs____::0000103030" : "cyprus",
"openorgs____::0000103031" : "cyprus",
"openorgs____::0000103032" : "cyprus",
"openorgs____::0000103035" : "germany",
"openorgs____::0000103036" : "germany",
"openorgs____::0000103037" : "germany",
"openorgs____::0000103038" : "germany",
"openorgs____::0000103039" : "germany",
"openorgs____::0000103040" : "germany",
"openorgs____::0000103041" : "germany",
"openorgs____::0000098305" : "germany",
"openorgs____::0000103042" : "germany",
"openorgs____::0000103043" : "germany",
"openorgs____::0000103046" : "germany",
"openorgs____::0000103047" : "germany",
"openorgs____::0000103048" : "germany",
"openorgs____::0000103049" : "germany",
"openorgs____::0000103050" : "germany",
"openorgs____::0000103051" : "germany",
"openorgs____::0000103052" : "germany",
"openorgs____::0000103053" : "germany",
"openorgs____::0000103054" : "germany",
"openorgs____::0000103055" : "germany",
"openorgs____::0000103056" : "germany",
"openorgs____::0000103057" : "germany",
"openorgs____::0000103058" : "germany",
"openorgs____::0000103059" : "germany",
"openorgs____::0000103060" : "germany",
"openorgs____::0000103061" : "germany",
"openorgs____::0000103062" : "germany",
"openorgs____::0000103063" : "germany",
"openorgs____::0000103065" : "germany",
"openorgs____::0000103066" : "germany",
"openorgs____::0000103067" : "germany",
"openorgs____::0000017671" : "germany"}

File diff suppressed because one or more lines are too long

@@ -0,0 +1,62 @@
{
"copernicus institu sustainable development": "unique",
"copernicus institu sustainable development energy resources": "unique",
"copernicus institu sustainable development environmental governance": "unique",
"copernicus institu sustainable development environmental scien": "unique",
"copernicus institu sustainable development inovation studies": "unique",
"leiden institu advanced computer scien": "unique",
"gorlaeus labora": "unique",
"institu biology leiden": "unique",
"leiden institu chemistry": "unique",
"leiden malaria research group": "unique",
"sylvius labora": "unique",
"leiden mathematical institu": "unique",
"leiden faculty archaeology": "unique",
"leiden institu chemistry gorlaeus labora": "unique",
"erevnitiko idrima pl": "unique",
"frederick research center": "unique",
"cyens center excelence": "unique",
"space systems solutions ltd": "unique",
"eratosthenes center excelence": "unique",
"cyric cyprus research and innovation center ltd": "unique",
"danaos shiping company limited": "unique",
"cyprus space exploration organization": "unique",
"cyprus museum": "unique",
"larnaca general hospital": "unique",
"agricultural research institu": "many",
"research education institu child health": "unique",
"german oncology center": "unique",
"cyprus neuroscien techn institu": "unique",
"salzgiter manesman forschung": "unique",
"aesculap ag": "unique",
"telekom inovation labora": "unique",
"dlr institu vernetzte energiesysteme": "unique",
"akademie ofentliches gesundheitswesen duseldorf": "unique",
"ibe rd institu lung health": "unique",
"herzentrum leipzig": "unique",
"bundesforschungsanstalt fischerei": "unique",
"osteuropa institu": "unique",
"hochschule politik Munchen": "unique",
"qualcom cdma techn": "unique",
"kompetenzentrum obstbau bodensee": "unique",
"institu angewandte qualitatsforderung forschung im gesundheitswesen": "unique",
"dresearch digital media systems": "unique",
"eemagine medical imaging solutions": "unique",
"forschungszentrum energietechnologie": "unique",
"european radiation dosimetry group": "unique",
"ge healthcare": "many",
"global energy interconection research institu europe": "unique",
"proteros biostructures": "unique",
"frankfurter algemeine zeitung": "unique",
"stiftung neanderthal museum": "unique",
"nvision imaging techn": "unique",
"institu scien networking oldenburg": "unique",
"zentrum internationale bildungsvergleichstudien": "unique",
"evangelisches klinikum bethel": "unique",
"internationale hochschule liebenzel": "unique",
"walter schotky institu": "unique",
"institu phytopathologie": "unique",
"helios klinikum wupertal": "unique",
"univer herzentrum hamburg": "unique",
"zentrum graphische datenverarbeitung": "unique"
}
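This file marks whether a normalized name resolves to a single organization ("unique") or to several ("many"); Aff_Ids in utils.py reads it as dix_mult and only falls back to city/country evidence for ambiguous names. A small sketch, assuming the JSON is loaded as dix_mult:

if dix_mult["agricultural research institu"] != "unique":
    # ambiguous name: disambiguate via dix_city_ror / dix_country_ror
    pass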

File diff suppressed because one or more lines are too long

@@ -0,0 +1 @@
{"hopital": "hospital", "hosp.": "hospital", "med. ctr." : "medical center" , "lab.":"labora","czechoslovak": "czech", "saint": "st", "aghia": "agia","kyprou":"kuprou", "technologiko": "tekhnologiko", "panepistimio":"panepistemio","universitatsfrauenklinik": "university hospital", "Universitätsaugenklinik": "university eye hospital", "universitatsklinikum": "univer hospital", "universitetshospital": "univer hospital", "universitatskinderklinik": "univer childrens hospital", "universitatskliniken": "univer hospital", "universit\u00e4tsklinik": "univer hospital", "universitatsmedizin": "univer medicine", "universitatsbibliothek": "univer library", "nat.": "national", "uniaersity":"univer", "univesity":"university", "unversity":"univer", "uni-versity": "univer", "un iversity": "univer", "uniuersity": "univer", "unviersity":"university", "uni versity": "univer", "unive rsity": "univer", "univ ersity": "univer", "ilniversity": "univer","inst ": "institu ", "adv ": "advanced ", "univ ": "univer ", "stud ": "studies ", "inst.": "institu", "sci." : "scien", "acad.": "academy" , "adv.": "advanced", "univ.": "univer", "stud.": "studies", "univcrsity" : "university", "uniuersity": "university", "unirersity": "university", "univsity": "university", "techniche": "technological", "univ col": "university colege", "univ. col.": "university colege", "col.": "colege", "medical school university": "university", "hipokration": "hipocration", "belfield, dublin": "dublin", "balsbridge, dublin": "dublin", "earlsfort terace, dublin": "dublin", "bon secours hospital, cork": "bon secours hospital cork", "bon secours hospital, dublin": "bon secours hospital dublin", "bon secours hospital, galway": "bon secours hospital galway", "bon secours hospital, tralee": "bon secours hospital tralee", "bon secours health system": "bon secours hospital dublin", "bon secours hospital, glasnevin": "bon secours hospital dublin", "imperial colege science, technology medicine": "imperial colege science technology medicine", "ucl queen square institute neurology": "ucl, london", "ucl institute neurology": "ucl, london", "royal holoway, university london": "royal holoway univer london", "city, university london": "city univer london", "city university, london": "city univer london", "aeginition": "eginition", "national technical university, athens": "national technical university athens", "leiden institu physics" : "univer leiden", "leyden":"leiden", " leiden law school": "univer leiden", "leiden umc" : "leiden univer medical center", "rijksuniver" : "univer", "leids" : "leiden", "lumc leiden" : "lumc,leiden", "medisch":"medical"}

File diff suppressed because it is too large

104
graph.py Normal file

@@ -0,0 +1,104 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
import sys
from affro_cluster import *
from schemas import *
from utils import *
spark = SparkSession.builder.appName("AffRo - Graph").getOrCreate()
# the path of the graph to be used
input_path = sys.argv[1]
# affiliation strings already matched in a previous run, with their matchings (valid only for the same version of the algorithm)
already_matched_path = sys.argv[2]
# the output file
output_path = sys.argv[3]
# whether the matching step was already performed for this iteration ('yes'/'no')
matching_done = sys.argv[4]
affro_udf = udf(matchings_affro, matching_array_schema)
applyMatch_udf = udf(toAuthorModelGraph, match_author_schema)
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
aggregateResult_udf = udf(aggregateResultGraph, author_schema)
if already_matched_path == "none":
already_matched_df = spark.createDataFrame([], affiliation_schema)
else:
already_matched_df = spark.read.schema(affiliation_schema).json(already_matched_path)
exploded = spark.read.schema(graph_entry_schema).json(input_path + "/publication") \
.union(spark.read.schema(graph_entry_schema).json(input_path + "/dataset")) \
.union(spark.read.schema(graph_entry_schema).json(input_path + "/software")) \
.union(spark.read.schema(graph_entry_schema).json(input_path + "/otherresearchproduct")) \
.select(
col("id"), # Extract desired value
explode(col("author")).alias("author") # Keep exploding author as before
) \
.filter(col("id").isNotNull()) # Remove rows where 'id' is NULL
# Explode the "author.rawAffiliationString" column into separate rows
affiliations_exploded = exploded.withColumn(
"Affiliation",
explode(col("author.rawAffiliationString"))
)
if matching_done == 'no':
affiliations_df = (
affiliations_exploded
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
.select("Affiliation") # Select only the "Affiliation" column
.distinct() # Remove duplicate rows based on "Affiliation"
)
affiliations_df.write.mode("overwrite").json("/tmp/miriam/exploded", compression="gzip")
# Select only the affiliation strings that have not already been matched
to_be_matched_df = (
affiliations_df
.join(already_matched_df, affiliations_df["Affiliation"] == already_matched_df["Affiliation"], "left")
.filter(already_matched_df["Affiliation"].isNull())
.select(affiliations_df["Affiliation"])
)
matching_df = (
to_be_matched_df # Remove duplicate rows based on "Affiliation"
.select(
col("Affiliation"),
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
)
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
)
if already_matched_path == "none":
already_matched_path = "/tmp/miriam/affroOnGraph/matching_df"
matching_df.write.mode("append").json(already_matched_path, compression="gzip")
matching_df = spark.read.schema(affiliation_schema).json(already_matched_path)
extend = (
affiliations_exploded
.join(matching_df, on="Affiliation")
.filter(col("Matchings").isNotNull())
)
extend.write.mode("overwrite").json("/tmp/miriam/affro-serializations/extend", compression="gzip")
applyMatchDf = extend \
.withColumn("application", applyMatch_udf(struct("*"))) \
.select("application.*")
applyMatchDf.write.mode("overwrite").json("/tmp/miriam/affro-serializations/applyMatchDf", compression="gzip")
groupedAuthorDf = applyMatchDf \
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
.withColumn("aggresult", aggregate_udf("group")) \
.select("aggresult.*")
groupedAuthorDf.write.mode("overwrite").json("/tmp/miriam/affro-serializations/groupedAuthorDf", compression="gzip")
groupedResultDf = groupedAuthorDf \
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
.withColumn("result", aggregateResult_udf("group")) \
.select("result.*")
groupedResultDf.write.mode("overwrite").json(output_path, compression="gzip")

@@ -1,121 +1,223 @@
from collections import defaultdict
from collections import Counter
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functions_cluster import *
from create_input_cluster import *
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']
def index_multiple_matchings(pairs):
d = {}
for p in pairs:
d[p[0][0]] = len(p)
return d
def keep_highest_url(lst):
best = {}
for item in lst:
name, score, url = item
if name not in best or url > best[name][2]: # Keep the highest URL
best[name] = item # Store the full entry
return list(best.values()) # Convert dictionary values back to list
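# Illustrative example (not in the original source): keep_highest_url keeps, per name,
# the entry whose third element compares highest, e.g.
# keep_highest_url([("x", 0.9, "a"), ("x", 0.8, "b")]) -> [("x", 0.8, "b")]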
def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
vectorizer = CountVectorizer()
similar_k = []
pairs_k = []
total_pairs = 0
# if keyword in dix_org:
# print('lucky')
# pairs_k.append((keyword,keyword,1,dix_org[keyword], dix_id_country[dix_org[keyword]]))
for x in candidates_:
# print('keyword', keyword)
if is_contained(keyword, x):
# print(0,x,total_pairs)
x_vector = vectorizer.fit_transform([x]).toarray()
keyword_vector = vectorizer.transform([keyword]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(x_vector, keyword_vector)[0][0]
if similarity > min(simU, simG):
if ('univ' in keyword and 'univ' in x) and similarity > simU:
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
elif (not 'univ'in keyword and not 'univ' in x) and similarity > simG:
# print('pass', keyword, x, similarity)
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
elif is_contained(x, keyword):
# print(0.5,x,total_pairs)
if ('univ'in keyword and 'univ' in x):
# print(1,x,total_pairs)
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
x_vector = vectorizer.transform([x]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
if similarity > simU: #max(0.82,sim):
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
elif not 'univ' in keyword and not 'univ' in x:
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
x_vector = vectorizer.transform([x]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
if similarity > simG: #max(0.82,sim):
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
# total_pairs += len(pairs_k) # Track total number of pairs
if total_pairs >= limit: # Stop if we reach the limit
return []
return pairs_k
def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU, simG):
"""
Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
---> corrects special cases in the main map that follows
Args:
light_raw
l2 candidate_num: number of candidates.
l3 pairs_list: List of pairs. (s, x, score)
l4 m: mult
Returns:
List: Resulting list containing OpenAIRE names and their similarity scores.
Finds the best match between a keyword (clean_aff) and legal names from the PID database.
"""
    vectorizer = CountVectorizer()
    univ_num = light_raw.lower().count('univ')
    result = []
    best = []
    for pair_group in pairs_list:
        best_j = []
        affil = pair_group[0][0]
        num_uni_p = affil.count('univ')
        for p in pair_group:
            organization, confidence = p[1], p[2]
            # Skip duplicates
            if [organization, confidence] in result:
                continue
            # Check similarity conditions
            if multi[p[0]] == 1:
                if 'univ' in organization.lower() and confidence > simU:
                    result.append([organization, confidence])
                elif confidence > simG:
                    result.append([organization, confidence])
            elif confidence >= 0.98:
                result.append([organization, 1])
            else:
                if "univ" not in organization:
                    continue  # Skip if 'univ' is missing
                try:
                    s_vector = vectorizer.fit_transform([light_raw]).toarray()
                    x_vector = vectorizer.transform([organization]).toarray()
                    similarity = cosine_similarity(x_vector, s_vector)[0][0]
                    if similarity > 0.1:  # use Levenshtein to better handle misspellings
                        similarity_l = 1 - Levenshtein.distance(organization, affil) / max(len(organization), len(affil))
                        best_j.append([organization, similarity, similarity_l])
                except Exception as ex:
                    print("Error:", ex)
        if best_j:
            # Keep only the best similarity per organization
            max_numbers = defaultdict(float)
            for org, sim, sim_l in best_j:
                max_numbers[org] = max(max_numbers[org], sim)
            reduced_best = [[org, sim, sim_l] for org, sim, sim_l in best_j if sim == max_numbers[org]]
            # Sort by cosine similarity, then by the Levenshtein-based score (both descending)
            reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
            result.extend(reduced_best)
        # Limit university-related matches
        univ_list = [r for r in result if 'univ' in r[0]]
        other_list = [r for r in result if 'univ' not in r[0]]
        limit = min(num_uni_p, candidate_num)
        if len(univ_list) > limit:
            result = univ_list[:limit] + other_list
        best.append(best_j)
    # Construct the final mapping, keeping the highest confidence value per organization
    pairs_dict = {p[1]: p[2] for group in pairs_list for p in group}
    result_dict = {}
    for res in result:
        org = res[0]
        similarity_score = res[1]
        if org in pairs_dict:
            best_confidence = pairs_dict[org]  # Original confidence score from pairs_list
            if org not in result_dict or similarity_score > result_dict[org][1]:
                result_dict[org] = [best_confidence, similarity_score]
    # Convert to list format, sorted by similarity score (descending)
    final_result = [[key, value[0]] for key, value in sorted(result_dict.items(), key=lambda x: x[1][1], reverse=True)]
    return final_result
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG, limit):
"""
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
@@ -130,190 +232,207 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
Returns:
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
"""
    clean_aff = input[0]
    light_aff = input[1].replace(' gmbh', ' ').strip()
    df_list = input[2]
    countries_list = input[3]
vectorizer = CountVectorizer()
lnamelist = list(dix_org.keys())
    dix = {} # will store indices and legal names of organizations of the DOI { i : [legalname1, legalname2,...]}
#pairs = []
result = {}
pairs = []
    keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
    candidates = get_candidates(countries_list)
    if len(keywords) > 1 or (len(keywords) == 1 and len(keywords[0]) > 1):
        for k, s in enumerate(keywords):
            pairs_k = []
            try:
                # Exact match against the organization dictionary
                pairs_k.append((s, s, 1, dix_org[s], dix_id_country[dix_org[s]]))
                if k not in dix:
                    dix[k] = [s]
                else:
                    dix[k].append(s)
            except:
                pairs_k = find_candidate(s, k, dix, simU, simG, candidates, limit)
            result[k] = pairs_k
            if len(pairs_k) > 0:
                pairs.append(pairs_k)
    multi = index_multiple_matchings(pairs)
    need_check_keys = []
    ready_keys = []
    ready_best = []
    for keyword in multi:
        try:
            if multi[keyword] > 1:
                need_check_keys.append(keyword)
            else:
                for p in pairs:
                    if keyword in p[0]:
                        if p[0][1] not in ready_keys:
                            ready_keys.append(p[0][1])
                            ready_best.append([p[0][1], p[0][2]])
        except:
            pass
    pairs_check = [pair for pair in pairs if pair[0][0] in need_check_keys]
    if len(need_check_keys) > 0:
        best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
        best1 = {x[0]: dix_org[x[0]] for x in best0}
        best01 = unique_subset(best0, best1)
        matched_org = list(set([x[0] for x in best01])) + ready_keys
        best = best01 + ready_best
    else:
        best = ready_best
        matched_org = ready_keys
    id_list = []
    for org_list in best:
        org = org_list[0]
        conf = org_list[1]
        if dix_mult[org] == 'unique':
            if 'institu' in org and 'univ' in org:
                # Accept only if the known city or country appears in the affiliation string
                if dix_city_ror[org][0] not in clean_aff and dix_country_ror[org][0] not in clean_aff:
                    pass
                else:
                    id_list.append([org, conf, dix_org[org]])
            else:
                id_list.append([org, conf, dix_org[org]])
        else:
            if org in dix_city_ror:
                match_found = False
                for city in dix_city_ror[org]:
                    if city[0] in clean_aff:
                        if city[0] not in org:
                            id_list.append([org, conf, city[1]])
                            match_found = True
                            break
                        else:
                            if clean_aff.count(city[0]) > 1:
                                id_list.append([org, conf, city[1]])
                                match_found = True
                                break
                if not match_found:
                    for city in dix_city_ror[org]:
                        if city[0] in clean_aff and city[0] not in org:
                            id_list.append([org, conf, city[1]])
                            match_found = True
                            break
                if not match_found:
                    match_found2 = False
                    match_found3 = False
                    tokens = set(clean_aff.lower().split())
                    text = clean_aff.lower()
                    for country in dix_country_ror[org]:
                        if country[0] == 'united states' and (
                            'united states' in text
                            or {'usa', 'usa.'} & tokens
                            or 'u.s.a.' in text
                        ):
                            id_list.append([org, conf, country[1]])
                            match_found2 = True
                            match_found3 = True
                            break
                        if country[0] == 'united kingdom' and (
                            'united kingdom' in text
                            or {'uk', 'uk.'} & tokens
                            or 'u.k.' in text
                        ):
                            id_list.append([org, conf, country[1]])
                            match_found2 = True
                            match_found3 = True
                            break
                        elif country[0].split()[0] in clean_aff:
                            if country[0] not in org:
                                id_list.append([org, conf, country[1]])
                                match_found2 = True
                                match_found3 = True
                                break
                    if not match_found3:
                        for country in dix_country_ror[org]:
                            if country[0] in clean_aff and country[0] in org:
                                id_list.append([org, conf, country[1]])
                                match_found2 = True
                                break
                    if not match_found2:
                        for sp in specific:
                            if sp in org:
                                id_list.append([org, conf, dix_org[org]])
    id_list_final = keep_highest_url(id_list)
    return id_list_final
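A hedged example of calling Aff_Ids directly; the dictionaries and thresholds below are placeholders for the ones loaded from the dictionaries/ folder:

matches = Aff_Ids(
    ("leiden institu chemistry, leiden, netherlands",   # input[0]: clean_aff
     "leiden institu chemistry, leiden, netherlands",   # input[1]: light_raw
     [{"keywords": "leiden institu chemistry"}],        # input[2]: df_list
     ["netherlands"]),                                  # input[3]: countries_list
    dix_org, dix_mult, dix_city_ror, dix_country_ror,
    simU=0.9, simG=0.8, limit=50)
# -> e.g. [["leiden institu chemistry", 1, "openorgs____::0000102880"]]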

80
oalex.py Normal file

@@ -0,0 +1,80 @@
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
from affro_cluster import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set
import sys
from schemas import *
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
# Version of affro applied to a single raw_aff_string; returns just the Matchings set
def oalex_affro(aff_string):
try:
matchings = affro(aff_string)
# Ensure matchings is a list, even if affro returns a single dict
if not isinstance(matchings, list):
matchings = [matchings]
# Create the result as a tuple that matches matchings_schema
result = []
for matching in matchings:
# Assuming 'matching' is a dictionary that contains 'Provenance', 'PID', 'Value', 'Confidence', 'Status'
result.append((
matching.get("Provenance", None),
matching.get("PID", None),
matching.get("Value", None),
float(matching.get("Confidence", None)),
matching.get("Status", None)
))
return result
except Exception as e:
print(f"Error processing affiliation string {aff_string}: {str(e)}")
return ()
oalex_affro_udf = udf(oalex_affro, matching_array_schema)
exploded = spark.read.schema(ddl_schema_aff).json(folder_path) \
.filter(col("doi").isNotNull()) \
.select(
col("doi"),
explode("authorships").alias("authors")
) \
.select(
col("doi"),
col("authors.raw_affiliation_strings").alias("raw_aff_string")
) \
.select(
col("doi").alias("DOI"),
explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
)
exploded.write.mode("overwrite").json("/tmp/miriam/affroOnOalex/exploded", compression="gzip")
affs = exploded \
.select("affiliation") \
.distinct() \
.withColumn("Matchings", oalex_affro_udf(col("affiliation")))
affs.write.mode("overwrite").json("/tmp/miriam/affroOnOalex/matchings", compression="gzip")
affs.join(exploded, on="affiliation") \
.select(col("DOI"),
explode(col("Matchings")).alias("match")
) \
.groupBy("DOI") \
.agg(
collect_set("match").alias("Matchings") #each exploded match is collected again
) \
.write \
.mode("overwrite") \
.option("compression","gzip") \
.json(hdfs_output_path)
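A hedged run sketch (placeholder paths): spark-submit oalex.py /path/to/openalex/works /path/to/output. Each output record pairs a DOI with the deduplicated set of matchings collected across its raw affiliation strings.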

BIN
out/._SUCCESS.crc Normal file

Binary file not shown.

0
out/_SUCCESS Normal file

151
pubmed.py Normal file

@@ -0,0 +1,151 @@
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
import sys
from affro_cluster import *
from schemas import *
spark = SparkSession.builder.appName("AffRo - Authors model").getOrCreate()
input_file = sys.argv[1]
author_file = sys.argv[2]
matching_array_schema = ArrayType(
StructType([
StructField("Provenance", StringType(), nullable=False),
StructField("PID", StringType(), nullable=False),
StructField("Value", StringType(), nullable=False),
StructField("Confidence", DoubleType(), nullable=False),
StructField("Status", StringType(), nullable=False)
])
)
affro_udf = udf(matchings_affro, matching_array_schema)
exploded = spark.read.json(input_file) \
.withColumn("exploded", explode(col("instance"))) \
.withColumn("altId", explode(col("exploded.alternateIdentifier"))) \
.filter("altId.qualifier.classid == 'doi'") \
.select(
col("altId.value").alias("id"), # Extract desired value
explode(col("author")).alias("author") # Keep exploding author as before
) \
.filter(col("id").isNotNull()) # Remove rows where 'id' is NULL
# Explode the "author.rawAffiliationString" column into separate rows
affiliations_exploded = exploded.withColumn(
"Affiliation",
explode(col("author.rawAffiliationString"))
)
matching_df = (
affiliations_exploded
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
.select("Affiliation") # Select only the "Affiliation" column
.distinct() # Remove duplicate rows based on "Affiliation"
.select(
col("Affiliation"),
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
)
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
)
# x: id, author, rawAffiliationString, Matchings
def toAuthorModel(x):
name = {}
name['Full'] = x['author']['fullname']
name['First'] = x['author']['name']
name['Last'] = x['author']['surname']
orcid = None
if 'pid' in x['author'] :
try:
for p in x['author']['pid']:
if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
orcid = p['value']
except:
pass
name['orcid'] = orcid
ret = {'key': x['id']+x['author']['fullname'],'id' :x['id'], 'author':name, 'Raw_affiliation' : x['Affiliation'], 'Matchings':getMatchings(x['Matchings'])}
return ret
def getMatchings(matches):
matchings = []
for m in matches:
matchings.append({"Provenance":m['Provenance'], "PID":m["PID"], "Value":m["Value"], "Confidence":m["Confidence"],"Status":m["Status"]})
return matchings
def regroupAndSelectDistinctMatch(x):
ret = []
dic = {}
for m in x:
for e in m:
if e['Status'] == 'active':
if not e['Value'] in dic:
dic[e['Value']] = 0
if dic[e['Value']] < float(e['Confidence']):
dic[e['Value']] = float(e['Confidence'])
for e in dic:
ret.append({"Provenance":"AffRo", "PID":"ROR", "Value":e,"Confidence":dic[e], "Status":"active"})
return ret
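# Worked micro-example (hypothetical values): given matchings
#   [[{"Value": "ror1", "Confidence": 0.6, "Status": "active"}],
#    [{"Value": "ror1", "Confidence": 0.9, "Status": "active"},
#     {"Value": "ror2", "Confidence": 0.5, "Status": "inactive"}]]
# the function keeps only active entries and the maximum confidence per Value, returning a
# single entry for "ror1" with Confidence 0.9 and nothing for "ror2".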
def aggregateAuthor(group):
affiliations = []
matchings = []
for e in group:
affiliations.append(e['Raw_affiliation'])
matchings.append(e['Matchings'])
return {"id":e['id'], "author" : e["author"], "Raw_affiliation" : affiliations, "Matchings":regroupAndSelectDistinctMatch(matchings)}
def aggregateResult(group):
authors = []
matchings = []
for e in group:
amatch = getMatchings(e["Matchings"])
authors.append({
"Name": {"First":e["author"]['First'], "Last":e["author"]["Last"],"Full":e["author"]["Full"],"orcid":e["author"]["orcid"]},
"Corresponding": None,
"Contributor_roles": None,
"Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
"Matchings": amatch
})
matchings.append(amatch)
ret = {"id": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
return ret
applyMatch_udf = udf(toAuthorModel, match_author_schema)
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
aggregateResult_udf = udf(aggregateResult, author_schema)
extend = (
matching_df
.join(affiliations_exploded, on="Affiliation")
.filter(col("Matchings").isNotNull())
)
applyMatchDf = extend \
.withColumn("application", applyMatch_udf(struct("*"))) \
.select("application.*")
groupedAuthorDf = applyMatchDf \
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
.withColumn("aggresult", aggregate_udf("group")) \
.select("aggresult.*")
groupedResultDf = groupedAuthorDf \
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
.withColumn("result", aggregateResult_udf("group")) \
.select("result.*") \
.withColumnRenamed("id", "DOI")
groupedResultDf.write.mode("overwrite").json(author_file, compression="gzip")

81
run1_relations.py Normal file

@@ -0,0 +1,81 @@
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size
import sys
from affro_cluster import *
spark = SparkSession.builder.appName("AffRo - Relations").getOrCreate()
input_path = sys.argv[1]
matching_file = sys.argv[2]
try:
    matching_df = spark.read.json(matching_file)
affro_dict = dict(matching_df.rdd.map(lambda row: (row[0], row[1])).collect())
except Exception:  # first run: the matching file does not exist yet
schema = StructType([
StructField("Affiliation", StringType(), True),
StructField("Matchings", StringType(), True)
])
matching_df = spark.createDataFrame([], schema)
affro_dict = {}
matchings_schema = ArrayType(
StructType([
StructField("Provenance", StringType(), nullable=False),
StructField("PID", StringType(), nullable=False),
StructField("Value", StringType(), nullable=False),
StructField("Confidence", DoubleType(), nullable=False),
StructField("Status", StringType(), nullable=False)
])
)
affro_udf = udf(matchings_affro, matchings_schema)
exploded = spark.read.json(input_path) \
.filter(col("id").isNotNull()) \
.select(
col("id").alias("ID"),
explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
)
# Explode the "author.rawAffiliationString" column into separate rows
affiliations_exploded = exploded.withColumn(
"Affiliation",
explode(col("author.rawAffiliationString"))
)
extend = (
matching_df
.join(affiliations_exploded, on="Affiliation", how="right")
.filter(col("Matchings").isNull())
)
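# The right join keeps every exploded affiliation; filtering on Matchings IS NULL then selects
# exactly the affiliation strings that have no entry in the existing matching file, i.e. the
# incremental work left for this run.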
result = (
extend
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
.select("Affiliation") # Select only the "Affiliation" column
.distinct() # Remove duplicate rows based on "Affiliation"
.select(
col("Affiliation"),
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
)
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
)
result.write \
.format("json") \
.option("compression", "gzip") \
.mode("append") \
    .save(matching_file)
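# Note: mode("append") accumulates results across executions, so the matching file grows into a
# cache of all affiliation strings ever matched; the try/except at the top tolerates the first
# run, when the file does not yet exist.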

117
run2_matchings.py Normal file

@@ -0,0 +1,117 @@
from schemas import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
import sys
# x: a Row carrying id, author, Affiliation (the raw affiliation string) and Matchings
def toAuthorModel(x):
name = {}
name['Full'] = x['author']['fullname']
name['First'] = x['author']['name']
name['Last'] = x['author']['surname']
orcid = None
if 'pid' in x['author'] :
for p in x['author']['pid']:
if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
orcid = p['value']
name['orcid'] = orcid
ret = {'key': x['id']+x['author']['fullname'],'id' :x['id'], 'author':name, 'Raw_affiliation' : x['Affiliation'], 'Matchings':getMatchings(x['Matchings'])}
return ret
def getMatchings(matches):
matchings = []
for m in matches:
matchings.append({"Provenance":m['Provenance'], "PID":m["PID"], "Value":m["Value"], "Confidence":m["Confidence"],"Status":m["Status"]})
return matchings
def regroupAndSelectDistinctMatch(x):
ret = []
dic = {}
for m in x:
for e in m:
if e['Status'] == 'active':
if not e['Value'] in dic:
dic[e['Value']] = 0
if dic[e['Value']] < float(e['Confidence']):
dic[e['Value']] = float(e['Confidence'])
for e in dic:
ret.append({"Provenance":"AffRo", "PID":"ROR", "Value":e,"Confidence":dic[e], "Status":"active"})
return ret
def aggregateAuthor(group):
affiliations = []
matchings = []
for e in group:
affiliations.append(e['Raw_affiliation'])
matchings.append(e['Matchings'])
return {"id":e['id'], "author" : e["author"], "Raw_affiliation" : affiliations, "Matchings":regroupAndSelectDistinctMatch(matchings)}
def aggregateResult(group):
authors = []
matchings = []
for e in group:
amatch = getMatchings(e["Matchings"])
authors.append({
"Name": {"First":e["author"]['First'], "Last":e["author"]["Last"],"Full":e["author"]["Full"],"orcid":e["author"]["orcid"]},
"Corresponding": None,
"Contributor_roles": None,
"Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
"Matchings": amatch
})
matchings.append(amatch)
ret = {"id": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
return ret
matching_file = sys.argv[2]
input_file = sys.argv[1]
author_file = sys.argv[3]
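# A plausible invocation, for illustration only (argument order follows the sys.argv reads above):
#   spark-submit run2_matchings.py <graph_input_dir> <matching_dir> <author_output_dir>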
applyMatch_udf = udf(toAuthorModel, match_author_schema)
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
aggregateResult_udf = udf(aggregateResult, author_schema)
spark = SparkSession.builder.appName("AffRo-Matchings").getOrCreate()
matching_df = spark.read.json(matching_file)
exploded = spark.read.schema(result_schema).json(input_file) \
.select(
col("id"),
explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
)
affiliations_exploded = exploded.withColumn(
"Affiliation",
explode(col("author.rawAffiliationString"))
)
extend = (
matching_df
.join(affiliations_exploded, on="Affiliation")
.filter(col("Matchings").isNotNull())
)
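# Unlike run1's right join, this inner join keeps only affiliations that already have matchings
# (computed by run1), so run2 never invokes the AffRo UDF itself.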
applyMatchDf = extend \
.withColumn("application", applyMatch_udf(struct("*"))) \
.select("application.*")
groupedAuthorDf = applyMatchDf \
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
.withColumn("aggresult", aggregate_udf("group")) \
.select("aggresult.*")
groupedResultDf = groupedAuthorDf \
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
.withColumn("result", aggregateResult_udf("group")) \
.select("result.*")
groupedResultDf.write.mode("overwrite").json(author_file, compression="gzip")

147
schemas.py Normal file

@@ -0,0 +1,147 @@
from pyspark.sql.types import (
StringType, ArrayType, StructType, StructField, FloatType, IntegerType, MapType, BooleanType, DoubleType
)
# Schema for the author name
author_info_schema = StructType([
StructField("Full", StringType(), True),
StructField("First", StringType(), True),
StructField("Last", StringType(), True),
StructField("orcid", StringType(), True)
])
# Schema for a single matching
matching_schema = StructType([
StructField("Provenance", StringType(), True),
StructField("PID", StringType(), True),
StructField("Value", StringType(), True),
StructField("Confidence", DoubleType(), True),
StructField("Status", StringType(), True)
])
matching_array_schema = ArrayType(matching_schema)
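# A record conforming to matching_array_schema would look like (values hypothetical):
# [{"Provenance": "AffRo", "PID": "ROR", "Value": "https://ror.org/xxxxxxxxx",
#   "Confidence": 0.85, "Status": "active"}]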
affiliation_schema = StructType([
StructField("Affiliation", StringType(), nullable=False),
StructField("Matchings", matching_array_schema, True)
])
creator_schema = StructType([
StructField("name", StringType(), True),
StructField("givenName", StringType(), True),
StructField("familyName", StringType(), True),
StructField("nameType", StringType(), True),
StructField("affiliation", ArrayType(StringType()), True),
StructField("nameIdentifiers", ArrayType(StringType()), True)
])
json_schema = StructType([
StructField("doi", StringType(), True),
StructField("attributes", StructType([
StructField("doi", StringType(), True),
StructField("identifiers", ArrayType(StringType()), True),
StructField("creators", ArrayType(creator_schema), True),
]), True)
])
# NOTE: this minimal definition is overwritten by the richer graph_author_schema declared further
# below; it is kept here as in the source, but only the later definition is effective at import time.
graph_author_schema = StructType([
    StructField("fullname", StringType(), True),
    StructField("rawAffiliationString", ArrayType(StringType()), True)])
graph_entry_schema = StructType([
StructField("id", StringType(), True),
StructField("author", ArrayType(graph_author_schema), True)])
# Schema for an author with affiliation and matchings
match_author_schema = StructType([
StructField("id", StringType(), True),
StructField("key", StringType(), True),
StructField("author", author_info_schema, True),
StructField("Raw_affiliation", StringType(), True),
StructField("Matchings", ArrayType(matching_schema), True)
])
# Schema for the aggregated author version
match_author_grouped_schema = StructType([
StructField("id", StringType(), True),
StructField("author", author_info_schema, True),
StructField("Raw_affiliation", ArrayType(StringType()), True),
StructField("Matchings", ArrayType(matching_schema), True)
])
# Schema for an organization
organization_schema = matching_array_schema
# Schema for an author with detailed information
detailed_author_schema = StructType([
StructField("Name", author_info_schema, True),
StructField("Corresponding", BooleanType(), True),
StructField("Contributor_roles", ArrayType(MapType(StringType(), StringType())), True),
StructField("Raw_affiliations", ArrayType(StringType()), True),
StructField("Matchings", ArrayType(matching_schema), True)
])
# Schema for the set of authors and organizations
author_schema = StructType([
StructField("id", StringType(), True),
StructField("Authors", ArrayType(detailed_author_schema), True),
StructField("Organizations", organization_schema, True)
])
# Schema for an identifier with qualifiers
qualifier_schema = StructType([
StructField("classid", StringType(), True),
StructField("classname", StringType(), True),
StructField("schemeid", StringType(), True),
StructField("schemename", StringType(), True)
])
pid_schema = StructType([
StructField("key", StringType(), True),
StructField("qualifier", qualifier_schema, True),
StructField("value", StringType(), True)
])
# Schema for authors in the graph
graph_author_schema = StructType([
StructField("fullname", StringType(), True),
StructField("name", StringType(), True),
StructField("surname", StringType(), True),
StructField("rank", IntegerType(), True),
StructField("pid", ArrayType(pid_schema), True),
StructField("rawAffiliationString", ArrayType(StringType()), True)
])
# Schema for the final dataset
result_schema = StructType([
StructField("id", StringType(), False),
StructField("author", ArrayType(graph_author_schema), True)
])
affiliation_exploded_schema = StructType([
StructField("id", StringType(), True),
StructField("author", creator_schema, True),
StructField("Affiliation", StringType(), True)
])
ddl_schema_aff = StructType([
StructField("id", StringType(), True),
StructField("doi", StringType(), True),
StructField("publication_year", StringType(), True),
StructField("authorships", ArrayType(
StructType([
StructField("institutions", ArrayType(
StructType([
StructField("ror", StringType(), True),
StructField("country_code", StringType(), True)
])
), True),
StructField("raw_affiliation_strings", ArrayType(StringType()), True)
])
), True)
])
affiliation_string_schema = StructType([
StructField("raw_affiliation_string", StringType(), nullable=False)
])

Binary file not shown.

File diff suppressed because one or more lines are too long

8
test/step1/part-00000 Normal file

File diff suppressed because one or more lines are too long

8
test/step2/part-00000 Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

239
txt_files/country_names.txt Normal file

@@ -0,0 +1,239 @@
trinidad
united
hong
niederland
holand
deutschland
usa
uk
korea
africa
turkey
afghanistan
aland
albania
algeria
american samoa
andora
angola
antarctica
antigua barbuda
argentina
armenia
aruba
australia
austria
azerbaijan
bahamas
bahrain
bangladesh
barbados
belarus
belgium
belize
benin
bermuda
bhutan
bolivia
bonaire sint eustatius saba
bosnia herzegovina
botswana
brazil
brunei
bulgaria
burkina faso
burundi
cabo verde
cambodia
cameroon
canada
cayman islands
central african republic
chad
chile
china
colombia
comoros
congo republic
costa rica
croatia
cuba
curacao
cyprus
czechia
denmark
djibouti
dominica
dominican republic
dr congo
ecuador
egypt
el salvador
equatorial guinea
eritrea
estonia
eswatini
ethiopia
falkland islands
faroe islands
fiji
finland
france
french guiana
french polynesia
gabon
gambia
georgia
germany
ghana
gibraltar
greece
greenland
grenada
guadeloupe
guam
guatemala
guinea
guinea bisau
guyana
haiti
honduras
hong kong
hungary
iceland
india
indonesia
iran
iraq
ireland
isle man
israel
italy
ivory coast
jamaica
japan
jersey
jordan
kazakhstan
kenya
kiribati
kosovo
kuwait
kyrgyzstan
laos
latvia
lebanon
lesotho
liberia
libya
liechtenstein
lithuania
luxembourg
macao
madagascar
malawi
malaysia
maldives
mali
malta
martinique
mauritania
mauritius
mayote
mexico
micronesia
moldova
monaco
mongolia
montenegro
montserat
moroco
mozambique
myanmar
namibia
nepal
netherlands
new caledonia
new zealand
nicaragua
niger
nigeria
niue
north korea
north macedonia
northern mariana islands
norway
oman
pakistan
palau
palestine
panama
papua new guinea
paraguay
peru
philipines
poland
portugal
puerto rico
qatar
reunion
romania
rusia
rwanda
samoa
san marino
sao tome principe
saudi arabia
senegal
serbia
seycheles
siera leone
singapore
sint maarten
slovakia
slovenia
solomon islands
somalia
south africa
south korea
south sudan
spain
sri lanka
st kits nevis
st lucia
st vincent grenadines
sudan
suriname
svalbard jan mayen
sweden
switzerland
syria
taiwan
tajikistan
tanzania
thailand
timor leste
togo
tonga
trinidad tobago
tunisia
turkiye
turkmenistan
turks caicos islands
tuvalu
uganda
ukraine
united arab emirates
united kingdom
united states
uruguay
us virgin islands
uzbekistan
vanuatu
vatican city
venezuela
vietnam
western sahara
yemen
zambia
zimbabwe


@@ -1,28 +1,36 @@
universi
colege street
research institu
laboratory
labora
gmbh
inc
universi of
ltd
research center
foundation
faculty
national institu
school medicine
universi school
univer school
graduate school
graduate school engineering
institu tropical medicine
institu virology
faculty medicine
laboratory
universi park
labora
univer park
institu science
polytechnic universi
universi 1
ciudad universi
universi campus
universi hospitals
polytechnic univer
univer 1
ciudad univer
univer campus
univer hospitals
colege
universi road
universitetska str
univer road
univer str
clinic
techn
univer
institu st
po box
rijksuniver
institu
hochschule


@@ -5,7 +5,6 @@ at
de
for
et
für
des
in
as
@@ -14,3 +13,11 @@
fur
for
und
der
aus
dem
di
l
street
post-box
e.v.


@@ -5,4 +5,7 @@ universitatskliniken
universitetshospital
universitatsmedizin
universitatsbibliothek
universitatspital
universitatspital
universitetsjukhuset
universitatsaugenklinik
univesitatsfrauenklinik


@@ -1,116 +0,0 @@
import json
import os
from pyspark.sql import SparkSession
from affro_cluster import *
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
#folder_path = 'check'
json_file_names = []
# Loop through all files in the directory
for file_name in os.listdir(folder_path):
# Check if the file is a JSON file (you can adjust the check as needed)
if file_name != '_SUCCESS':
json_file_names.append(file_name)
# json_file_names now contains the names of all JSON files in the folder
# Initialize Spark session
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
def remove_duplicates(list_of_dicts):
# Use a set to store tuples of dictionary items to filter out duplicates
seen = set()
unique_list_of_dicts = []
for d in list_of_dicts:
# Convert the dictionary to a tuple of items
items = tuple(d.items())
if items not in seen:
seen.add(items)
unique_list_of_dicts.append(d)
return unique_list_of_dicts
def update_record(record):
id = record['id']
authors = []
try:
for author in record['authors']:
author_object = {}
if 'orcid.org/0' in author['fullName']:
author_object['Name'] = {'Full':author['fullName'].split(',')[1], 'First' : None, 'Last' : None}
author_object['ORCID'] = author['fullName'].split(',')[0][:36]
else:
author_object['Name'] = {'Full':author['fullName'], 'First' : None, 'Last' : None}
author_object['ORCID'] = None
author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
all_affs_with_ror = []
have_ror = False
for affiliation in author['affiliations']:
# author_object['Raw_affiliations'] = [x for x in affiliation['raw_affiliation_string']]
if 'ORCID: 0' in affiliation['raw_affiliation_string']:
x = affiliation['raw_affiliation_string']
author_object['ORCID'] = 'https://orcid.org/'+x.split('ORCID: ')[1]
elif 'ORCID 0' in affiliation['raw_affiliation_string']:
x = affiliation['raw_affiliation_string']
author_object['ORCID'] = 'https://orcid.org/'+x.split('ORCID ')[1]
if 'ror.org' in affiliation['raw_affiliation_string']:
have_ror = True
all_affs_with_ror.append({
'Origin': 'data',
'RORid': affiliation['raw_affiliation_string'][0:25],
'Confidence': None
})
else:
if len(affro(affiliation['raw_affiliation_string']))>0:
author_object['Organization_PIDs'] = affro(affiliation['raw_affiliation_string'])
author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in author_object['Organization_PIDs']])
else:
author_object['Organization_PIDs'] = []
if have_ror == True:
author_object['Organization_PIDs'] = all_affs_with_ror
order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
reordered_data = {k: author_object[k] for k in order}
authors.append(reordered_data)
organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
updt = {'ID' : id, 'Authors' : authors, 'Organizations' : organizations}
return updt
except Exception as e:
print(f"Error processing record with id {record.get('id')}: {str(e)}")
return None
for file in json_file_names:
print('start processing '+str(file))
df = spark.read.json(folder_path + '/' + file)
# Apply the update_record function
updated_rdd = df.rdd.map(lambda row: update_record(row.asDict()))
# Convert updated RDD to JSON strings
json_rdd = updated_rdd.map(lambda record: json.dumps(record))
# Collect the data and write to an output file with a unique name
json_data = json_rdd.collect()
# Create a new filename by appending "_output.json" to the original filename (without extension)
output_file_name = file+'_output.json'
print('end processing '+str(file))
with open(output_file_name, 'w') as f:
for i, item in enumerate(json_data):
print('write '+str(i))
f.write(item + '\n')

123
utils.py Normal file

@@ -0,0 +1,123 @@
# x: a Row carrying id, author, Affiliation (the raw affiliation string) and Matchings
def toAuthorModelGraph(x):
name = {}
name['Full'] = x['author']['fullname']
orcid = None
if 'pid' in x['author'] :
try:
for p in x['author']['pid']:
if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
orcid = p['value']
        except Exception:
            pass  # pid may be null or malformed; leave orcid unset
name['orcid'] = orcid
if name['Full'] is None:
key = x['id']
else:
key = x['id'] + name['Full']
return {'key': key,'id' :x['id'], 'author':name, 'Raw_affiliation' : x['Affiliation'], 'Matchings':getMatchings(x['Matchings'])}
def toAuthorModel(x):
name = {}
name['Full'] = x['author']['name']
name['First'] = x['author']['givenName']
name['Last'] = x['author']['familyName']
if name['Full'] is None:
        if x['author']['familyName'] is not None and x['author']['givenName'] is not None:
            name['Full'] = x['author']['familyName'] + ", " + x['author']['givenName']
        elif x['author']['familyName'] is not None:
            name['Full'] = x['author']['familyName']
        else:
            name['Full'] = x['author']['givenName']
orcid = None
    if 'nameIdentifiers' in x['author']:  # check the field that is actually iterated below
        try:
            for p in x['author']['nameIdentifiers']:
                if 'nameIdentifierScheme' in p and p['nameIdentifierScheme'].lower() == 'orcid':
                    orcid = p['nameIdentifier']
        except Exception:
            pass  # nameIdentifiers may be null or malformed
name['orcid'] = orcid
if name['Full'] is None:
key = x['id']
else:
key = x['id'] + name['Full']
return {'key': key,'id' :x['id'], 'author':name, 'Raw_affiliation' : x['Affiliation'], 'Matchings':getMatchings(x['Matchings'])}
def getMatchings(matches):
matchings = []
for m in matches:
matchings.append({"Provenance":m['Provenance'], "PID":m["PID"], "Value":m["Value"], "Confidence":m["Confidence"],"Status":m["Status"]})
return matchings
def regroupAndSelectDistinctMatch(x):
ret = []
dic = {}
for m in x:
for e in m:
if e['Status'] == 'active':
if not e['Value'] in dic:
dic[e['Value']] = 0
if dic[e['Value']] < float(e['Confidence']):
dic[e['Value']] = float(e['Confidence'])
for e in dic:
ret.append({"Provenance":"AffRo", "PID":"ROR", "Value":e,"Confidence":dic[e], "Status":"active"})
return ret
def aggregateAuthor(group):
affiliations = []
matchings = []
for e in group:
affiliations.append(e['Raw_affiliation'])
matchings.append(e['Matchings'])
return {"id":e['id'], "author" : e["author"], "Raw_affiliation" : affiliations, "Matchings":regroupAndSelectDistinctMatch(matchings)}
def aggregateResultGraph(group):
authors = []
matchings = []
for e in group:
amatch = getMatchings(e["Matchings"])
authors.append({
"Name": {"First":e["author"]['First'], "Last":e["author"]["Last"],"Full":e["author"]["Full"],"orcid":e["author"]["orcid"]},
"Corresponding": None,
"Contributor_roles": None,
"Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
"Matchings": amatch
})
matchings.append(amatch)
ret = {"id": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
return ret
def aggregateResult(group, graph=False):
authors = []
matchings = []
for e in group:
amatch = getMatchings(e["Matchings"])
authors.append({
"Name": {"First":e["author"]['First'], "Last":e["author"]["Last"],"Full":e["author"]["Full"],"orcid":e["author"]["orcid"]},
"Corresponding": None,
"Contributor_roles": None,
"Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
"Matchings": amatch
})
matchings.append(amatch)
if not graph:
ret = {"DOI": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
else:
ret = {"id": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
return ret
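# The graph flag only changes the name of the identifier field in the result:
# aggregateResult(group) keys the record by "DOI", while aggregateResult(group, graph=True)
# keys it by "id", presumably so the same helper serves both the DOI-keyed and graph-keyed
# outputs; callers binding this as a Spark UDF must pick the matching schema accordingly.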
def aggregateResultNoAuthor(group):
matchings = []
for e in group:
matchings.append(getMatchings(e["Matchings"]))
ret = {"id": group[0]["id"], "Authors": [], "Organizations":regroupAndSelectDistinctMatch(matchings)}
return ret