Compare commits

..

310 Commits

Author SHA1 Message Date
Giambattista Bloisi dac3849ced initial stage 2024-08-06 12:09:57 +02:00
Giambattista Bloisi e072311240 initial stage 2024-08-06 11:37:23 +02:00
Giambattista Bloisi 2f8c04a6a2 initial stage 2024-08-06 11:21:52 +02:00
Giambattista Bloisi 72ddac35cb initial stage 2024-08-06 11:20:52 +02:00
Giambattista Bloisi a2e7c4beb6 initial stage 2024-08-06 11:19:01 +02:00
Giambattista Bloisi 118e29f462 initial stage 2024-08-01 11:10:44 +02:00
Giambattista Bloisi b23ddd3002 initial stage 2024-08-01 11:04:06 +02:00
Giambattista Bloisi 9581a86313 initial stage 2024-08-01 10:57:08 +02:00
Giambattista Bloisi f22f89b54c initial stage 2024-08-01 10:55:36 +02:00
Giambattista Bloisi 9435d23083 initial stage 2024-08-01 10:45:27 +02:00
Giambattista Bloisi fa24f85997 initial stage 2024-08-01 10:43:14 +02:00
Giambattista Bloisi 88076612b7 initial stage 2024-07-27 14:46:40 +02:00
Giambattista Bloisi 10c27e578d initial stage 2024-07-27 14:44:38 +02:00
Giambattista Bloisi 792d069234 initial stage 2024-07-27 14:43:00 +02:00
Giambattista Bloisi 47665d151e initial stage 2024-07-27 14:40:52 +02:00
Giambattista Bloisi afc31fd17c initial stage 2024-07-27 14:34:37 +02:00
Giambattista Bloisi 4f41c48e0d initial stage 2024-07-27 14:33:21 +02:00
Giambattista Bloisi 97fa9a986b initial stage 2024-07-27 14:28:42 +02:00
Giambattista Bloisi 2ede28e998 initial stage 2024-07-27 14:27:08 +02:00
Giambattista Bloisi c9365d18e1 initial stage 2024-07-27 14:23:54 +02:00
Giambattista Bloisi fcc5344cc8 initial stage 2024-07-27 14:20:18 +02:00
Giambattista Bloisi 4063a33550 initial stage 2024-07-27 14:15:45 +02:00
Giambattista Bloisi 169505c75e initial stage 2024-07-27 12:10:00 +02:00
Giambattista Bloisi 625eaaf1af initial stage 2024-07-27 12:09:14 +02:00
Giambattista Bloisi 828ddb2f82 initial stage 2024-07-27 12:04:33 +02:00
Giambattista Bloisi d15bc299c0 initial stage 2024-07-27 12:03:10 +02:00
Giambattista Bloisi c6a12baeba initial stage 2024-07-26 22:23:33 +02:00
Giambattista Bloisi 19bde3bcef initial stage 2024-07-26 22:22:36 +02:00
Giambattista Bloisi 77e4ddbe79 initial stage 2024-07-26 19:15:32 +02:00
Giambattista Bloisi 43743187ba initial stage 2024-07-26 19:09:45 +02:00
Giambattista Bloisi 48d2f69bc6 initial stage 2024-07-26 18:05:18 +02:00
Giambattista Bloisi 1ad367efcc initial stage 2024-07-26 13:21:25 +02:00
Giambattista Bloisi 418ad5e430 initial stage 2024-07-11 00:13:51 +02:00
Giambattista Bloisi bfb1ebd349 initial stage 2024-07-11 00:07:08 +02:00
Giambattista Bloisi 36e995f66c initial stage 2024-07-11 00:01:45 +02:00
Giambattista Bloisi ed8bb5bc25 initial stage 2024-07-10 23:57:31 +02:00
Giambattista Bloisi 95cd1d7573 initial stage 2024-07-10 22:07:43 +02:00
Giambattista Bloisi eccf5d396c initial stage 2024-07-10 22:06:27 +02:00
Giambattista Bloisi 227ec44a21 initial stage 2024-07-10 22:00:58 +02:00
Giambattista Bloisi 8018975863 initial stage 2024-07-03 01:21:43 +02:00
Giambattista Bloisi c7729c44e1 initial stage 2024-07-03 01:18:55 +02:00
Giambattista Bloisi c7703bb4df initial stage 2024-07-03 01:01:37 +02:00
Giambattista Bloisi 52dd788d15 initial stage 2024-07-03 00:59:53 +02:00
Giambattista Bloisi 0339a92de5 initial stage 2024-07-02 11:45:36 +02:00
Giambattista Bloisi 833ea1538a initial stage 2024-07-02 11:40:52 +02:00
Giambattista Bloisi a07bc0da2b initial stage 2024-07-01 19:50:41 +02:00
Giambattista Bloisi 0aba5ef69f initial stage 2024-07-01 18:44:11 +02:00
Giambattista Bloisi 2a54a3e325 initial stage 2024-07-01 10:36:51 +02:00
Giambattista Bloisi 0fa9e585ac initial stage 2024-07-01 08:04:27 +02:00
Giambattista Bloisi aa38362f26 initial stage 2024-06-30 19:40:21 +02:00
Giambattista Bloisi 1e3d7595ea initial stage 2024-06-30 18:24:32 +02:00
Giambattista Bloisi 6f405f0cbb initial stage 2024-06-30 18:23:46 +02:00
Giambattista Bloisi 5600a23f06 initial stage 2024-06-29 20:41:31 +02:00
Giambattista Bloisi e7f84f9df0 initial stage 2024-06-29 18:50:21 +02:00
Giambattista Bloisi 1db5cb5cbd initial stage 2024-06-29 11:24:33 +02:00
Giambattista Bloisi ece4184d8a initial stage 2024-06-29 10:58:55 +02:00
Giambattista Bloisi 69b3688ba4 initial stage 2024-06-29 10:51:03 +02:00
Giambattista Bloisi 387ddce398 initial stage 2024-06-29 10:13:01 +02:00
Giambattista Bloisi c683be854a initial stage 2024-06-27 19:45:25 +02:00
Giambattista Bloisi 7c892c5d62 initial stage 2024-06-27 18:54:23 +02:00
Giambattista Bloisi b0f8161e80 initial stage 2024-06-27 18:24:24 +02:00
Giambattista Bloisi a4c75d072b initial stage 2024-06-27 15:08:52 +02:00
Giambattista Bloisi 71289af27f initial stage 2024-06-27 14:56:11 +02:00
Giambattista Bloisi 0ed1c3f762 initial stage 2024-06-27 14:45:36 +02:00
Giambattista Bloisi ae327daa61 initial stage 2024-06-27 14:31:34 +02:00
Giambattista Bloisi c987cdea54 initial stage 2024-06-27 14:29:31 +02:00
Giambattista Bloisi 19509f7f60 initial stage 2024-06-27 14:22:50 +02:00
Giambattista Bloisi 52be021867 initial stage 2024-06-27 14:07:59 +02:00
Giambattista Bloisi df9cab15c3 initial stage 2024-06-19 09:45:15 +02:00
Giambattista Bloisi 9378856e9a initial stage 2024-06-18 21:35:08 +02:00
Giambattista Bloisi 430b0ac41a initial stage 2024-06-15 14:23:50 +02:00
Giambattista Bloisi 05592343e0 initial stage 2024-06-13 12:50:29 +02:00
Giambattista Bloisi 8307ebd406 initial stage 2024-06-13 01:11:13 +02:00
Giambattista Bloisi a50db121d1 initial stage 2024-06-13 00:49:59 +02:00
Giambattista Bloisi 2563f70bfe initial stage 2024-06-13 00:19:52 +02:00
Giambattista Bloisi 47bcc93c7d initial stage 2024-06-13 00:11:39 +02:00
Giambattista Bloisi f807ce4911 initial stage 2024-06-12 23:55:19 +02:00
Giambattista Bloisi eca293419d initial stage 2024-06-12 23:49:26 +02:00
Giambattista Bloisi beef14d721 initial stage 2024-06-12 23:41:03 +02:00
Giambattista Bloisi ee97fe9659 initial stage 2024-06-12 23:23:44 +02:00
Giambattista Bloisi 398794a080 initial stage 2024-06-12 23:21:50 +02:00
Giambattista Bloisi 64e83a0cdd initial stage 2024-06-12 23:16:06 +02:00
Giambattista Bloisi f048d7df03 initial stage 2024-06-12 23:14:19 +02:00
Giambattista Bloisi 354ae6ad41 initial stage 2024-06-12 23:04:19 +02:00
Giambattista Bloisi fe9030cdba initial stage 2024-06-12 23:02:18 +02:00
Giambattista Bloisi 3bc48791ce initial stage 2024-06-12 23:00:21 +02:00
Giambattista Bloisi f155b5e8d1 initial stage 2024-06-12 22:58:19 +02:00
Giambattista Bloisi 6d3af5e50d initial stage 2024-06-12 22:56:17 +02:00
Giambattista Bloisi 565763faac initial stage 2024-06-12 01:16:02 +02:00
Giambattista Bloisi 56b27c940d initial stage 2024-06-11 22:36:36 +02:00
Giambattista Bloisi 577e0fcb4d initial stage 2024-06-11 22:03:20 +02:00
Giambattista Bloisi 23e91ec335 initial stage 2024-06-11 21:58:07 +02:00
Giambattista Bloisi 66d09d37aa initial stage 2024-06-10 14:31:26 +02:00
Giambattista Bloisi 8e7613625e initial stage 2024-06-10 13:58:32 +02:00
Giambattista Bloisi 2e72b11447 initial stage 2024-06-10 13:43:52 +02:00
Giambattista Bloisi 26e8789d30 initial stage 2024-06-10 13:41:03 +02:00
Giambattista Bloisi 132d3a45b1 initial stage 2024-06-10 10:57:35 +02:00
Giambattista Bloisi e0e04ac22e initial stage 2024-06-10 10:55:32 +02:00
Giambattista Bloisi fe50bf1475 initial stage 2024-06-10 09:28:36 +02:00
Giambattista Bloisi d7e3e7a1b7 initial stage 2024-06-10 09:23:18 +02:00
Giambattista Bloisi 5fceeb8b61 initial stage 2024-06-10 09:06:41 +02:00
Giambattista Bloisi 5318979b01 initial stage 2024-06-10 08:57:21 +02:00
Giambattista Bloisi bc42ccb8ba initial stage 2024-06-10 00:48:45 +02:00
Giambattista Bloisi 6aab7198f7 initial stage 2024-06-10 00:46:04 +02:00
Giambattista Bloisi e3d2c52092 initial stage 2024-06-09 10:30:31 +02:00
Giambattista Bloisi d1c08458bb initial stage 2024-06-07 19:31:47 +02:00
Giambattista Bloisi a4d8a48c87 initial stage 2024-06-07 19:17:13 +02:00
Giambattista Bloisi 7e12b9e3dc initial stage 2024-06-07 18:40:59 +02:00
Giambattista Bloisi 7a08db26cd initial stage 2024-06-07 18:40:21 +02:00
Giambattista Bloisi c1833f6d75 initial stage 2024-06-07 18:36:19 +02:00
Giambattista Bloisi b3b0472400 initial stage 2024-06-07 18:24:13 +02:00
Giambattista Bloisi 4068e9d702 initial stage 2024-06-07 18:22:18 +02:00
Giambattista Bloisi 4bb806d008 initial stage 2024-06-07 18:12:47 +02:00
Giambattista Bloisi b3a9ad8342 initial stage 2024-06-07 18:10:48 +02:00
Giambattista Bloisi 541581c8b2 initial stage 2024-06-07 09:10:57 +02:00
Giambattista Bloisi 21f89da1ed initial stage 2024-06-07 00:19:37 +02:00
Giambattista Bloisi 4c7faec554 initial stage 2024-06-06 22:10:03 +02:00
Giambattista Bloisi 6754f7bbec initial stage 2024-06-06 22:08:12 +02:00
Giambattista Bloisi 336026b6d8 initial stage 2024-06-06 19:52:42 +02:00
Giambattista Bloisi f77274ce4f initial stage 2024-06-06 19:51:24 +02:00
Giambattista Bloisi 151d305417 initial stage 2024-06-06 19:50:06 +02:00
Giambattista Bloisi 94b4add8cd initial stage 2024-06-06 19:48:13 +02:00
Giambattista Bloisi 1bc94cd835 initial stage 2024-06-03 22:03:06 +02:00
Giambattista Bloisi d9e7528927 initial stage 2024-06-03 15:29:06 +02:00
Giambattista Bloisi 09b603925d initial stage 2024-04-18 13:13:15 +02:00
Giambattista Bloisi f89898e99b initial stage 2024-04-18 12:24:59 +02:00
Giambattista Bloisi 26b0d7219d initial stage 2024-04-11 16:48:28 +02:00
Giambattista Bloisi 5486d48817 initial stage 2024-04-11 16:43:56 +02:00
Giambattista Bloisi 4f4c236b19 initial stage 2024-04-11 16:41:32 +02:00
Giambattista Bloisi e293990c27 initial stage 2024-04-09 12:16:32 +02:00
Giambattista Bloisi bf6a9e3d61 initial stage 2024-04-09 11:08:10 +02:00
Giambattista Bloisi 735f08aee8 initial stage 2024-04-09 11:04:06 +02:00
Giambattista Bloisi b2329a7b63 initial stage 2024-04-09 09:40:44 +02:00
Giambattista Bloisi 28d2e96842 initial stage 2024-04-08 14:25:11 +02:00
Giambattista Bloisi ba37ed66eb initial stage 2024-04-08 14:22:56 +02:00
Giambattista Bloisi 51b695c1b7 initial stage 2024-04-08 14:15:02 +02:00
Giambattista Bloisi b89d7f2646 initial stage 2024-04-08 14:11:50 +02:00
Giambattista Bloisi 684230b314 initial stage 2024-04-06 14:55:37 +02:00
Giambattista Bloisi c798eb0aff initial stage 2024-04-06 11:05:37 +02:00
Giambattista Bloisi 8461dc62cc initial stage 2024-04-05 19:09:36 +02:00
Giambattista Bloisi 3aab558117 initial stage 2024-04-05 18:04:42 +02:00
Giambattista Bloisi 2fe306fdae initial stage 2024-04-05 17:59:20 +02:00
Giambattista Bloisi 3b27f4ea1c initial stage 2024-04-05 17:55:30 +02:00
Giambattista Bloisi 801516be67 initial stage 2024-04-05 17:50:45 +02:00
Giambattista Bloisi 2eb2a94da5 initial stage 2024-04-05 17:41:22 +02:00
Giambattista Bloisi 32e76e9f2d initial stage 2024-03-27 23:01:07 +01:00
Giambattista Bloisi 5fd2558a3a initial stage 2024-03-27 22:58:50 +01:00
Giambattista Bloisi 7c919f5278 initial stage 2024-03-27 22:48:13 +01:00
Giambattista Bloisi 33cb4ce636 initial stage 2024-03-27 13:00:23 +01:00
Giambattista Bloisi f6fbce36e1 initial stage 2024-03-27 12:57:02 +01:00
Giambattista Bloisi 6aa4108b2d initial stage 2024-03-27 12:54:10 +01:00
Giambattista Bloisi e684e4cae5 initial stage 2024-03-27 12:49:33 +01:00
Giambattista Bloisi 6c76a3e0b8 initial stage 2024-03-27 12:47:34 +01:00
Giambattista Bloisi ab5c8a4b7f initial stage 2024-03-27 12:37:08 +01:00
Giambattista Bloisi 08ed592711 initial stage 2024-03-27 12:33:00 +01:00
Giambattista Bloisi 43eb5cb43d initial stage 2024-03-27 12:19:01 +01:00
Giambattista Bloisi 1a91dcf3d6 initial stage 2024-03-27 00:15:26 +01:00
Giambattista Bloisi f04459666a initial stage 2024-03-26 22:25:02 +01:00
Giambattista Bloisi fc5f884f4d initial stage 2024-03-26 14:20:45 +01:00
Giambattista Bloisi 75221b489d initial stage 2024-03-26 12:31:04 +01:00
Giambattista Bloisi 185ca78f71 initial stage 2024-03-26 11:12:43 +01:00
Giambattista Bloisi 26c2e3eaad initial stage 2024-03-26 11:03:05 +01:00
Giambattista Bloisi 7e41f71d32 initial stage 2024-03-26 10:54:46 +01:00
Giambattista Bloisi 10c29f86c2 initial stage 2024-03-26 10:52:07 +01:00
Giambattista Bloisi 4398546095 initial stage 2024-03-25 22:09:41 +01:00
Giambattista Bloisi c9f23d2796 initial stage 2024-03-25 22:08:17 +01:00
Giambattista Bloisi 8594587ee5 initial stage 2024-03-25 21:33:32 +01:00
Giambattista Bloisi b86cf359f5 initial stage 2024-03-25 21:19:50 +01:00
Giambattista Bloisi 00514edfbd initial stage 2024-03-25 18:22:10 +01:00
Giambattista Bloisi f79eb140eb initial stage 2024-03-25 17:54:23 +01:00
Giambattista Bloisi 4e1955b673 initial stage 2024-03-25 17:52:56 +01:00
Giambattista Bloisi c07ddc03d9 initial stage 2024-03-25 16:05:34 +01:00
Giambattista Bloisi 0c27895e13 initial stage 2024-03-25 15:54:49 +01:00
Giambattista Bloisi 349db6f602 initial stage 2024-03-25 15:45:43 +01:00
Giambattista Bloisi 072fb76a26 initial stage 2024-03-24 19:12:26 +01:00
Giambattista Bloisi 172703df7c initial stage 2024-03-24 19:11:29 +01:00
Giambattista Bloisi f1e619c7fb initial stage 2024-03-24 19:11:16 +01:00
Giambattista Bloisi 6b2ef00c25 initial stage 2024-03-24 19:07:12 +01:00
Giambattista Bloisi 921ce0bf48 initial stage 2024-03-24 19:04:57 +01:00
Giambattista Bloisi 99ef9b3980 initial stage 2024-03-24 19:01:00 +01:00
Giambattista Bloisi 8bea0251f1 initial stage 2024-03-24 18:56:43 +01:00
Giambattista Bloisi d97972b85e initial stage 2024-03-22 14:09:36 +01:00
Giambattista Bloisi 2f5430d9c8 initial stage 2024-03-22 14:06:07 +01:00
Giambattista Bloisi 0738f8bebc initial stage 2024-03-22 00:39:43 +01:00
Giambattista Bloisi 83b86b50ab simple test DAG 2024-03-21 10:49:44 +01:00
Giambattista Bloisi d660233e8e simple test DAG 2024-03-21 10:45:58 +01:00
Giambattista Bloisi 10fedb06f1 simple test DAG 2024-03-20 17:33:12 +01:00
Giambattista Bloisi a7e485a8c6 simple test DAG 2024-03-20 17:06:39 +01:00
Giambattista Bloisi 587c43872b simple test DAG 2024-03-20 17:04:25 +01:00
Giambattista Bloisi 0ca0da3cc9 simple test DAG 2024-03-20 17:02:14 +01:00
Giambattista Bloisi dead48e9b2 simple test DAG 2024-03-20 15:57:36 +01:00
Giambattista Bloisi 620c6fadea simple test DAG 2024-03-20 15:31:32 +01:00
Giambattista Bloisi b71bcfabf8 simple test DAG 2024-03-19 15:46:25 +01:00
Giambattista Bloisi 65daefb971 simple test DAG 2024-03-19 09:58:52 +01:00
Giambattista Bloisi 1152e14920 simple test DAG 2024-03-18 13:48:13 +01:00
Giambattista Bloisi 65cba81f20 simple test DAG 2024-03-18 13:16:39 +01:00
Giambattista Bloisi 5502f449a5 simple test DAG 2024-03-18 12:15:28 +01:00
Giambattista Bloisi cbdb6f3640 simple test DAG 2024-03-18 12:13:36 +01:00
Giambattista Bloisi 68a16e6c5a simple test DAG 2024-03-18 11:02:07 +01:00
Giambattista Bloisi ef67d70961 simple test DAG 2024-03-18 10:15:29 +01:00
Giambattista Bloisi bf939c0254 simple test DAG 2024-03-18 10:14:59 +01:00
Giambattista Bloisi fa3214dc2c simple test DAG 2024-03-18 10:11:51 +01:00
Giambattista Bloisi cb4f9c838a simple test DAG 2024-03-18 01:11:10 +01:00
Giambattista Bloisi 47505e885f simple test DAG 2024-03-18 01:07:23 +01:00
Giambattista Bloisi 78e2aaf404 simple test DAG 2024-03-18 01:00:54 +01:00
Giambattista Bloisi f4fa06a634 simple test DAG 2024-03-18 00:54:50 +01:00
Giambattista Bloisi ec8e00d7a4 simple test DAG 2024-03-18 00:37:22 +01:00
Giambattista Bloisi b8aa473fff simple test DAG 2024-03-17 21:33:42 +01:00
Giambattista Bloisi fd53c5af5b simple test DAG 2024-03-17 21:32:40 +01:00
Giambattista Bloisi fd25f9bf59 simple test DAG 2024-03-17 19:56:26 +01:00
Giambattista Bloisi 0c272f7ff2 simple test DAG 2024-03-17 18:27:42 +01:00
Giambattista Bloisi a7a6f8e95f simple test DAG 2024-03-17 18:10:49 +01:00
Giambattista Bloisi df6cd00621 simple test DAG 2024-03-17 18:06:08 +01:00
Giambattista Bloisi c221f80d1b simple test DAG 2024-03-17 15:51:07 +01:00
Giambattista Bloisi d9170a0d1a simple test DAG 2024-03-17 15:50:05 +01:00
Giambattista Bloisi 3406662572 simple test DAG 2024-03-17 15:49:09 +01:00
Giambattista Bloisi 5cc3b050ce simple test DAG 2024-03-15 16:34:17 +01:00
Giambattista Bloisi c0bfa81d97 simple test DAG 2024-03-15 15:56:22 +01:00
Giambattista Bloisi 8262871be8 simple test DAG 2024-03-15 14:14:57 +01:00
Giambattista Bloisi ab172a39ff simple test DAG 2024-03-15 13:12:17 +01:00
Giambattista Bloisi 4c7d80a0a8 simple test DAG 2024-03-15 12:57:53 +01:00
Giambattista Bloisi f1cec0cfeb simple test DAG 2024-03-15 12:44:19 +01:00
Giambattista Bloisi 636a4e38e9 simple test DAG 2024-03-15 12:31:14 +01:00
Giambattista Bloisi 55f3a06e1d simple test DAG 2024-03-15 12:28:10 +01:00
Giambattista Bloisi 599625c472 simple test DAG 2024-03-15 12:26:44 +01:00
Giambattista Bloisi c87b207ef2 simple test DAG 2024-03-15 12:25:15 +01:00
Giambattista Bloisi 95cc6095c3 simple test DAG 2024-03-15 12:21:33 +01:00
Giambattista Bloisi f01ba4efb2 simple test DAG 2024-03-15 12:17:13 +01:00
Giambattista Bloisi 679797cfe5 simple test DAG 2024-03-14 22:33:58 +01:00
Giambattista Bloisi 602fedc6cb simple test DAG 2024-03-14 22:27:51 +01:00
Giambattista Bloisi c513072be9 simple test DAG 2024-03-14 22:26:06 +01:00
Giambattista Bloisi 5a5aaccbeb simple test DAG 2024-03-14 21:46:33 +01:00
Giambattista Bloisi 7959c1bc08 simple test DAG 2024-03-14 21:14:23 +01:00
Giambattista Bloisi 0d4ef9cb1f simple test DAG 2024-03-14 21:13:39 +01:00
Giambattista Bloisi d2bbaaece3 simple test DAG 2024-03-14 21:12:48 +01:00
Giambattista Bloisi dd6a192da2 simple test DAG 2024-03-14 21:04:35 +01:00
Giambattista Bloisi d19198c2ba simple test DAG 2024-03-12 15:59:46 +01:00
Giambattista Bloisi 815ce27e34 simple test DAG 2024-03-12 15:59:20 +01:00
Giambattista Bloisi f5ef2d3754 simple test DAG 2024-03-12 15:58:23 +01:00
Giambattista Bloisi e2a5f3e90e simple test DAG 2024-03-12 15:57:14 +01:00
Giambattista Bloisi 2cb40d2276 simple test DAG 2024-03-12 12:56:24 +01:00
Giambattista Bloisi 32992c79e8 simple test DAG 2024-03-11 19:16:27 +01:00
Giambattista Bloisi 8ee696c145 simple test DAG 2024-03-11 19:04:39 +01:00
Giambattista Bloisi b4f8ba1bd0 simple test DAG 2024-03-11 18:44:45 +01:00
Giambattista Bloisi dd07466aae simple test DAG 2024-03-10 14:18:46 +01:00
Giambattista Bloisi 5f07513b35 simple test DAG 2024-03-10 14:07:59 +01:00
Giambattista Bloisi d1a944b8f5 simple test DAG 2024-03-10 13:42:37 +01:00
Giambattista Bloisi f8f0141d50 simple test DAG 2024-03-10 13:29:51 +01:00
Giambattista Bloisi 5a181be26a simple test DAG 2024-03-10 13:23:35 +01:00
Giambattista Bloisi edc6976a47 simple test DAG 2024-03-10 13:23:08 +01:00
Giambattista Bloisi 4c2062e3b9 simple test DAG 2024-03-10 13:18:02 +01:00
Giambattista Bloisi ddbf71cca4 simple test DAG 2024-03-10 13:14:27 +01:00
Giambattista Bloisi e81e28f5f9 simple test DAG 2024-03-10 12:58:45 +01:00
Giambattista Bloisi 7cfae9f1bc simple test DAG 2024-03-10 12:53:05 +01:00
Giambattista Bloisi 546cc75763 simple test DAG 2024-03-09 23:59:12 +01:00
Giambattista Bloisi c8ffe36fbc simple test DAG 2024-03-09 23:48:10 +01:00
Giambattista Bloisi 222b5e66c6 simple test DAG 2024-03-09 23:34:51 +01:00
Giambattista Bloisi 07f8645a60 simple test DAG 2024-03-09 23:00:36 +01:00
Giambattista Bloisi fcbc01fed4 simple test DAG 2024-03-09 22:54:31 +01:00
Giambattista Bloisi b19e4f8ae8 simple test DAG 2024-03-09 19:42:58 +01:00
Giambattista Bloisi 8840091813 simple test DAG 2024-03-09 19:41:04 +01:00
Giambattista Bloisi 38bbf4f449 simple test DAG 2024-03-09 19:36:02 +01:00
Giambattista Bloisi 30181573cf simple test DAG 2024-03-09 19:26:37 +01:00
Giambattista Bloisi 908644d005 simple test DAG 2024-03-09 19:25:51 +01:00
Giambattista Bloisi a7b1d25fdb simple test DAG 2024-03-09 19:23:45 +01:00
Giambattista Bloisi 027996069c simple test DAG 2024-03-09 18:47:08 +01:00
Giambattista Bloisi ba99672349 simple test DAG 2024-03-09 18:15:21 +01:00
Giambattista Bloisi 0a62276c42 simple test DAG 2024-03-09 18:09:15 +01:00
Giambattista Bloisi ec02290442 simple test DAG 2024-03-08 17:42:28 +01:00
Giambattista Bloisi d505df8d36 simple test DAG 2024-03-08 17:26:29 +01:00
Giambattista Bloisi 031b11a3db simple test DAG 2024-03-08 17:20:37 +01:00
Giambattista Bloisi c259c529bc simple test DAG 2024-03-08 16:51:39 +01:00
Giambattista Bloisi deb6567a73 simple test DAG 2024-03-08 16:42:51 +01:00
Giambattista Bloisi 6e8f2c3664 simple test DAG 2024-03-08 16:22:00 +01:00
Giambattista Bloisi d281fb043a simple test DAG 2024-03-08 16:19:35 +01:00
Giambattista Bloisi 3342e20571 simple test DAG 2024-03-08 16:15:07 +01:00
Giambattista Bloisi a7c82b0d61 simple test DAG 2024-03-08 16:11:31 +01:00
Giambattista Bloisi 5a30741e29 simple test DAG 2024-03-08 16:10:14 +01:00
Giambattista Bloisi 4128d1c863 simple test DAG 2024-03-08 16:06:05 +01:00
Giambattista Bloisi 7edb0c5a7e simple test DAG 2024-03-08 16:01:11 +01:00
Giambattista Bloisi 1ad289e948 simple test DAG 2024-03-08 15:35:45 +01:00
Giambattista Bloisi 9682e09eb4 simple test DAG 2024-03-08 15:34:01 +01:00
Giambattista Bloisi 31b05ff2fb simple test DAG 2024-03-07 11:21:59 +01:00
Giambattista Bloisi d4f33496aa simple test DAG 2024-03-07 11:01:55 +01:00
Giambattista Bloisi 7d2da06118 simple test DAG 2024-03-07 10:50:49 +01:00
Giambattista Bloisi 7fcc6a9bd0 simple test DAG 2024-03-07 09:21:30 +01:00
Giambattista Bloisi 550da2c190 simple test DAG 2024-03-06 23:54:10 +01:00
Giambattista Bloisi e99002329e simple test DAG 2024-03-06 23:51:53 +01:00
Giambattista Bloisi 3e6c175901 simple test DAG 2024-03-06 23:46:50 +01:00
Giambattista Bloisi bc50df0413 simple test DAG 2024-03-06 23:43:31 +01:00
Giambattista Bloisi 2c81ded53c simple test DAG 2024-03-06 23:42:25 +01:00
Giambattista Bloisi d6bfc955a3 simple test DAG 2024-03-06 23:38:08 +01:00
Giambattista Bloisi 379920e21b simple test DAG 2024-03-06 23:31:24 +01:00
Giambattista Bloisi 5d073deaa7 simple test DAG 2024-03-06 23:29:08 +01:00
Giambattista Bloisi 2937d77cba simple test DAG 2024-03-06 17:58:28 +01:00
Giambattista Bloisi 91739b26b8 simple test DAG 2024-03-06 17:52:09 +01:00
Giambattista Bloisi 52179da636 simple test DAG 2024-03-06 17:49:38 +01:00
Giambattista Bloisi f0169ca158 simple test DAG 2024-03-06 17:47:14 +01:00
Giambattista Bloisi c3761a161e simple test DAG 2024-03-06 17:33:17 +01:00
Giambattista Bloisi c80a5e6eb8 simple test DAG 2024-03-06 17:29:24 +01:00
Giambattista Bloisi 76981a01ba simple test DAG 2024-03-06 15:17:38 +01:00
Giambattista Bloisi e343e95a9b simple test DAG 2024-03-06 15:15:22 +01:00
Giambattista Bloisi 080d30cc33 simple test DAG 2024-03-05 16:30:37 +01:00
Giambattista Bloisi 991930f934 simple test DAG 2024-03-05 16:13:18 +01:00
Giambattista Bloisi 4a6f8568eb simple test DAG 2024-03-05 16:11:00 +01:00
Giambattista Bloisi 7b0bc4e5b4 simple test DAG 2024-03-05 15:51:38 +01:00
Giambattista Bloisi 6998573b79 simple test DAG 2024-03-04 19:07:10 +01:00
Giambattista Bloisi 4e6f4ee2fb simple test DAG 2024-03-04 19:04:22 +01:00
Giambattista Bloisi de9796a376 simple test DAG 2024-03-04 19:02:28 +01:00
Giambattista Bloisi cf1e7914ca simple test DAG 2024-03-04 17:45:33 +01:00
62 changed files with 62027 additions and 2895 deletions

View File

@ -1,75 +0,0 @@
https://kluctl.io/
kubectl
AIRFLOW
AIRBYTE
https://medium.com/apache-airflow/what-we-learned-after-running-airflow-on-kubernetes-for-2-years-0537b157acfd
https://github.com/opensearch-project/helm-charts/blob/main/README.md#installing
SETUP
kind create cluster --config clusters/local/kind-cluster-config.yaml
./clusters/local/kind-with-registry.sh kind-openaire-data-platform
kubectl apply --context kind-openaire-data-platform -f ./clusters/local/nginx-kind-deploy.yaml
Using The Registry
The registry can be used like this.
First we'll pull an image: docker pull gcr.io/google-samples/hello-app:1.0
Then we'll tag the image for the local registry: docker tag gcr.io/google-samples/hello-app:1.0 localhost:5001/hello-app:1.0
Then we'll push it to the registry: docker push localhost:5001/hello-app:1.0
And now we can use the image: kubectl create deployment hello-server --image=localhost:5001/hello-app:1.0
If you build your own image, tag it as localhost:5001/image:foo and reference it in Kubernetes with that same name; from inside an application running in the cluster, reach the registry as kind-registry:5000.
CLEANUP
kind delete cluster --name openaire-data-platform
OPENSEARCH OPERATOR
HELM
kubectl config use-context openaire-data-platform
helm repo add opensearch https://opensearch-project.github.io/helm-charts/
helm repo update
helm upgrade --install oa-opensearch opensearch/opensearch --version 2.17.2 -f envs/local/common.yaml -f envs/local/opensearch.yaml
opensearch/opensearch 2.17.2 2.11.1 A Helm chart for OpenSearch
opensearch/opensearch-dashboards 2.15.1 2.11.1 A Helm chart for OpenSearch Dashboards
helm repo add jetstack https://charts.jetstack.io
helm repo update
helm install \
cert-manager jetstack/cert-manager \
--namespace cert-manager \
--create-namespace \
--version v1.14.4 \
--set installCRDs=true \
--set global.leaderElection.namespace=cert-manager
➜ dataplatform kubectl apply -f envs/gcp/letsencrypt-prod.yaml
clusterissuer.cert-manager.io/letsencrypt-prod created
➜ dataplatform kubectl apply -f envs/gcp/duckdnsdomain.yaml
certificate.cert-manager.io/openaire-duckdns created
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
kubectl apply -f envs/gcp/local-path-storage.yaml
helm repo add opensearch-operator https://opensearch-project.github.io/opensearch-k8s-operator/
helm install opensearch-operator opensearch-operator/opensearch-operator --version 2.5.1 -f envs/gcp/opensearch-operator.yaml
kubectl port-forward svc/opensearch-cluster-dashboards 5601 -n oa-opensearch
helm repo add airbyte https://airbytehq.github.io/helm-charts/airbyte
helm upgrade --install oa-airbyte airbyte/airbyte --namespace oa-airbyte --create-namespace --version 0.53.196 -f envs/local/common.yaml -f envs/local/airbyte.yaml

View File

@ -0,0 +1,140 @@
from typing import Dict, Any, List
def map_access_right(ar: str) -> str:
match ar:
case 'open':
return 'Open Access'
case 'closed':
return 'Closed'
case 'embargo':
return 'Embargo'
case 'restricted':
return 'Restricted'
case _:
return ''
def trasform_graph_entity(p: dict) -> dict:
p['_id'] = p['local_identifier']
return p
def trasform_catalog_entity(p: dict) -> dict:
p['_id'] = p['id']
return p
def map_fos_topic_to_domain(fos: str):
if fos.startswith('01'):
return 'Natural Sciences'
elif fos.startswith('02'):
return 'Engineering & Technology'
elif fos.startswith('03'):
return 'Medical & Health Sciences'
elif fos.startswith('04'):
return 'Agricultural Sciences'
elif fos.startswith('05'):
return 'Social Sciences'
elif fos.startswith('06'):
return 'Humanities'
return None
def trasform_interoperability(p: dict) -> dict:
p = trasform_catalog_entity(p)
if 'domain' in p:
p['domain'] = {"domain": p['domain']}
p['licenseDetails'] = p['license']
p['license'] = p['license']['identifier'] if 'identifier' in p['license'] else ''
return p
def trasform_product(p: dict) -> dict:
p = trasform_graph_entity(p)
p['accessRights'] = list(set(
filter(lambda ar: ar != '', map(lambda m: map_access_right(m.get('access_right')), p.get('manifestations')))))
p['keyword'] = list(set(
map(lambda topic: topic.get('topic').get('value'),
filter(lambda topic: topic.get('topic').get('scheme') == 'keyword', p.get('topics')))))
p['domain'] = list(
map(lambda fos: {"domain": fos},
set(filter(lambda fos: fos is not None,
map(lambda topic: map_fos_topic_to_domain(topic.get('topic').get('value')),
filter(lambda topic: topic.get('topic').get('scheme') == 'FOS', p.get('topics')))))))
p['firstPublishDate'] = next(
iter(
sorted(
map(lambda date: date.get('value'),
filter(lambda date: date.get('type') == 'publishing',
[date for m in (p.get('manifestations') or []) for date in (m.get('dates') or [])])))),
None)
return p
transform_entities = {
# SKG-IF graph entities
"datasource": trasform_graph_entity,
"grants": trasform_graph_entity,
"organizations": trasform_graph_entity,
"persons": trasform_graph_entity,
"products": trasform_product,
"topics": trasform_graph_entity,
"venues": trasform_graph_entity,
# EOSC catalog entities
"interoperability": trasform_interoperability,
"services": trasform_catalog_entity,
"training": trasform_catalog_entity,
}
def isEmpty(current_value: Dict[str, Any], labels: List[str]) -> bool:
if len(labels) <= 0:
return True
for label in labels:
if isinstance(current_value, list) and len(current_value) > 0:
current_value = current_value[0]
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return True
if current_value is None:
return True
if isinstance(current_value, list):
if len(current_value) > 0:
return current_value[0] == ""
else:
return True
return str(current_value) == ""
#
# Filter products that do not meet inclusion policy
#
def filter_product(p: dict) -> bool:
if isEmpty(p, ["titles", "none"]):
return True
if isEmpty(p, ["firstPublishDate"]):
return True
if p['product_type'] == "literature":
if isEmpty(p, ["abstracts", "none"]):
return True
if isEmpty(p, ["contributions", "person", "local_identifier"]):
return True
elif p['product_type'] in ["research data", "other"]:
if isEmpty(p, ["contributions", "person", "local_identifier"]):
return True
return False
filter_entities = {
"products": filter_product
}
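A minimal usage sketch of the two dispatch tables above, assuming entity records arrive as plain dicts keyed by the same entity names (process_records and the sample product below are hypothetical, not part of the repository code):

def process_records(entity: str, records: list) -> list:
    # Per-entity transform; identity if none is registered.
    transform = transform_entities.get(entity, lambda r: r)
    # Per-entity filter; returns True for records to exclude.
    exclude = filter_entities.get(entity, lambda r: False)
    return [transform(r) for r in records if not exclude(r)]

# A product without a title or firstPublishDate fails the inclusion policy and is dropped.
kept = process_records("products", [{"local_identifier": "p1", "titles": {"none": []},
                                     "manifestations": [], "topics": []}])
assert kept == []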

1107
airflow/dags/EOSC_indexes.py Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,53 @@
from __future__ import annotations
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from opensearchpy import OpenSearch
import init_ams_topics
import init_opensearch_templates
@dag(
dag_id="mkg_prepare_environment",
#dag_display_name="Prepare MKG Environment",
schedule=None,
dagrun_timeout=None,
start_date=None,
catchup=False,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"ARGO_CONN_ID": "ams_default",
"RESET_AMS": False
},
tags=["MKG", "opensearch", "argo"]
)
def prepare_environment():
@task
def prepare_opensearch(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180)
init_opensearch_templates.init_opensearch(client)
@task
def prepare_ams(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["ARGO_CONN_ID"])
extra = conn.extra_dejson
init_ams_topics.init_ams(conn.host, extra['project'], extra['token'], kwargs["params"]["RESET_AMS"])
chain(
prepare_opensearch.override(task_id="prepare_opensearch")(),
# prepare_ams.override(task_id="prepare_ams")(),
)
prepare_environment()

View File

@ -0,0 +1,112 @@
import os
from datetime import timedelta
import time
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
dag_id="open_data_portal_harvest",
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
dagrun_timeout=None,
catchup=False,
default_args=default_args,
params={
"S3_CONN_ID": "s3_conn",
"OPENSEARCH_CONN_ID": "opensearch_default",
"OS_INDEX_NAME": "euodp_raw"
},
tags=["aggregation"]
)
def harvest():
@task
def everything(**context):
index_name = context["params"]["OS_INDEX_NAME"]
conn = BaseHook.get_connection(context["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20
)
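# Create the raw index on first run: 3 shards, no replicas, zstd_no_dict compression, segment replication,
# and dynamic mapping disabled (records land in _source but new fields are not indexed).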
if not client.indices.exists(index_name):
client.indices.create(index_name, {
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 0,
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
},
},
"mappings": {
"dynamic": False
}
})
def store_results(hits):
def _generate_data():
for r in hits:
r['_index'] = index_name
r['_id'] = r['id']
yield r
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
raise_on_exception=False,
raise_on_error=False,
chunk_size=5000,
max_chunk_bytes=50 * 1024 * 1024,
timeout=180):
if success:
succeeded = succeeded + 1
else:
print(item["index"]["error"])
failed = failed + 1
headers = {'Accept': 'application/json'}
r = requests.get('https://data.europa.eu/api/hub/search/search?filter=dataset&aggregation=false&limit=300&showScore=true&scroll=true', headers=headers).json()
scroll_id = r['result']['scrollId']
results = r['result']['results']
store_results(results)
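# Page through the remaining results via the scroll endpoint; transient request failures are retried
# with a short pause, giving up after max_retries consecutive errors.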
max_retries = 10
while scroll_id:
try:
r = requests.get('https://data.europa.eu/api/hub/search/scroll?scrollId=' + scroll_id, headers=headers)
r.raise_for_status()
except Exception as e:
print(f"Error:" + str(e))
time.sleep(0.1)
max_retries = max_retries - 1
if max_retries == 0:
raise Exception("Cannot fetch data")
continue
max_retries = 10
r = r.json()
scroll_id = r['result']['scrollId']
results = r['result']['results']
if len(results) <= 0:
return
store_results(results)
everything()
harvest()

42
airflow/dags/S3_delete.py Normal file
View File

@ -0,0 +1,42 @@
import os
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"prefix": "Key prefix of files to delete",
"bucket": "bucket containing files to delete",
},
tags=["s3"],
)
def s3_delete():
@task
def delete(**context):
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
keys = hook.list_keys(bucket_name=context["params"]["bucket"], prefix=context["params"]["prefix"])
hook.delete_objects(bucket=context["params"]["bucket"], keys=keys)
for key in keys:
print(f"{key} deleted!")
delete()
s3_delete()

98
airflow/dags/S3_untar.py Normal file
View File

@ -0,0 +1,98 @@
import os
import tarfile
import time
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from botocore.exceptions import ClientError
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
def check_for_key_with_backoff(hook: S3Hook, key:str, bucket:str) -> bool:
delay = 10 # initial delay
delay_incr = 10 # additional delay in each loop
max_delay = 60 # max delay of one loop; total wait is roughly max_delay**2 / (2 * delay_incr)
while delay < max_delay:
try:
return hook.check_for_key(key=key, bucket_name=bucket)
except ClientError as err:
code = err.response.get('Error',{}).get('Code', '')
if code in ['NoSuchBucket']:
print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
raise err
time.sleep(delay)
delay += delay_incr
def load_file_obj_with_backoff(hook: S3Hook, fileobj, key:str, bucket:str, replace:bool) -> bool:
delay = 10 # initial delay
delay_incr = 10 # additional delay in each loop
max_delay = 60 # max delay of one loop; total wait is roughly max_delay**2 / (2 * delay_incr)
while delay < max_delay:
try:
return hook.load_file_obj(fileobj,
key,
bucket,
replace=replace)
except ClientError as err:
code = err.response.get('Error',{}).get('Code', '')
if code in ['NoSuchBucket']:
print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
raise err
time.sleep(delay)
delay += delay_incr
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"src_key": "File to untar",
"src_bucket": "bucket containing the zip file",
"dst_key_prefix": "",
"dst_bucket": "bucket that will contain unzipped files"
},
tags=["s3"],
)
def s3_untar():
@task
def untar(**context):
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=context["params"]["src_bucket"])
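# Stream the tar archive straight from S3 (mode 'r|*' reads it sequentially, no seeking) and upload each
# regular file member to the destination bucket, skipping anything that already exists there.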
with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
for member in tar:
dst_key = context["params"]["dst_key_prefix"] + "/" + member.name
dst_key = os.path.normpath(dst_key)
# Ignore directories, links, devices, fifos, etc.
if (not member.isfile()) or member.name.endswith('/'):
print(f"Skipping {member.name}: is not a file")
continue
if check_for_key_with_backoff(hook, key=dst_key, bucket=context["params"]["dst_bucket"]):
print(f"Skipping {member.name}: already exists")
continue
print(f"Extracting {member.name} to {dst_key}")
fileobj = tar.extractfile(member)
fileobj.seekable = lambda: False
load_file_obj_with_backoff(hook, fileobj,
dst_key,
context["params"]["dst_bucket"],
replace=True)
untar()
s3_untar()

55
airflow/dags/S3_unzip.py Normal file
View File

@ -0,0 +1,55 @@
import os
import zipfile
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.file import TemporaryDirectory
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
def s3_download_unzip_upload(s3conn: str, src_key: str, src_bucket: str, dest_bucket: str):
hook = S3Hook(s3conn, transfer_config_args={'use_threads': False})
with TemporaryDirectory() as dwl_dir:
with TemporaryDirectory() as tmp_dir:
archive = f'{dwl_dir}/{src_key}'
hook.download_file(key=src_key, bucket_name=src_bucket, local_path=dwl_dir, preserve_file_name=True,
use_autogenerated_subdir=False)
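# Walk the zip's entries and stream each one to the destination bucket without extracting it to local disk.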
with zipfile.ZipFile(archive, 'r') as zip_ref:
for info in zip_ref.infolist():
with zip_ref.open(info.filename) as file:
hook.load_file_obj(file, info.filename, dest_bucket, replace=True)
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"zipfile": "File to unzip",
"src_bucket": "bucket containing the zip file",
"dst_bucket": "bucket that will contain unzipped files"
},
tags=["s3"],
)
def s3_unzip():
@task
def unzip(**context):
s3_download_unzip_upload(S3_CONN_ID,
context["params"]["zipfile"],
context["params"]["src_bucket"],
context["params"]["dst_bucket"])
unzip()
s3_unzip()

View File

@ -0,0 +1,68 @@
import os
from datetime import timedelta, datetime
import pendulum
from airflow import DAG
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
from airflow.providers.cncf.kubernetes.secret import Secret
default_args = {
"execution_timeout": timedelta(days=6),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
conn = BaseHook.get_connection("opensearch_default")
dag = DAG(
'antispam_batch_check',
default_args=default_args,
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
)
secrets = [
Secret(
deploy_type='env',
deploy_target='CURATION_OPENSEARCH__USER',
secret='opensearch-conn-secrets',
key='username',
),
Secret(
deploy_type='env',
deploy_target='CURATION_OPENSEARCH__PASSWORD',
secret='opensearch-conn-secrets',
key='password',
),
]
# Define the KubernetesPodOperator
task = KubernetesPodOperator(
task_id='antispam_checker',
name='antispam_checker',
namespace='kg-airflow',
image='gbloisi/curation:1.0.0',
image_pull_policy="Always",
cmds=['python3'],
arguments=['/antispam-batch.py',
"--opensearch.host", conn.host,
"--opensearch.port", str(conn.port),
"--openai.host", "local-ai.kg-airflow.svc.cluster.local",
"--openai.port", "8000",
"--parallelism", "36"
],
secrets=secrets,
is_delete_operator_pod=True,
in_cluster=True,
get_logs=True,
dag=dag
)
# Set the task dependencies
chain(task)

View File

@ -0,0 +1,314 @@
from datetime import datetime
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY
class RawCatalogOpensearch:
entities = ["datasources",
"interoperability-records",
"providers",
"resource-interoperability-records",
"services",
"training-resources"]
mapped_entities = ["interoperability-records", "training-resources", "services"]
def __init__(self, os_client: OpenSearch, suffix: str | None):
self.os_client = os_client
self.suffix = suffix
def get_index(self, name: str):
return f"catalog_{name}_{self.suffix}"
def get_alias(self, name: str):
return f"catalog_{name}"
def get_mapped_index(self, name: str):
match name:
case "interoperability-records":
return f"interoperability_{self.suffix}"
case "training-resources":
return f"training_{self.suffix}"
case "services":
return f"services_{self.suffix}"
return None
def get_mapped_alias(self, name: str):
match name:
case "interoperability-records":
return f"interoperability"
case "training-resources":
return f"training"
case "services":
return f"services"
return None
def get_resource_interoperability_records(self, resource_id):
response = self.os_client.search(
body={
'query': {
'term': {
'resourceInteroperabilityRecord.resourceId.keyword': resource_id,
}
},
"fields": [
"resourceInteroperabilityRecord.interoperabilityRecordIds"
],
"_source": False
},
index=self.get_index('resource-interoperability-records')
)
interoperability_ids = []
interoperability_records = []
for hit in response['hits']['hits']:
interoperability_ids.extend(
extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
if len(interoperability_ids) > 0:
response = self.os_client.search(
body={
"query": {
"ids": {
"values": interoperability_ids,
}
},
},
index=self.get_index('interoperability-records')
)
for hit in response['hits']['hits']:
interoperability_records.append(extract_nested(hit, ['_source']))
return interoperability_records
def get_providers(self, provider_ids: list[str]) -> list:
provider_records = []
if provider_ids is not None and len(provider_ids) > 0:
response = self.os_client.search(
body={
"query": {
"ids": {
"values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
}
},
},
index=self.get_index('providers')
)
for hit in response['hits']['hits']:
provider_records.append(extract_nested(hit, ['_source']))
return provider_records
def get_provider(self, provider_id: str):
if provider_id is not None:
providers = self.get_providers([provider_id])
if providers is not None and len(providers) > 0:
return providers[0]
return {}
def get_services(self, service_ids: list[str]) -> list:
service_records = []
if service_ids is not None and len(service_ids) > 0:
response = self.os_client.search(
body={
"query": {
"ids": {
"values": service_ids if isinstance(service_ids, list) else [
service_ids],
}
},
},
index=self.get_index('services')
)
for hit in response['hits']['hits']:
service_records.append(extract_nested(hit, ['_source']))
return service_records
def get_datasource_of_service(self, service_id: str):
response = self.os_client.search(
body={
'query': {
'term': {
'datasource.serviceId.keyword': service_id,
}
}
},
index=self.get_index('datasources')
)
for hit in response['hits']['hits']:
return extract_nested(hit, ['_source'])
return {}
def get_services_of_interoperability(self, interoperability_id: str):
svc_ids = []
response = self.os_client.search(
body={
'query': {
'term': {
'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': interoperability_id,
}
},
"fields": [
"resourceInteroperabilityRecord.resourceId"
],
"_source": False
},
index=self.get_index('resource-interoperability-records')
)
for hit in response['hits']['hits']:
svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
return svc_ids
def map_service(self, raw_svc: dict) -> dict:
interoperability_records = self.get_resource_interoperability_records(raw_svc['id'])
organization = self.get_provider(extract_nested(raw_svc, ['service', 'resourceOrganisation']))
provider_records = self.get_providers(list(
filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'resourceProviders']) or [])))
related_resources_records = self.get_services(list(
filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'relatedResources']) or [])))
datasource = self.get_datasource_of_service(raw_svc['id'])
res = {
"accessRestriction": extract_nested(raw_svc,
"service.geographicalAvailabilities".split(".")),
"accessTypes": extract_map_nested(raw_svc, 'access_type', "service.accessTypes".split(".")),
"access_modes": extract_map_nested(raw_svc, 'access_mode', "service.accessModes".split(".")),
"category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
"subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
extract_nested(raw_svc, "service.categories".split(".")))),
"description": extract_nested(raw_svc, "service.description".split(".")),
"domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
"subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
extract_nested(raw_svc, "service.scientificDomains".split(".")))),
"grantProjectNames": extract_nested(raw_svc, "service.grantProjectNames".split(".")),
"helpdeskPage": extract_nested(raw_svc, "service.helpdeskPage".split(".")),
"horizontal": extract_nested(raw_svc, "service.horizontalService".split(".")) or False,
"id": extract_nested(raw_svc, "service.id".split(".")),
"interoperabilityGuidelines": list(
map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
"language": extract_map_nested(raw_svc, 'languages', "service.languageAvailabilities".split(".")),
"name": extract_nested(raw_svc, "service.name".split(".")),
"orderType": extract_map_nested(raw_svc, 'order_type', "service.orderType".split(".")),
"organization": extract_nested(organization, "provider.name".split(".")),
"pricing": extract_nested(raw_svc, "service.pricing".split(".")),
"privacyPolicy": extract_nested(raw_svc, "service.privacyPolicy".split(".")),
"providers": list(map(lambda p: p['provider']['name'], provider_records)),
"relatedPlatforms": extract_map_nested(raw_svc, 'related_platform', "service.relatedPlatforms".split(".")),
"relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
"tags": extract_nested(raw_svc, "service.tags".split(".")),
"targetUsers": extract_map_nested(raw_svc, 'target_user', "service.targetUsers".split(".")),
"termsOfUse": extract_nested(raw_svc, "service.termsOfUse".split(".")),
"thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
"trl": extract_map_nested(raw_svc, 'trl', "service.trl".split(".")),
"type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
"useCases": extract_nested(raw_svc, "service.useCases".split(".")),
"userManual": extract_nested(raw_svc, "service.userManual".split(".")),
"webpage": extract_nested(raw_svc, "service.webpage".split(".")),
"year": datetime.fromtimestamp(
int(extract_nested(raw_svc, "metadata.registeredAt".split("."))) / 1000).year,
}
return delete_none(res)
def map_training(self, raw_trn: dict) -> dict:
organization = self.get_provider(extract_nested(raw_trn, ['trainingResource', 'resourceOrganisation']))
res = {
"accessRights": extract_map_nested(raw_trn, 'tr_access', "trainingResource.accessRights".split(".")),
"alternativeIdentifiers": extract_nested(raw_trn,
"trainingResource.alternativeIdentifiers".split(".")),
"authors": extract_nested(raw_trn,
"trainingResource.authors".split(".")),
"contentResourceType": extract_map_nested(raw_trn, 'tr_content',
"trainingResource.contentResourceTypes".split(".")),
"description": extract_nested(raw_trn,
"trainingResource.description".split(".")),
"domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
"subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
extract_nested(raw_trn, "trainingResource.scientificDomains".split(".")))),
"duration": extract_nested(raw_trn,
"trainingResource.duration".split(".")),
"expertiseLevel": extract_map_nested(raw_trn, 'expertise_level',
"trainingResource.expertiseLevel".split(".")),
"id": extract_nested(raw_trn,
"trainingResource.id".split(".")),
"keyword": extract_nested(raw_trn,
"trainingResource.keywords".split(".")),
"language": extract_map_nested(raw_trn, 'languages', "trainingResource.languages".split(".")),
"learningOutcomes": extract_nested(raw_trn,
"trainingResource.learningOutcomes".split(".")),
"learningResourceType": extract_map_nested(raw_trn, 'tr_dcmi',
"trainingResource.learningResourceTypes".split(".")),
"license": extract_nested(raw_trn,
"trainingResource.license".split(".")),
"organization": extract_nested(organization, "provider.name".split(".")),
"qualifications": extract_map_nested(raw_trn, 'qualification',
"trainingResource.qualifications".split(".")),
"targetGroup": extract_map_nested(raw_trn, 'target_user', "trainingResource.targetGroups".split(".")),
"title": extract_nested(raw_trn,
"trainingResource.title".split(".")),
"type": 'trainingResource',
"url": extract_nested(raw_trn,
"trainingResource.url".split(".")),
"year": datetime.fromtimestamp(
int(extract_nested(raw_trn, "metadata.registeredAt".split("."))) / 1000).year,
}
return delete_none(res)
def map_interoperability(self, raw_itr: dict) -> dict:
organization = self.get_provider(extract_nested(raw_itr, ['interoperabilityRecord', 'providerId']))
service_records = self.get_services(self.get_services_of_interoperability(raw_itr['id']))
res = {
"alternativeIdentifiers": extract_nested(raw_itr,
"interoperabilityRecord.alternativeIdentifiers".split(".")),
"creators": list(map(lambda c: {
"affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
"givenName": extract_nested(c, ['givenName']),
"familyName": extract_nested(c, ['familyName']),
"fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
"type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
}, extract_nested(raw_itr, "interoperabilityRecord.creators".split(".")))),
"description": extract_nested(raw_itr,
"interoperabilityRecord.description".split(".")),
"doi": extract_nested(raw_itr, ['identifierInfo', 'identifier']) if
extract_nested(raw_itr, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
"domain": {'domain': extract_map_nested(raw_itr, 'domains',
"interoperabilityRecord.domain".split("."))},
"guidelineType": extract_map_nested(raw_itr, 'guideline_type',
"interoperabilityRecord.eoscGuidelineType".split(".")),
"id": extract_nested(raw_itr,
"interoperabilityRecord.id".split(".")),
"license": extract_nested(raw_itr, "interoperabilityRecord.rights.rightIdentifier".split(".")),
"licenseDetails": list(map(lambda c: {
"identifier": extract_nested(c, ['rightIdentifier']),
"title": extract_nested(c, ['rightTitle']),
"uri": extract_nested(c, ['rightURI'])
}, extract_nested(raw_itr, "interoperabilityRecord.rights".split(".")))),
"organization": extract_nested(organization, "provider.name".split(".")),
"provider": extract_nested(organization, "provider.name".split(".")),
"publicationYear": extract_nested(raw_itr, "interoperabilityRecord.publicationYear".split(".")),
"services": list(map(lambda s: {
"name": extract_nested(organization, "service.name".split(".")),
"organization": extract_nested(organization, "service.organization".split(".")),
# s.organization on already mapped services
}, service_records)),
"status": extract_nested(raw_itr, "interoperabilityRecord.status".split(".")),
"title": extract_nested(raw_itr, "interoperabilityRecord.title".split(".")),
"type": 'interoperabilityRecord',
# "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
}
return delete_none(res)
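A minimal sketch of how this mapper might be driven, assuming the raw catalog_* indexes are already populated and each raw record carries a top-level 'id' field (the connection details, suffix and reindex loop below are illustrative, not the pipeline's actual code):

from opensearchpy import OpenSearch, helpers

client = OpenSearch(hosts=[{'host': 'localhost', 'port': 9200}],
                    http_auth=('admin', 'admin'),
                    use_ssl=True, verify_certs=False, ssl_show_warn=False)
catalog = RawCatalogOpensearch(client, suffix="20240806")

def mapped_services():
    # Scan the raw services index and emit one bulk action per mapped document.
    for hit in helpers.scan(client, index=catalog.get_index("services")):
        doc = catalog.map_service(hit["_source"])
        yield {"_index": catalog.get_mapped_index("services"), "_id": doc.get("id"), "_source": doc}

helpers.bulk(client, mapped_services())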

View File

@ -0,0 +1,41 @@
from typing import Dict, Any, List
from catalogue.vocabulary import CATALOG_VOCABULARY
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
if len(labels) <= 0:
return current_value
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
return current_value
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
value = extract_nested(current_value, labels)
if value is None:
return None
if isinstance(value, list):
return list(map(lambda d: CATALOG_VOCABULARY[dictionary][d] if d else None, value))
if isinstance(value, str) and value != '':
return CATALOG_VOCABULARY[dictionary][value]
return None
def delete_none(_dict):
"""Delete None values recursively from all of the dictionaries, tuples, lists, sets"""
if isinstance(_dict, dict):
for key, value in list(_dict.items()):
if isinstance(value, (list, dict, tuple, set)):
_dict[key] = delete_none(value)
elif value is None or key is None:
del _dict[key]
elif isinstance(_dict, (list, set, tuple)):
_dict = type(_dict)(delete_none(item) for item in _dict if item is not None)
return _dict
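A couple of hypothetical calls showing the intended behaviour of these helpers (the sample record is made up):

record = {"service": {"name": "My Service", "pricing": None}}
extract_nested(record, ["service", "name"])        # -> "My Service"
extract_nested(record, ["service", "missing"])     # -> None
delete_none({"a": 1, "b": None, "c": [{"d": None, "e": 2}]})  # -> {"a": 1, "c": [{"e": 2}]}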

View File

@ -0,0 +1,13 @@
from datetime import datetime
from typing import Dict, Any, List
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY

View File

@ -0,0 +1,878 @@
CATALOG_VOCABULARY = {
'categories': {'category-access_physical_and_eInfrastructures-compute': 'Compute',
'category-access_physical_and_eInfrastructures-data_storage': 'Data Storage',
'category-access_physical_and_eInfrastructures-instrument_and_equipment': 'Instrument & Equipment',
'category-access_physical_and_eInfrastructures-material_storage': 'Material Storage',
'category-access_physical_and_eInfrastructures-network': 'Network',
'category-aggregators_and_integrators-aggregators_and_integrators': 'Aggregators & Integrators',
'category-other-other': 'Other', 'category-processing_and_analysis-data_analysis': 'Data Analysis',
'category-processing_and_analysis-data_management': 'Data Management',
'category-processing_and_analysis-measurement_and_materials_analysis': 'Measurement & Materials Analysis',
'category-security_and_operations-operations_and_infrastructure_management_services': 'Operations & Infrastructure Management Services',
'category-security_and_operations-security_and_identity': 'Security & Identity',
'category-sharing_and_discovery-applications': 'Applications',
'category-sharing_and_discovery-data': 'Data',
'category-sharing_and_discovery-development_resources': 'Development Resources',
'category-sharing_and_discovery-samples': 'Samples',
'category-sharing_and_discovery-scholarly_communication': 'Scholarly Communication',
'category-sharing_and_discovery-software': 'Software',
'category-training_and_support-consultancy_and_support': 'Consultancy & Support',
'category-training_and_support-education_and_training': 'Education & Training'},
'trl': {'trl-1': '1 - basic principles observed', 'trl-2': '2 - technology concept formulated',
'trl-3': '3 - experimental proof of concept', 'trl-4': '4 - technology validated in lab',
'trl-5': '5 - technology validated in relevant environment',
'trl-6': '6 - technology demonstrated in relevant environment',
'trl-7': '7 - system prototype demonstration in operational environment',
'trl-8': '8 - system complete and qualified',
'trl-9': '9 - actual system proven in operational environment'},
'target_users': {'target_user-businesses': 'Businesses', 'target_user-funders': 'Funders',
'target_user-innovators': 'Innovators', 'target_user-other': 'Other',
'target_user-policy_makers': 'Policy Makers', 'target_user-providers': 'Providers',
'target_user-research_communities': 'Research Communities',
'target_user-research_groups': 'Research Groups',
'target_user-research_infrastructure_managers': 'Research Infrastructure Managers',
'target_user-research_managers': 'Research Managers',
'target_user-research_networks': 'Research Networks',
'target_user-research_organisations': 'Research Organisations',
'target_user-research_projects': 'Research Projects', 'target_user-researchers': 'Researchers',
'target_user-resource_managers': 'Resource Managers',
'target_user-resource_provider_managers': 'Provider Managers',
'target_user-publishers': 'Publishers',
'target_user-students': 'Students'},
'access_mode': {'access_mode-free': 'Free', 'access_mode-free_conditionally': 'Free Conditionally',
'access_mode-other': 'Other', 'access_mode-paid': 'Paid',
'access_mode-peer_reviewed': 'Peer Reviewed'},
'funding_body': {'funding_body-ademe': 'Agency for Environment and Energy Management (ADEME)',
'funding_body-ahrc': 'Arts and Humanities Research Council (AHRC)',
'funding_body-aka': 'Academy of Finland (AKA)',
'funding_body-ancs': 'National Authority for Scientific Research (ANCS)',
'funding_body-anr': 'French National Research Agency (ANR)',
'funding_body-apvv': 'Research and Development Agency (APVV)',
'funding_body-arc': 'Australian Research Council (ARC)',
'funding_body-arrs': 'Slovenian Research Agency (ARRS)',
'funding_body-awi': 'Alfred Wegener Institute for Polar and Marine Research (AWI)',
'funding_body-bbsrc': 'Biotechnology and Biological Sciences Research Council (BBSRC)',
'funding_body-bf': 'Belmont Forum (BF)',
'funding_body-bmbf': 'Federal Ministry of Education and Research (BMBF)',
'funding_body-caixa': 'La Caixa Foundation (CAIXA)',
'funding_body-cdti': 'Center for Industrial Technological Development (CDTI)',
'funding_body-cea': 'Alternative Energies and Atomic Energy Commission (CEA)',
'funding_body-cihr': 'Canadian Institutes of Health Research (CIHR)',
'funding_body-cncsis': 'National University Research Council (CNCSIS) - Romania',
'funding_body-cnes': 'National Centre for Space Studies (CNES)',
'funding_body-cnpq': 'National Council for Scientific and Technological Development (CNPq)',
'funding_body-cnr': 'National Research Council (CNR)',
'funding_body-cnrs': 'National Centre for Scientific Research (CNRS)',
'funding_body-csf': 'Croatian Science Foundation (CSF)',
'funding_body-csic': 'Spanish National Research Council (CSIC)',
'funding_body-dashe': 'Danish Agency for Science and Higher Education (DASHE)',
'funding_body-dasti': 'Danish Agency for Science, Technology and Innovation (DASTI)',
'funding_body-ddf': 'The Danish Council for Independent Research (DDF)',
'funding_body-dff': 'Danish Council for Independent Research (DFF)',
'funding_body-dfg': 'German Research Foundation (DFG)',
'funding_body-dgo6': 'General Operational Directorate for Economy, Employment and Research (DGO6)',
'funding_body-dlr': 'German Aerospace Center (DLR)',
'funding_body-dnrf': 'Danish National Research Foundation (DNRF)',
'funding_body-eaer': 'Federal Department of Economic Affairs, Education and Research (EAER)',
'funding_body-ec': 'European Commission (EC)',
'funding_body-epsrc': 'Engineering and Physical Sciences Research Council (EPSRC)',
'funding_body-esa': 'European Space Agency (ESA)',
'funding_body-esrc': 'Economic and Social Research Council (ESRC)',
'funding_body-etag': 'Estonian Research Council (ETAG)',
'funding_body-fapesp': 'São Paulo Research Foundation (FAPESP)',
'funding_body-fct': 'Foundation for Science and Technology (FCT)',
'funding_body-ffg': 'Austrian Research Promotion Agency (FFG)',
'funding_body-fnp': 'Foundation for Polish Science (FNP)',
'funding_body-fnr': 'National Research Fund (FNR)',
'funding_body-fnrs': 'Fonds National de la Recherche Scientifique (FNRS)',
'funding_body-fom': 'Foundation for Fundamental Research on Matter (FOM)',
'funding_body-forte': 'Swedish Research Council for Health, Working Life and Welfare (FORTE)',
'funding_body-fts': 'Fritz Thyssen Foundation (FTS)',
'funding_body-fwf': 'Austrian Science Fund (FWF)',
'funding_body-fwo': 'Research Foundation Flanders (FWO)',
'funding_body-gacr': 'Czech Science Foundation (GACR)',
'funding_body-gsrt': 'General Secretariat for Research and Technology (GSRT)',
'funding_body-ifd': 'Innovation Fund Denmark (IFD)',
'funding_body-ifremer': 'French Research Institute for Exploitation of the Sea (IFREMER)',
'funding_body-imsr': 'Innovation Fund of the Ministry of Economy of the Slovak Republic (IMSR)',
'funding_body-innoviris': 'Brussels Institute for Research and Innovation (INNOVIRIS)',
'funding_body-inra': 'National Institute of Agricultural Research (INRA)',
'funding_body-inserm': 'National Institute of Health and Medical Research (INSERM)',
'funding_body-ipev': 'French Polar Institute (IPEV)',
'funding_body-irc': 'Irish Research Council (IRC)',
'funding_body-isc': 'International Science Council (ISC)',
'funding_body-isciii': 'Carlos III Health Institute (ISCIII)',
'funding_body-isf': 'Israel Science Foundation (ISF)',
'funding_body-iwt': 'Agency for Innovation by Science and Technology (IWT)',
'funding_body-jsps': 'Japanese Society for the Promotion of Science (JSPS)',
'funding_body-jst': 'Japanese Science and Technology Agency (JST)',
'funding_body-kaws': 'Knut and Alice Wallenberg Foundation (KAWS)',
'funding_body-kks': 'Knowledge Foundation (KKS)',
'funding_body-lmt': 'Research Council of Lithuania (LMT)',
'funding_body-mcst': 'Malta Council for Science and Technology (MCST)',
'funding_body-mecr': 'Ministry for Education and Scientific Research (MECR)',
'funding_body-mesr': 'Ministry of Higher Education and Research (MESR)',
'funding_body-mestd': 'Ministry of Education, Science and Technological Development of Republic of Serbia (MESTD)',
'funding_body-mgrt': 'Ministry for Economic Development and Technology (MGRT)',
'funding_body-mineco': 'Ministry for Economy and Competitiveness (MINECO)',
'funding_body-mistra': 'Swedish Foundation for Strategic Environmental Research (MISTRA)',
'funding_body-mita': 'Agency for Science, Innovation and Technology (MITA)',
'funding_body-miur': 'Ministry for Education, University and Research (MIUR)',
'funding_body-most': "Ministry of Science and Technology of the People's Republic of China (MOST)",
'funding_body-mpg': 'Max Planck Society for the Advancement of Science (MPG)',
'funding_body-mrc': 'Medical Research Council (MRC)',
'funding_body-mse': 'Ministry of Science and Education Republic of Croatia (MSE)',
'funding_body-msvvas_sr': 'The Ministry of Education, Science, Research and Sports of the Slovak Republic (MSVVaS SR)',
'funding_body-nasa': 'National Aeronautics and Space Administration (NASA)',
'funding_body-ncbir': 'National Centre for Research and Development (NCBiR)',
'funding_body-ncn': 'National Science Center (NCN)',
'funding_body-nerc': 'Natural Environment Research Council (NERC)',
'funding_body-nhmrc': 'National Health and Medical Research Council (NHMRC)',
'funding_body-nig': 'National Institutes of Health (NIG)',
'funding_body-nkfia': 'National Research, Development and Innovation Fund (NKFIA)',
'funding_body-nrf': 'National Research Foundation (NRF)',
'funding_body-nserc': 'Natural Sciences and Engineering Research Council of Canada (NSERC)',
'funding_body-nsf': 'National Science Foundation (NSF)',
'funding_body-nwo': 'Netherlands Organisation for Scientific Research (NWO)',
'funding_body-oeaw': 'Austrian Academy of Sciences (OeAW)',
'funding_body-oenfte': 'National Foundation for Research, Technology and Development (OeNFTE)',
'funding_body-onera': 'French National Aerospace Research Center (ONERA)',
'funding_body-other': 'Other', 'funding_body-rannis': 'Icelandic Centre for Research (RANNIS)',
'funding_body-rcn': 'Research Council of Norway (RCN)',
'funding_body-rcuk': 'Research Council UK (RCUK)',
'funding_body-rj': 'The Swedish Foundation for Humanities and Social Sciences (RJ)',
'funding_body-rpf': 'Research Promotion Foundation (RPF)',
'funding_body-sea': 'Swedish Energy Agency (SEA)',
'funding_body-sepa': 'Swedish Environmental Protection Agency (SEPA)',
'funding_body-sfi': 'Science Foundation Ireland (SFI)',
'funding_body-sgpi': 'Secretariat-General for Investment (SGPI)',
'funding_body-snf': 'Swiss National Science Foundation (SNF)',
'funding_body-snsb': 'Swedish National Space Board (SNSB)',
'funding_body-srcf': 'Swedish Research Council Formas (SRCF)',
'funding_body-srsa': 'Swedish Radiation Safety Authority (SRSA)',
'funding_body-ssf': 'Swedish Foundation for Strategic Research (SSF)',
'funding_body-sshrc': 'Social Sciences and Humanities Research Council (SSHRC)',
'funding_body-stfc': 'Science and Technology Facilities Council (STFC)',
'funding_body-stw': 'Technology Foundation (STW)',
'funding_body-tacr': 'Technology Agency of the Czech Republic (TACR)',
'funding_body-tara': 'Tara Expeditions Foundation (TARA)',
'funding_body-tekes': 'Finnish Funding Agency for Technology and Innovation (TEKES)',
'funding_body-tubitak': 'Scientific and Technological Research Council of Turkey (TUBITAK)',
'funding_body-uefiscdi_cncs': 'Executive Agency for Higher Education, Research, Development and Innovation Funding (UEFISCDI - CNCS)',
'funding_body-ukri': 'UK Research and Innovation (UKRI)',
'funding_body-vega': 'Scientific Grant Agency (VEGA)',
'funding_body-viaa': 'State Education Development Agency (VIAA)',
'funding_body-vinnova': 'Swedish Governmental Agency for Innovation Systems (VINNOVA)',
'funding_body-vlaio': 'Flanders Innovation & Entrepreneurship (VLAIO)',
'funding_body-vr': 'Swedish Research Council (VR)',
'funding_body-vs': 'Volkswagen Foundation (VS)',
'funding_body-wt': 'Wellcome Trust (WT)',
'funding_body-wwtf': 'Vienna Science and Technology Fund (WWTF)',
'funding_body-meys': 'Ministry of Education, Youth and Sports of the Czech Republic (MEYS)',
'funding_body-af': 'Arcadia Fund'},
'target_user': {'target_user-businesses': 'Businesses', 'target_user-funders': 'Funders',
'target_user-innovators': 'Innovators', 'target_user-other': 'Other',
'target_user-policy_makers': 'Policy Makers', 'target_user-providers': 'Providers',
'target_user-research_communities': 'Research Communities',
'target_user-research_groups': 'Research Groups',
'target_user-research_infrastructure_managers': 'Research Infrastructure Managers',
'target_user-research_managers': 'Research Managers',
'target_user-research_networks': 'Research Networks',
'target_user-research_organisations': 'Research Organisations',
'target_user-research_projects': 'Research Projects', 'target_user-researchers': 'Researchers',
'target_user-resource_managers': 'Resource Managers',
'target_user-resource_provider_managers': 'Provider Managers',
'target_user-publishers': 'Publishers',
'target_user-students': 'Students'},
'related_platform': {'related_platform-ands': 'ANDS', 'related_platform-artportalen': 'ArtPortalen',
'related_platform-arxiv': 'arXiv', 'related_platform-ala': 'Atlas of Living Australia',
'related_platform-avp': 'AV-Portal', 'related_platform-aws': 'AWS',
'related_platform-bluecloud': 'Blue-Cloud',
'related_platform-cdl': 'California Digital Library',
'related_platform-ccdc': 'CCDC', 'related_platform-cessda': 'CESSDA',
'related_platform-collabwith': 'COLLABWITH',
'related_platform-cccs': 'Copernicus Climate Change Service',
'related_platform-crossref': 'Crossref', 'related_platform-dariahteach': 'dariahTeach',
'related_platform-dice': 'Data Infrastructure Capacity for EOSC (DICE)',
'related_platform-datacite': 'DataCite', 'related_platform-ds': 'Digital Science',
'related_platform-doab': 'DOAB', 'related_platform-einfracz': 'e-INFRA CZ',
'related_platform-eirgspp': 'e-IRGSP projects', 'related_platform-edugain': 'eduGAIN',
'related_platform-eduteams': 'eduTEAMS', 'related_platform-egi': 'EGI',
'related_platform-egifc': 'EGI Federated Cloud', 'related_platform-egiace': 'EGI-ACE',
'related_platform-elixir': 'ELIXIR', 'related_platform-emodnetc': 'EMODnet Chemistry',
'related_platform-eol': 'Encyclopedia of Life',
'related_platform-enc': 'Endemia New Caledonia',
'related_platform-envri': 'ENVRI Hub', 'related_platform-eoscl': 'EOSC-Life',
'related_platform-eoscn': 'EOSC-Nordic', 'related_platform-eoscp': 'EOSC-Pillar',
'related_platform-eudatcdi': 'EUDAT CDI', 'related_platform-elg': 'European Language Grid',
'related_platform-evs': 'European Values Study (EVS)',
'related_platform-garrcp': 'GARR Container Platform',
'related_platform-gatep': 'GATE platform',
'related_platform-gbif': 'GBIF', 'related_platform-geonames': 'GeoNames',
'related_platform-grin': 'Germplasm Resources Information Network (GRIN)',
'related_platform-geoss': 'Global Earth Observation System of Systems (GEOSS)',
'related_platform-hal': 'HAL', 'related_platform-hamelin': 'Hamelin',
'related_platform-infnc': 'INFN-Cloud', 'related_platform-ispot': 'iSpot',
'related_platform-jisc': 'JISC', 'related_platform-metacentrum': 'MetaCentrum',
'related_platform-natusfera': 'Natusfera', 'related_platform-openairee': 'OpenAIRE EXPLORE',
'related_platform-openairem': 'OpenAIRE MONITOR',
'related_platform-openairerg': 'OpenAIRE research graph',
'related_platform-oc': 'OpenCitations',
'related_platform-pogo': 'Partnership for Observation of the Global Oceans (POGO)',
'related_platform-pnp': 'Pl@ntNet platform', 'related_platform-pc': 'PolicyCloud',
'related_platform-rjb': 'Real Jardín Botánico', 'related_platform-scopus': 'Scopus',
'related_platform-seadatanet': 'SeaDataNet',
'related_platform-tsd': 'Service for Sensitive Data (TSD)',
'related_platform-sshom': 'SSH Open Marketplace', 'related_platform-surf': 'SURF',
'related_platform-share': 'Survey of Health, Ageing and Retirement in Europe (SHARE)',
'related_platform-tf': 'Taylor&Francis', 'related_platform-tb': 'Tela Botanica',
'related_platform-tdp': 'The Dataverse Project',
'related_platform-tnomadl': 'The NOMAD Laboratory', 'related_platform-tpg': 'The Plant Game',
'related_platform-tibp': 'TIB Portal', 'related_platform-tripleh': 'TRIPLE H2020 project',
'related_platform-tubitakcc': 'TÜBITAK cloud compute',
'related_platform-vlab': 'Virtual Earth Laboratory (VLab)',
'related_platform-zbwice': 'ZBW Information Centre for Economics',
'related_platform-zenodo': 'Zenodo'},
'languages': {'aa': 'Afar', 'ab': 'Abkhazian', 'ae': 'Avestan', 'af': 'Afrikaans', 'ak': 'Akan', 'am': 'Amharic',
'an': 'Aragonese', 'ar': 'Arabic', 'as': 'Assamese', 'av': 'Avaric', 'ay': 'Aymara',
'az': 'Azerbaijani',
'ba': 'Bashkir', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bh': 'Bihari', 'bi': 'Bislama',
'bm': 'Bambara',
'bn': 'Bengali', 'bo': 'Tibetan', 'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'ce': 'Chechen',
'ch': 'Chamorro', 'co': 'Corsican', 'cr': 'Cree', 'cs': 'Czech', 'cu': 'Old Church Slavonic',
'cv': 'Chuvash', 'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dv': 'Divehi', 'dz': 'Dzongkha',
'ee': 'Ewe', 'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish', 'et': 'Estonian',
'eu': 'Basque', 'fa': 'Persian', 'ff': 'Fula', 'fi': 'Finnish', 'fj': 'Fijian', 'fo': 'Faroese',
'fr': 'French', 'fy': 'Western Frisian', 'ga': 'Irish', 'gd': 'Gaelic', 'gl': 'Galician',
'gn': 'Guarani', 'gu': 'Gujarati', 'gv': 'Manx', 'ha': 'Hausa', 'he': 'Hebrew', 'hi': 'Hindi',
'ho': 'Hiri Motu', 'hr': 'Croatian', 'ht': 'Haitian', 'hu': 'Hungarian', 'hy': 'Armenian',
'hz': 'Herero', 'ia': 'Interlingua', 'id': 'Indonesian', 'ie': 'Interlingue', 'ig': 'Igbo',
'ii': 'Nuosu', 'iii': 'Sichuan Yi', 'ik': 'Inupiak', 'io': 'Ido', 'is': 'Icelandic', 'it': 'Italian',
'iu': 'Inuktitut', 'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kg': 'Kongo',
'ki': 'Kikuyu',
'kj': 'Kwanyama', 'kk': 'Kazakh', 'kl': 'Kalaallisut', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
'kr': 'Kanuri', 'ks': 'Kashmiri', 'ku': 'Kurdish', 'kv': 'Komi', 'kw': 'Cornish', 'ky': 'Kyrgyz',
'la': 'Latin', 'lb': 'Luxembourgish', 'li': 'Limburgish', 'ln': 'Lingala', 'lo': 'Lao',
'lt': 'Lithuanian', 'lu': 'Luba-Katanga', 'lv': 'Latvian', 'mg': 'Malagasy', 'mh': 'Marshallese',
'mi': 'Maori', 'mk': 'Macedonian', 'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi',
'ms': 'Malay',
'mt': 'Maltese', 'my': 'Burmese', 'na': 'Nauru', 'nb': 'Norwegian Bokmål', 'nd': 'Northern Ndebele',
'ne': 'Nepali', 'ng': 'Ndonga', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
'nr': 'Southern Ndebele', 'nv': 'Navajo', 'ny': 'Chichewa', 'oc': 'Occitan', 'oj': 'Ojibwe',
'om': 'Oromo', 'or': 'Oriya', 'os': 'Ossetian', 'ot': 'Other', 'pa': 'Panjabi', 'pi': 'Pāli',
'pl': 'Polish', 'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'rm': 'Romansh', 'rn': 'Kirundi',
'ro': 'Romanian', 'ru': 'Russian', 'rw': 'Kinyarwanda', 'sa': 'Sanskrit', 'sar': 'Sardinian',
'sd': 'Sindhi', 'se': 'Sami', 'sg': 'Sango', 'si': 'Sinhalese', 'sk': 'Slovak', 'sl': 'Slovenian',
'sm': 'Samoan', 'sn': 'Shona', 'so': 'Somali', 'sq': 'Albanian', 'sr': 'Serbian', 'ss': 'Swati',
'st': 'Sesotho', 'su': 'Sundanese', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
'tg': 'Tajik', 'th': 'Thai', 'ti': 'Tigrinya', 'tk': 'Turkmen', 'tl': 'Tagalog', 'tn': 'Setswana',
'to': 'Tonga', 'tr': 'Turkish', 'ts': 'Tsonga', 'tt': 'Tatar', 'tw': 'Twi', 'ty': 'Tahitian',
'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'uz': 'Uzbek', 've': 'Venda', 'vi': 'Vietnamese',
'vo': 'Volapük', 'wa': 'Walloon', 'wo': 'Wolof', 'xh': 'Xhosa', 'yi': 'Yiddish', 'yo': 'Yoruba',
'za': 'Zhuang', 'zh': 'Chinese', 'zu': 'Zulu'}, 'ig': {},
'qualification': {'tr_qualification-badge': 'Badge', 'tr_qualification-certification': 'Certification',
'tr_qualification-accreditation': 'Accreditation'}, 'subcategories': {
'subcategory-access_physical_and_eInfrastructures-compute-container_management': 'Container Management',
'subcategory-access_physical_and_eInfrastructures-compute-job_execution': 'Job Execution',
'subcategory-access_physical_and_eInfrastructures-compute-orchestration': 'Orchestration',
'subcategory-access_physical_and_eInfrastructures-compute-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-compute-serverless_applications_repository': 'Serverless Applications Repository',
'subcategory-access_physical_and_eInfrastructures-compute-virtual_machine_management': 'Virtual Machine Management',
'subcategory-access_physical_and_eInfrastructures-compute-workload_management': 'Workload Management',
'subcategory-access_physical_and_eInfrastructures-data_storage-archive': 'Archive',
'subcategory-access_physical_and_eInfrastructures-data_storage-backup': 'Backup',
'subcategory-access_physical_and_eInfrastructures-data_storage-data': 'Data',
'subcategory-access_physical_and_eInfrastructures-data_storage-digital_preservation': 'Digital Preservation',
'subcategory-access_physical_and_eInfrastructures-data_storage-disk': 'Disk',
'subcategory-access_physical_and_eInfrastructures-data_storage-file': 'File',
'subcategory-access_physical_and_eInfrastructures-data_storage-online': 'Online',
'subcategory-access_physical_and_eInfrastructures-data_storage-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-data_storage-queue': 'Queue',
'subcategory-access_physical_and_eInfrastructures-data_storage-recovery': 'Recovery',
'subcategory-access_physical_and_eInfrastructures-data_storage-replicated': 'Replicated',
'subcategory-access_physical_and_eInfrastructures-data_storage-synchronised': 'Synchronised',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-chromatographer': 'Chromatographer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-cytometer': 'Cytometer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-digitisation_equipment': 'Digitisation Equipment',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-geophysical': 'Geophysical',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-laser': 'Laser',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-microscopy': 'Microscopy',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-monument_maintenance_equipment': 'Monument Maintenance Equipment',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-radiation': 'Radiation',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-spectrometer': 'Spectrometer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-spectrophotometer': 'Spectrophotometer',
'subcategory-access_physical_and_eInfrastructures-material_storage-archiving': 'Archiving',
'subcategory-access_physical_and_eInfrastructures-material_storage-assembly': 'Assembly',
'subcategory-access_physical_and_eInfrastructures-material_storage-disposal': 'Disposal',
'subcategory-access_physical_and_eInfrastructures-material_storage-fulfilment': 'Fulfilment',
'subcategory-access_physical_and_eInfrastructures-material_storage-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-material_storage-packaging': 'Packaging',
'subcategory-access_physical_and_eInfrastructures-material_storage-preservation': 'Preservation',
'subcategory-access_physical_and_eInfrastructures-material_storage-quality_inspecting': 'Quality Inspecting',
'subcategory-access_physical_and_eInfrastructures-material_storage-repository': 'Repository',
'subcategory-access_physical_and_eInfrastructures-material_storage-reworking': 'Reworking',
'subcategory-access_physical_and_eInfrastructures-material_storage-sorting': 'Sorting',
'subcategory-access_physical_and_eInfrastructures-material_storage-warehousing': 'Warehousing',
'subcategory-access_physical_and_eInfrastructures-network-content_delivery_network': 'Content Delivery Network',
'subcategory-access_physical_and_eInfrastructures-network-direct_connect': 'Direct Connect',
'subcategory-access_physical_and_eInfrastructures-network-exchange': 'Exchange',
'subcategory-access_physical_and_eInfrastructures-network-load_balancer': 'Load Balancer',
'subcategory-access_physical_and_eInfrastructures-network-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-network-traffic_manager': 'Traffic Manager',
'subcategory-access_physical_and_eInfrastructures-network-virtual_nework': 'Virtual Network',
'subcategory-access_physical_and_eInfrastructures-network-vpn_gateway': 'VPN Gateway',
'subcategory-access_physical_and_eInfrastructures-network-dns': 'DNS',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-applications': 'Applications',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-data': 'Data',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-other': 'Other',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-services': 'Services',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-software': 'Software',
'subcategory-other-other-other': 'Other',
'subcategory-processing_and_analysis-data_analysis-2d_3d_digitisation': '2D/3D Digitisation',
'subcategory-processing_and_analysis-data_analysis-artificial_intelligence': 'Artificial Intelligence',
'subcategory-processing_and_analysis-data_analysis-data_extrapolation': 'Data Extrapolation',
'subcategory-processing_and_analysis-data_analysis-forecast': 'Forecast',
'subcategory-processing_and_analysis-data_analysis-image_data_analysis': 'Image/Data Analysis',
'subcategory-processing_and_analysis-data_analysis-machine_learning': 'Machine Learning',
'subcategory-processing_and_analysis-data_analysis-other': 'Other',
'subcategory-processing_and_analysis-data_analysis-visualization': 'Visualization',
'subcategory-processing_and_analysis-data_analysis-workflows': 'Workflows',
'subcategory-processing_and_analysis-data_analysis-quality_assesment': 'Quality Assessment',
'subcategory-processing_and_analysis-data_management-access': 'Access',
'subcategory-processing_and_analysis-data_management-annotation': 'Annotation',
'subcategory-processing_and_analysis-data_management-anonymisation': 'Anonymisation',
'subcategory-processing_and_analysis-data_management-brokering': 'Brokering',
'subcategory-processing_and_analysis-data_management-digitisation': 'Digitisation',
'subcategory-processing_and_analysis-data_management-discovery': 'Discovery',
'subcategory-processing_and_analysis-data_management-embargo': 'Embargo',
'subcategory-processing_and_analysis-data_management-interlinking': 'Interlinking',
'subcategory-processing_and_analysis-data_management-maintenance': 'Maintenance',
'subcategory-processing_and_analysis-data_management-mining': 'Mining',
'subcategory-processing_and_analysis-data_management-other': 'Other',
'subcategory-processing_and_analysis-data_management-persistent_identifier': 'Persistent Identifier',
'subcategory-processing_and_analysis-data_management-preservation': 'Preservation',
'subcategory-processing_and_analysis-data_management-publishing': 'Publishing',
'subcategory-processing_and_analysis-data_management-registration': 'Registration',
'subcategory-processing_and_analysis-data_management-transfer': 'Transfer',
'subcategory-processing_and_analysis-data_management-validation': 'Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-analysis': 'Analysis',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-characterisation': 'Characterisation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-maintenance_and_modification': 'Maintenance & Modification',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-other': 'Other',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-production': 'Production',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-testing_and_validation': 'Testing & Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-validation': 'Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-workflows': 'Workflows',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-accounting': 'Accounting',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-analysis': 'Analysis',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-billing': 'Billing',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-configuration': 'Configuration',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-coordination': 'Coordination',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-helpdesk': 'Helpdesk',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-monitoring': 'Monitoring',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-order_management': 'Order Management',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-other': 'Other',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-transportation': 'Transportation',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-utilities': 'Utilities',
'subcategory-security_and_operations-security_and_identity-certification_authority': 'Certification Authority',
'subcategory-security_and_operations-security_and_identity-coordination': 'Coordination',
'subcategory-security_and_operations-security_and_identity-firewall': 'Firewall',
'subcategory-security_and_operations-security_and_identity-group_management': 'Group Management',
'subcategory-security_and_operations-security_and_identity-identity_and_access_management': 'Identity & Access Management',
'subcategory-security_and_operations-security_and_identity-other': 'Other',
'subcategory-security_and_operations-security_and_identity-single_sign_on': 'Single Sign-On',
'subcategory-security_and_operations-security_and_identity-threat_protection': 'Threat Protection',
'subcategory-security_and_operations-security_and_identity-tools': 'Tools',
'subcategory-security_and_operations-security_and_identity-user_authentication': 'User Authentication',
'subcategory-sharing_and_discovery-applications-applications_repository': 'Applications Repository',
'subcategory-sharing_and_discovery-applications-business': 'Business',
'subcategory-sharing_and_discovery-applications-collaboration': 'Collaboration',
'subcategory-sharing_and_discovery-applications-communication': 'Communication',
'subcategory-sharing_and_discovery-applications-education': 'Education',
'subcategory-sharing_and_discovery-applications-other': 'Other',
'subcategory-sharing_and_discovery-applications-productivity': 'Productivity',
'subcategory-sharing_and_discovery-applications-social_networking': 'Social/Networking',
'subcategory-sharing_and_discovery-applications-utilities': 'Utilities',
'subcategory-sharing_and_discovery-data-clinical_trial_data': 'Clinical Trial Data',
'subcategory-sharing_and_discovery-data-data_archives': 'Data Archives',
'subcategory-sharing_and_discovery-data-epidemiological_data': 'Epidemiological Data',
'subcategory-sharing_and_discovery-data-government_and_agency_data': 'Government & Agency Data',
'subcategory-sharing_and_discovery-data-online_service_data': 'Online Service Data',
'subcategory-sharing_and_discovery-data-other': 'Other',
'subcategory-sharing_and_discovery-data-scientific_research_data': 'Scientific/Research Data',
'subcategory-sharing_and_discovery-data-statistical_data': 'Statistical Data',
'subcategory-sharing_and_discovery-data-metadata': 'Metadata',
'subcategory-sharing_and_discovery-development_resources-apis_repository_gateway': 'APIs Repository/Gateway',
'subcategory-sharing_and_discovery-development_resources-developer_tools': 'Developer Tools',
'subcategory-sharing_and_discovery-development_resources-other': 'Other',
'subcategory-sharing_and_discovery-development_resources-software_development_kits': 'Software Development Kits',
'subcategory-sharing_and_discovery-development_resources-software_libraries': 'Software Libraries',
'subcategory-sharing_and_discovery-development_resources-simulation_tools': 'Simulation Tools',
'subcategory-sharing_and_discovery-samples-biological_samples': 'Biological Samples',
'subcategory-sharing_and_discovery-samples-characterisation': 'Characterisation',
'subcategory-sharing_and_discovery-samples-chemical_compounds_library': 'Chemical Compounds Library',
'subcategory-sharing_and_discovery-samples-other': 'Other',
'subcategory-sharing_and_discovery-samples-preparation': 'Preparation',
'subcategory-sharing_and_discovery-scholarly_communication-analysis': 'Analysis',
'subcategory-sharing_and_discovery-scholarly_communication-assessment': 'Assessment',
'subcategory-sharing_and_discovery-scholarly_communication-discovery': 'Discovery',
'subcategory-sharing_and_discovery-scholarly_communication-other': 'Other',
'subcategory-sharing_and_discovery-scholarly_communication-outreach': 'Outreach',
'subcategory-sharing_and_discovery-scholarly_communication-preparation': 'Preparation',
'subcategory-sharing_and_discovery-scholarly_communication-publication': 'Publication',
'subcategory-sharing_and_discovery-scholarly_communication-writing': 'Writing',
'subcategory-sharing_and_discovery-software-libraries': 'Libraries',
'subcategory-sharing_and_discovery-software-other': 'Other',
'subcategory-sharing_and_discovery-software-platform': 'Platform',
'subcategory-sharing_and_discovery-software-software_package': 'Software Package',
'subcategory-sharing_and_discovery-software-software_repository': 'Software Repository',
'subcategory-training_and_support-consultancy_and_support-application_optimisation': 'Application Optimisation',
'subcategory-training_and_support-consultancy_and_support-application_porting': 'Application Porting',
'subcategory-training_and_support-consultancy_and_support-application_scaling': 'Application Scaling',
'subcategory-training_and_support-consultancy_and_support-audit_and_assessment': 'Audit & Assessment',
'subcategory-training_and_support-consultancy_and_support-benchmarking': 'Benchmarking',
'subcategory-training_and_support-consultancy_and_support-calibration': 'Calibration',
'subcategory-training_and_support-consultancy_and_support-certification': 'Certification',
'subcategory-training_and_support-consultancy_and_support-consulting': 'Consulting',
'subcategory-training_and_support-consultancy_and_support-methodology_development': 'Methodology Development',
'subcategory-training_and_support-consultancy_and_support-modeling_and_simulation': 'Modeling & Simulation',
'subcategory-training_and_support-consultancy_and_support-other': 'Other',
'subcategory-training_and_support-consultancy_and_support-prototype_development': 'Prototype Development',
'subcategory-training_and_support-consultancy_and_support-software_development': 'Software Development',
'subcategory-training_and_support-consultancy_and_support-software_improvement': 'Software Improvement',
'subcategory-training_and_support-consultancy_and_support-technology_transfer': 'Technology Transfer',
'subcategory-training_and_support-consultancy_and_support-testing': 'Testing',
'subcategory-training_and_support-education_and_training-in_house_courses': 'In-House Courses',
'subcategory-training_and_support-education_and_training-online_courses': 'Online Courses',
'subcategory-training_and_support-education_and_training-open_registration_courses': 'Open Registration Courses',
'subcategory-training_and_support-education_and_training-other': 'Other',
'subcategory-training_and_support-education_and_training-related_training': 'Related Training',
'subcategory-training_and_support-education_and_training-required_training': 'Required Training',
'subcategory-training_and_support-education_and_training-training_platform': 'Training Platform',
'subcategory-training_and_support-education_and_training-training_tool': 'Training Tool'}, 'service-ig': {},
'providers': {'eosc.ess': 'European Spallation Source ERIC', 'eosc.openaire': 'OpenAIRE',
'eosc.ierek': ' International Experts for Research Enrichment and Knowledge Exchange',
'eosc.centerdata': 'Centerdata',
'ni4os.ukim_fcse': 'University Ss. Cyril and Methodius, Faculty of Computer Science and Engineering',
'ni4os.sanu': 'Serbian Academy of Sciences and Arts', 'eosc.ds-wizard': 'Data Stewardship Wizard',
'eosc.ubi': 'Ubitech', 'eosc.eosc-dih': 'EOSC DIH - Digital Innovation Hub',
'eosc.vamdc': 'Virtual Atomic and Molecular Data Centre',
'eosc.dariah_eric': 'DARIAH ERIC (Digital Research Infrastructure for the Arts and Humanities)',
'eosc-nordic.rtu': 'Riga Technical University',
'eosc.vito': 'VITO NV (Vlaamse Instelling voor Technologisch Onderzoek NV)',
'eosc.unifl': 'University of Florence, DISIT lab', 'eosc.mi': 'Mandat International',
'eosc.lida': 'Lithuanian Data Archive for Social Sciences and Humanities',
'eosc.epos': 'European Plate Observing System', 'eosc.gbif-es': 'GBIF Spain',
'eosc.materialscloud': 'Materials Cloud', 'eosc.vilnius-university': 'Vilnius University',
'eosc.vecma': 'Verified Exascale Computing for Multiscale Applications', 'eosc.hn': 'Huma-Num',
'eosc.instruct-eric': 'Instruct-ERIC',
'eosc.bbmri-eric': 'Biobanking and BioMolecular resources Research Infrastructure European Research Infrastructure Consortium',
'eosc.cut_library': 'Cracow University of Technology. The Library',
'eosc.cnrsin2p3': ' Centre National de la Recherche Scientifique ',
'eosc.forschungsdaten': 'forschungsdaten.info', 'eosc.odatis': 'Pôle Odatis',
'eosc.cy-biobank': 'biobank.cy Center of Excellence in Biobanking and Biomedical Research, University of Cyprus',
'eosc.up': 'Ubiquity Press Ltd',
'eosc.ceric-eric': 'Central European Research Infrastructure Consortium',
'eosc.ccsd': 'Center for direct scientific communication',
'eosc.lnec': 'Laboratório Nacional de Engenharia Civil',
'eosc.t-systems': 'T-Systems International GmbH',
'eosc.icos_eric': 'Integrated Carbon Observation System European Research Infrastructure Consortium',
'eosc.srce': 'University of Zagreb University Computing Centre',
'eosc.crem': 'Centre de recherche Crem',
'eosc.carbonneutrallng': 'Horizon Europe Project Truly Carbon Neutral electricity enhanced Synthesis of Liquefied Natural Gas (LNG) from biomass',
'eosc.rb': 'Reportbrain Limited',
'ni4os.ibceb': 'Ivane Beritashvili Center of Experimental Biomedicine',
'eosc.ehealth_graz': 'Institute of eHealth', 'eosc.ku_leuven': 'KU Leuven',
'eosc.creatis': "Centre de Recherche en Acquisition et Traitement de l'Image pour la Santé",
'eosc.elixir-belgium': 'ELIXIR Belgium',
'eosc.earthwatch': 'Conservation Education and Research Trust',
'eosc.meeo': 'Meteorological Environmental Earth Observation', 'eosc.vib': 'VIB',
'eosc.inbelixir-es': 'INB: The Spanish National Bioinformatics Institute, the Spanish node for ELIXIR',
'eosc.iagos': 'In-service Aircraft for a Global Observing System AISBL',
'eosc-nordic.vu': 'Vilnius University',
'eosc.ifin-hh': 'Horia Hulubei National Institute for R&D in Physics and Nuclear Engineering',
'eosc.max_iv_laboratory': 'MAX IV Laboratory, Lund University',
'eosc.e-cam': 'E-CAM Centre of Excellence', 'eosc.scai': 'Fraunhofer SCAI',
'eosc.ehri': 'European Holocaust Research Infrastructure', 'eosc.rli': 'Reiner Lemoine Institute',
'eosc.expertai': 'expert.ai', 'eosc.sensing_clues': 'Sensing Clues Foundation',
'eosc.cerm-cirmmp': 'Magnetic Resonance Center of the University of Florence - CERM, Interuniversity consortium CIRMMP',
'eosc.rcisd': 'Regional Centre for Information and Scientific Development Ltd.',
'ni4os.brfaa': 'Biomedical Research Foundation, Academy of Athens',
'ni4os.ibiss': 'Institute for Biological Research Siniša Stanković, University of Belgrade',
'eosc.astron': 'NWO-I Netherlands Institute for Radio Astronomy (ASTRON)',
'eosc.bih_-_center_digital_health': 'Berlin Institute of Health at Charité Universitätsmedizin Berlin, Center of Digital Health ',
'eosc.net7': 'Net7 S.r.l.', 'eosc.csuc': 'Consorci de Serveis Universitaris de Catalunya',
'eosc.iasa': 'Institute of Accelerating Systems and Applications',
'eosc.elixir-italy': 'ELIXIR Italy',
'eosc.rolos': 'Rolos Machine Intelligence Platform for academia and business with Consulting and Applications',
'eosc.readcoop': 'READ-COOP SCE mit beschränkter Haftung',
'eosc.slices': 'Scientific Large Scale Infrastructure for Computing/Communication Experimental Studies',
'eosc.emphasis': 'European Infrastructure for Plant Phenotyping',
'eosc.usv': 'Stefan cel Mare University of Suceava', 'eosc.enhancer': 'EnhanceR',
'eosc.asgc': 'Academia Sinica Grid Computing Centre', 'eosc.msw': 'MyScienceWork',
'eosc.oipub': 'Omni Iota Science Limited',
'ni4os.ichtm': 'Institute of Chemistry, Technology and Metallurgy, University of Belgrade',
'eosc.surf-nl': 'SURF', 'eosc.esrf': 'European Synchrotron Radiation Facility',
'eosc.ensam': 'Arts et Metiers Institute of Technology',
'eosc.desy': 'Deutsches Elektronen-Synchrotron',
'eosc.ifremer': 'Ifremer, the French National Institute for Ocean Science',
'eosc.inria': 'Institut national de recherche en informatique et en automatique',
'eosc.gbif_portugal': 'Portuguese Node of GBIF',
'eosc.mobile_observation_integration_service': 'DDQ B.V.',
'eosc.awi_bremerhaven': 'Alfred Wegener Institute for Polar and Marine Research in cooperation with MARUM, Center for Marine Environmental Sciences',
'eosc.tib': 'Leibniz Information Centre for Science and Technology',
'eosc.obp': 'Open Book Publishers',
'eosc.diamond_light_source': 'Diamond Light Source Ltd.',
'eosc.kit-scc': 'KIT - Scientific Computing Center',
'eosc.sites': 'Swedish Infrastructure for Ecosystem Science',
'eosc.crg': 'Centre for Genomic Regulation',
'eosc.naes_of_ukraine': ' National Academy of Educational Sciences of Ukraine',
'eosc.soleil': 'Synchrotron SOLEIL', 'eosc.eiscat': 'EISCAT Scientific Association',
'eosc.teledyne': 'Teledyne Marine', 'eosc.uni-freiburg': 'University of Freiburg',
'eosc.lago': 'Latin American Giant Observatory',
'eosc.sios': 'The Svalbard Integrated Arctic Earth Observing System',
'eosc.upc': 'Universitat Politècnica de Catalunya',
'eosc.ess_eric': 'European Social Survey, European Research Infrastructure Consortium',
'eosc.arkivum': 'Arkivum Limited', 'eosc.enermaps': 'EnerMaps',
'eosc.cineca': 'Cineca Consorzio Interuniversitario', 'eosc.bi_insight': 'BI INSIGHT S.A.',
'eosc.embl-ebi': 'European Molecular Biology Laboratory - European Bioinformatics Institute',
'eosc.ifca-csic': 'Institute of Physics of Cantabria (IFCA)',
'eosc.kue': 'Krakow University of Economics, Main Library',
'eosc.ulb-sa': 'University and State Library of Saxony Anhalt',
'eosc-nordic.llu': 'Latvia University of Life Sciences and Technologies',
'eosc.fairmat': 'Consortium FAIRmat', 'eosc.authenix': 'Secure Dimensions GmbH',
'eosc.cnr-iia': 'Institute of Atmospheric Pollution Research - National Research Council of Italy',
'eosc.blue-cloud': 'Blue-Cloud - Piloting innovative services for Marine Research & the Blue Economy',
'eosc.upekrl': 'University of Physical Education in Krakow, Library',
'eosc.oxford_e-research_centre': 'Oxford e-Research Centre, University of Oxford, UK',
'eosc.fir': 'FIR e. V. at RWTH Aachen University', 'eosc.lab1100': 'LAB1100',
'eosc.capsh': 'Committee for the Accessibility of Publications in Sciences and Humanities',
'eosc.kit': 'Karlsruhe Institute of Technology',
'eosc.ciemat-tic': 'Scientific IT Research Activities and Knowledge, ICT Division, CIEMAT',
'eosc.operas': 'OPERAS AISBL',
'ni4os.grena': 'Georgian Research and Educational Networking Association',
'eosc.riga_stradins_university': 'Riga Stradins University',
'eosc.hostkey': 'HOSTKEY B.V. - Dedicated servers in Amsterdam DC', 'eosc.ubiwhere': 'Ubiwhere ',
'eosc.bsc-es': 'Barcelona Supercomputing Center - Centro Nacional de Supercomputación',
'eosc.euro-argo': 'Euro-Argo ERIC, the European contribution to Argo programme',
'eosc.cnag': 'Consorcio para la Explotación del Centro Nacional de Análisis Genómico',
'eosc.hzdr': 'Helmholtz-Zentrum Dresden-Rossendorf e.V.',
'eosc.eosc.grnet': 'National Infrastructures for Research and Technology',
'eosc.embrc-eric': 'European Marine Biological Resource Centre', 'eosc.dynaikon': 'DynAikon Limited',
'ni4os.nsl-ge': 'National Science Library at Tbilisi State University',
'eosc.ktu': 'Kaunas University of Technology', 'eosc.sj-ucp': 'Universidade Católica Portuguesa',
'eosc.gcc_umcg': 'Genomics Coordination Center, University Medical Center Groningen',
'eosc.psnc': 'Poznan Supercomputing and Networking Center',
'eosc.consorci_cee_lab_llum_sincrotro': 'CONSORCI PER A LA CONSTRUCCIÓ, EQUIPAMENT I EXPLOTACIÓ DEL LABORATORI DE LLUM SINCROTRÓ',
'eosc.ei': 'Earlham Institute', 'eosc.psi': 'Paul Scherrer Institute',
'eosc.seadatanet': 'SeaDataNet',
'eosc.uit': 'UiT The Arctic University of Norway', 'eosc.ukaea': 'UK Atomic Energy Authority',
'eosc.switch': 'SWITCH', 'eosc.bkh': 'Biodiversity Knowledge Hub',
'eosc.fzj': 'Forschungszentrum Jülich',
'eosc.grycap': 'Institute of Instrumentation for Molecular Imaging - Grid and High Performance Computing - Universitat Politècnica de València',
'eosc.infrafrontier': 'INFRAFRONTIER', 'eosc.siris_academic': 'SIRIS Academic SL',
'eosc.ill': 'Institut Laue Langevin',
'eosc.lindatclariah-cz': 'LINDAT/CLARIAH-CZ Research Infrastructure for Language Resources and Digital Arts and Humanities in the Czech Republic',
'eosc.mediprospectsai': 'MediprospectsAI ltd',
'eosc.coard': 'Collaborative Open Access Research and Development', 'eosc.elixir-europe': 'ELIXIR',
'eosc.jsc-de': 'Jülich Supercomputing Centre', 'eosc.fh_joanneum': 'FH JOANNEUM Gesellschaft mbH',
'eosc.dsmz': 'Leibniz Institute DSMZ - German Collection of Microorganisms and Cell Cultures',
'eosc.data_revenue': 'Data Revenue', 'eosc.openbiomaps': 'OpenBioMaps Consortium',
'eosc.edelweiss_connect': 'Edelweiss Connect GmbH', 'eosc.egi-fed': 'EGI Foundation',
'ni4os.ipb': 'Institute of Physics Belgrade', 'eosc.upf': 'Universitat Pompeu Fabra',
'eosc.infn': 'Italian National Institute of Nuclear Physics',
'eosc.sks': 'Scientific Knowledge Services', 'eosc.cds': 'Strasbourg astronomical Data Centre',
'eosc.geant': 'GÉANT Association',
'eosc.emso_eric': 'European Multidisciplinary Seafloor and water column Observatory',
'eosc.upv-es': 'Universitat Politècnica de València',
'eosc.csi_piemonte': 'Consorzio per il Sistema Informativo',
'eosc.bifi_-_unizar': 'Institute for Biocomputation and Physics of Complex Systems - University of Zaragoza',
'eosc.wenmr': 'A Worldwide e-Infrastructure for Structural Biology',
'eosc.bioexcel': 'BioExcel Centre of Excellence', 'eosc.ubora': 'UBORA association',
'ni4os.fcub': 'University of Belgrade - Faculty of Chemistry',
'eosc.coronis_computing_sl': 'CORONIS COMPUTING SL',
'eosc.jagiellonian_library': 'Jagiellonian University, Jagiellonian Library',
'eosc.data_centre': 'Centre for Data Analysis and Archiving',
'eosc.elettra_sincrotrone_trieste': 'George Kourousias',
'eosc.fairdi': 'FAIR Data Infrastructure for Physics, Chemistry, Materials Science, and Astronomy',
'eosc.embimos': 'EMBIMOS (Environmental and Sustainability Participatory Information Systems)',
'eosc.mz': 'Materials Zone',
'eosc.charite_bih_brain_simulation': 'Charité University Medicine Berlin, Berlin Institute of Health, Brain Simulation Section',
'eosc.ici_bucharest': 'National Institute for Research and Development in Informatics - ICI Bucharest',
'eosc.ibiom-cnrhttpwwwibiomcnrit': 'Institute of Biomembranes, Bioenergetics and Molecular Biotechnologies, National Research Council',
'eosc.bineo': 'Bineo Consulting SL', 'eosc.uniwersytet_opolski': 'University of Opole',
'eosc.oasees': 'Open autonomous programmable cloud apps & smart sensors', 'eosc.datacite': 'DataCite',
'eosc.idea': 'IDEAconsult', 'eosc.iict': 'Institute of Information and Communication Technologies',
'eosc.unibo': 'Alma Mater Studiorum - Università di Bologna',
'eosc.iasa_of_nasu': 'Institute for Applied System Analysis of the National Academy of Sciences of Ukraine',
'eosc.cyberbotics': 'Cyberbotics',
'eosc.cite': 'Communication & Information Technologies Experts SA Consulting and Development Services',
'eosc.gesis': 'GESIS Leibniz Institute for the Social Sciences', 'eosc.unipd': 'University of Padua',
'eosc.smartsmear': 'Institute for Atmospheric and Earth System Research',
'eosc.euro-bioimaging': 'Euro-BioImaging', 'eosc.gft': 'GFT Italy',
'eosc.cc-in2p3cnrs': 'Computing Centre of the National Institute of Nuclear Physics and Particle Physics, CNRS',
'eosc.ror-org': 'Research Organization Registry',
'eosc.bijvoetcenter': 'Bijvoet Centre - Utrecht University', 'eosc.d4science': 'D4Science',
'eosc.terradue': 'Terradue', 'eosc.gbif': 'Global Biodiversity Information Facility (GBIF)',
'eosc.csc-fi': 'CSC IT CENTER FOR SCIENCE',
'eosc.cesga': 'Fundacion Centro Tecnologico de Supercomputacion de Galicia',
'eosc.ubfzf': 'University of Belgrade Faculty of Philosophy',
'eosc.cines': 'National Computing Center for Higher Education',
'eosc.uni_konstanz': 'University of Konstanz', 'eosc.cesnet': 'CESNET', 'eosc.cs_group': 'CS GROUP',
'eosc.treeofscience': 'Tree of Science', 'eosc.cscs': 'Swiss National Supercomputing Centre',
'eosc.denbi': 'de.NBI - German Network for Bioinformatics Infrastructure',
'eosc.gwdg': 'Gesellschaft für wissenschaftliche Datenverarbeitung mbH Göttingen',
'eosc.sciences_po': 'Fondation Nationale des Sciences Politiques',
'eosc.cern': 'EUROPEAN ORGANIZATION FOR NUCLEAR RESEARCH',
'eosc.unibi-ub': 'Bielefeld University Library', 'eosc.sinergise': 'Sinergise',
'eosc.plantnet': 'PlantNet consortium (hosted by Inria)', 'eosc.exoscale': 'EXOSCALE',
'eosc.cmcc': 'Euro-Mediterranean Center on Climate Change',
'eosc.taltechdata': 'Tallinn University of Technology',
'eosc.tum-net': 'Technical University of Munich, Chair of Network Architectures and Services',
'eosc.cnio': 'CNIO - Spanish National Cancer Research Centre',
'eosc.hits': 'Heidelberg Institute for Theoretical Studies',
'eosc.zpid': 'Leibniz Institute for Psychology', 'eosc.fssda': 'Finnish Social Science Data Archive',
'eosc.ugr-es': 'University of Granada UGR',
'eosc.etais': 'Estonian Scientific Computing Infrastructure',
'eosc.inoe_2000': 'National Institute for Research and Development in Optoelectronics',
'eosc.northern_data_cloud_services': 'ND CS (Services) GmbH', 'eosc.eurac': 'Eurac Research',
'eosc.europeana': 'Europeana Foundation', 'eosc.kit-lib': 'KIT - Library',
'eosc.dkrz': 'Deutsches Klimarechenzentrum GmbH',
'eosc.predictia': 'Predictia Intelligent Data Solutions SL', 'eosc.scipedia': 'SCIPEDIA',
'ni4os.rbi': 'Ruđer Bošković Institute', 'eosc.jelastic': 'Virtuozzo',
'eosc.scigne': 'The SCIGNE Platform',
'eosc.ibergrid': 'IBERGRID - Iberian Distributed Computing Infrastructure',
'eosc.openedition': 'OpenEdition', 'eosc.norce': 'NORCE Norwegian Research Centre',
'eosc.lsd-ufcg': 'Laboratório de Sistemas Distribuídos - Universidade Federal de Campina Grande',
'eosc.sethsoftware': 'Seth Software spółka z ograniczoną odpowiedzialnością',
'eosc.gsi': 'GSI Helmholtzzentrum für Schwerionenforschung GmbH',
'eosc.incd': 'Portuguese National Distributed Computing Infrastructure (INCD)',
'eosc.iisas': 'Institute of Informatics - Slovak Academy of Sciences ',
'eosc.100percentit': '100 Percent IT', 'eosc.f6snl': 'F6S Network',
'eosc.trust-it': 'Trust-IT Services',
'eosc.eodc': 'Earth Observation Data Centre for Water Resources Monitoring',
'ni4os.uob-rcub': 'University of Belgrade Computer Centre',
'eosc.unige': 'University of Geneva, Department of Astronomy',
'eosc.leaena': 'National Technical University of Athens', 'eosc.doabf': 'DOAB Foundation',
'eosc.rbi': 'Ruđer Bošković Institute', 'eosc.sobigdata': 'SoBigData',
'eosc.progedo': 'PROduction et GEstion des DOnnées',
'eosc.isa-ulisboa': 'Instituto Superior de Agronomia da Universidade de Lisboa',
'eosc.openknowledgemaps': 'Open Knowledge Maps - Verein zur Förderung der Sichtbarkeit wissenschaftlichen Wissens',
'eosc.fau_evt': 'Friedrich-Alexander-University Erlangen-Nürnberg, Chair of Energy Process Engineering',
'eosc.nikhef': 'Nikhef (Stichting Nederlandse Wetenschappelijk Onderzoek Instituten)',
'eosc.charles_university': 'Charles University', 'eosc.dcc-uk': 'Digital Curation Centre',
'eosc.it4i_vsb-tuo': 'VSB Technical University of Ostrava, IT4Innovations National Supercomputing Center',
'eosc.mundi_web_services': 'Mundi Web Services',
'eosc.gdansk_tech': 'Gdańsk University of Technology',
'eosc.bg_up': 'Pedagogical University of Krakow, Main Library', 'eosc.figshare': 'Figshare',
'eosc.libnova': 'LIBNOVA SL', 'eosc.pml': 'Plymouth Marine Laboratory',
'eosc.eox': 'EOX IT Services GmbH', 'eosc.dtu': 'Technical University of Denmark',
'eosc.european_xfel': 'European X-ray Free Electron Laser Facility GmbH ',
'eosc.cyfronet': 'Academic Computer Centre CYFRONET AGH',
'eosc.progressive': 'Progressive Systems Srl',
'eosc.ipsl': 'Institut Pierre-Simon Laplace',
'ni4os.grnet': 'National Infrastructures for Research and Technology',
'eosc-nordic.uot': 'University of Tartu', 'eosc.sztaki': 'INSTITUTE FOR COMPUTER SCIENCE AND CONTROL',
'eosc.cnr_-_isti': 'Institute for Information Science and Technologies "Alessandro Faedo" - ISTI',
'eosc.cbra': 'Clinical Bioinformatics Area', 'eosc.beia': 'BEIA CONSULT INTERNATIONAL',
'eosc.slu': 'Swedish University of Agricultural Sciences', 'eosc.elcogen': 'Elcogen Oy',
'eosc.enoll': 'European Network of Living Labs', 'eosc.inode': 'Intelligent Open Data Exploration',
'eosc.creaf': 'Center for Research in Ecology and Forestry Applications',
'eosc.csic': 'Consejo Superior de Investigaciones Científicas (CSIC)',
'eosc.athena': 'Athena Research and Innovation Center in Information and Communication Technologies',
'eosc.carlzeissm': 'Carl Zeiss Microscopy', 'eosc.unimib': 'University of Milano-Bicocca',
'eosc.ukri_-_stfc': 'UK Research and Innovation - Science and Technology Facilities Council',
'eosc.niod': 'NIOD Institute for War, Genocide and Holocaust Studies',
'eosc.cloudferro': 'CloudFerro',
'eosc.vliz': 'Flanders Marine Institute', 'eosc.unitartu': 'University of Tartu',
'eosc.lu': 'Lund University',
'eosc.clarin-eric': 'European Research Infrastructure for Language Resources and Technology',
'eosc.ekt': 'National Documentation Centre', 'eosc.digifarm': 'DigiFarm',
'eosc.inaf': 'Istituto Nazionale di Astrofisica',
'eosc.altec': 'Aerospace Logistics Technology Engineering Company',
'eosc.hu-cms': 'Humboldt-Universität zu Berlin - Computer- und Medienservice',
'eosc.agh_university_main_library': 'AGH University of Krakow Main Library ',
'eosc.ictlc': 'ICTLC S.P.A.', 'eosc.transcript': 'transcript Independent Academic Publishing ',
'eosc.elixir-uk': 'ELIXIR United Kingdom',
'eosc.acdh-ch': 'Austrian Centre for Digital Humanities and Cultural Heritage',
'eosc.tubitak_ulakbim': 'Turkish Academic Network and Information Center', 'eosc.sixsq': 'SixSq',
'eosc.fzj-inm7': 'Forschungszentrum Jülich, Institute of Neurosciences and Medicine (INM) Brain and Behavior (INM-7)',
'eosc.forth': 'Foundation for Research and Technology, Hellas (FORTH)',
'eosc.grnet': 'National Infrastructures for Research and Technology',
'eosc.prace': 'Partnership For Advanced Computing in Europe aisbl',
'eosc.umr_map': 'UMR CNRS/MC 3495 MAP', 'eosc.fris': 'Flemish Research Information Space',
'eosc.komanord': 'Koma Nord', 'eosc.unparallel': 'UNPARALLEL Innovation, Lda',
'eosc.lifewatch-eric': 'LifeWatch ERIC', 'eosc.university_of_sussex': 'The University of Sussex',
'eosc.cnb-csic': 'Centro Nacional de Biotecnologia (CSIC)', 'eosc.elsevier': 'Elsevier BV',
'eosc.eudat': 'EUDAT', 'eosc.nilu': 'The Foundation NILU',
'eosc.oslo_university': 'University of Oslo',
'eosc.uo': 'University of Oulu', 'eosc.lapp': "Laboratoire d'Annecy de Physique des Particules",
'eosc.cessda-eric': 'Consortium of European Social Science Data Archives ERIC',
'eosc.olos': 'OLOS Association', 'eosc.obsparis': 'Observatoire de Paris'}, 'guideline_type': {
'ir_eosc_guideline_type-eosc_core_interoperability_guideline': 'EOSC-Core Interoperability Guideline',
'ir_eosc_guideline_type-eosc_exchange_interoperability_guideline_thematic': 'EOSC-Exchange Interoperability Guideline (Thematic)',
'ir_eosc_guideline_type-eosc_exchange_interoperability_guideline_horizontal': 'EOSC-Exchange Interoperability Guideline (Horizontal)',
'ir_eosc_guideline_type-operational_baseline': 'Operational Baseline'},
'tr_access': {'tr_access_right-open_access': 'Open Access',
'tr_access_right-restricted_access': 'Restricted Access',
'tr_access_right-metadata_only_access': 'Metadata Only Access',
'tr_access_right-paid_access': 'Paid Access'},
'subdomains': {
'scientific_subdomain-agricultural_sciences-agricultural_biotechnology': 'Agricultural Biotechnology',
'scientific_subdomain-agricultural_sciences-agriculture_forestry_and_fisheries': 'Agriculture, Forestry & Fisheries',
'scientific_subdomain-agricultural_sciences-animal_and_dairy_sciences': 'Animal & Dairy Sciences',
'scientific_subdomain-agricultural_sciences-other_agricultural_sciences': 'Other Agricultural Sciences',
'scientific_subdomain-agricultural_sciences-veterinary_sciences': 'Veterinary Sciences',
'scientific_subdomain-engineering_and_technology-chemical_engineering': 'Chemical Engineering',
'scientific_subdomain-engineering_and_technology-civil_engineering': 'Civil Engineering',
'scientific_subdomain-engineering_and_technology-electrical_electronic_and_information_engineering': 'Electrical, Electronic & Information Engineering',
'scientific_subdomain-engineering_and_technology-environmental_biotechnology': 'Environmental Biotechnology',
'scientific_subdomain-engineering_and_technology-environmental_engineering': 'Environmental Engineering',
'scientific_subdomain-engineering_and_technology-industrial_biotechnology': 'Industrial Biotechnology',
'scientific_subdomain-engineering_and_technology-materials_engineering': 'Materials Engineering',
'scientific_subdomain-engineering_and_technology-mechanical_engineering': 'Mechanical Engineering',
'scientific_subdomain-engineering_and_technology-medical_engineering': 'Medical Engineering',
'scientific_subdomain-engineering_and_technology-nanotechnology': 'Nanotechnology',
'scientific_subdomain-engineering_and_technology-other_engineering_and_technology_sciences': 'Other Engineering & Technology Sciences',
'scientific_subdomain-generic-generic': 'Generic', 'scientific_subdomain-humanities-arts': 'Arts',
'scientific_subdomain-humanities-history_and_archaeology': 'History & Archaeology',
'scientific_subdomain-humanities-languages_and_literature': 'Languages & Literature',
'scientific_subdomain-humanities-other_humanities': 'Other Humanities',
'scientific_subdomain-humanities-philosophy_ethics_and_religion': 'Philosophy, Ethics & Religion',
'scientific_subdomain-medical_and_health_sciences-basic_medicine': 'Basic Medicine',
'scientific_subdomain-medical_and_health_sciences-clinical_medicine': 'Clinical Medicine',
'scientific_subdomain-medical_and_health_sciences-health_sciences': 'Health Sciences',
'scientific_subdomain-medical_and_health_sciences-medical_biotechnology': 'Medical Biotechnology',
'scientific_subdomain-medical_and_health_sciences-other_medical_sciences': 'Other Medical Sciences',
'scientific_subdomain-natural_sciences-biological_sciences': 'Biological Sciences',
'scientific_subdomain-natural_sciences-chemical_sciences': 'Chemical Sciences',
'scientific_subdomain-natural_sciences-computer_and_information_sciences': 'Computer & Information Sciences',
'scientific_subdomain-natural_sciences-earth_and_related_environmental_sciences': 'Earth & Related Environmental Sciences',
'scientific_subdomain-natural_sciences-mathematics': 'Mathematics',
'scientific_subdomain-natural_sciences-other_natural_sciences': 'Other Natural Sciences',
'scientific_subdomain-natural_sciences-physical_sciences': 'Physical Sciences',
'scientific_subdomain-other-other': 'Other',
'scientific_subdomain-social_sciences-economics_and_business': 'Economics & Business',
'scientific_subdomain-social_sciences-educational_sciences': 'Educational Sciences',
'scientific_subdomain-social_sciences-law': 'Law',
'scientific_subdomain-social_sciences-media_and_communications': 'Media & Communications',
'scientific_subdomain-social_sciences-other_social_sciences': 'Other Social Sciences',
'scientific_subdomain-social_sciences-political_sciences': 'Political Sciences',
'scientific_subdomain-social_sciences-psychology': 'Psychology',
'scientific_subdomain-social_sciences-social_and_economic_geography': 'Social & Economic Geography',
'scientific_subdomain-social_sciences-sociology': 'Sociology'},
'access_type': {'access_type-mail_in': 'Mail-In', 'access_type-other': 'Other', 'access_type-physical': 'Physical',
'access_type-remote': 'Remote', 'access_type-virtual': 'Virtual'},
'expertise_level': {'tr_expertise_level-advanced': 'Advanced', 'tr_expertise_level-intermediate': 'Intermediate',
'tr_expertise_level-beginner': 'Beginner', 'tr_expertise_level-all': 'All'},
'tr_content': {'tr_content_resource_type-animation': 'Animation', 'tr_content_resource_type-audio': 'Audio',
'tr_content_resource_type-diagram': 'Diagram', 'tr_content_resource_type-game': 'Game',
'tr_content_resource_type-image': 'Image', 'tr_content_resource_type-multimedia': 'Multimedia',
'tr_content_resource_type-poster': 'Poster', 'tr_content_resource_type-slides': 'Slides',
'tr_content_resource_type-text': 'Text', 'tr_content_resource_type-video': 'Video',
'tr_content_resource_type-website': 'Website', 'tr_content_resource_type-other': 'Other'},
'domains': {'scientific_domain-agricultural_sciences': 'Agricultural Sciences',
'scientific_domain-engineering_and_technology': 'Engineering & Technology',
'scientific_domain-generic': 'Generic', 'scientific_domain-humanities': 'Humanities',
'scientific_domain-medical_and_health_sciences': 'Medical & Health Sciences',
'scientific_domain-natural_sciences': 'Natural Sciences', 'scientific_domain-other': 'Other',
'scientific_domain-social_sciences': 'Social Sciences'},
'tr_dcmi': {'tr_dcmi_type-activity_plan': 'Activity Plan', 'tr_dcmi_type-assessment': 'Assessment',
'tr_dcmi_type-assessment_item': 'Assessment Item',
'tr_dcmi_type-educator_curriculum_guide': 'Educator Curriculum Guide',
'tr_dcmi_type-lesson_plan': 'Lesson Plan',
'tr_dcmi_type-physical_learning_resource': 'Physical Learning Resource',
'tr_dcmi_type-recorded_lesson': 'Recorded Lesson',
'tr_dcmi_type-supporting_document': 'Supporting Document', 'tr_dcmi_type-textbook': 'Textbook',
'tr_dcmi_type-unit_plan': 'Unit Plan', 'tr_dcmi_type-other': 'Other'},
'funding_program': {'funding_program-afis2020': 'Anti Fraud Information System (AFIS2020)',
'funding_program-agr': 'European Agricultural Guarantee Fund (after transfers between EAGF and EAFRD) (AGR)',
'funding_program-agrnet': 'Net transfer between EAGF and EAFRD (AGRNET)',
'funding_program-amf': 'Asylum, Migration and Integration Fund (AMF)',
'funding_program-cdf2020': 'Rights, equality and citizenship programme (CDF2020)',
'funding_program-cef': 'Connecting Europe Facility (CEF)',
'funding_program-cf': 'Cohesion Fund (CF)',
'funding_program-cf_det': 'Contribution from the Cohesion Fund to the CEF programme (CF_DET)',
'funding_program-cfsp': 'Common foreign and security policy (CFSP2020)',
'funding_program-cit2020': 'Europe for Citizens (CIT2020)',
'funding_program-compreg': 'Competitiveness (more developed regions) (COMPREG)',
'funding_program-cons': 'Consumer programme (CONS)',
'funding_program-copernicus': 'European Earth Observation Programme (COPERNICUS)',
'funding_program-cosme': 'Programme for the competitiveness of enterprises and small and medium-sized enterprises (COSME)',
'funding_program-cpm_h3': 'Union Civil Protection Mechanism — Member States (CPM_H3)',
'funding_program-cpm_h4': 'Union Civil Protection Mechanism — Outside EU (CPM_H4)',
'funding_program-crea': 'Creative Europe programme (CREA)',
'funding_program-cust2020': 'Action programme for customs in the European Union (CUST 2020)',
'funding_program-dci2020': 'Development Cooperation Instrument (DCI2020)',
'funding_program-e4a': 'The Union programme for education, training, youth and sport (Erasmus+) (E4A)',
'funding_program-eafrd': 'European Agricultural Fund for Rural Development (after transfers between EAGF and EAFRD) (EAFRD)',
'funding_program-eafrd2020': 'European Agricultural Fund for Rural Development (EAFRD2020)',
'funding_program-eagf2020': 'European Agricultural Guarantee Fund (EAGF2020)',
'funding_program-ear2020': 'Emergency Aid Reserve (EAR2020)',
'funding_program-eerp': 'Energy projects to aid economic recovery (EERP)',
'funding_program-efsd': 'European Fund for Sustainable Development (EFSD)',
'funding_program-efsi': 'European Fund for Strategic Investments (EFSI)',
'funding_program-egf2020': 'European Globalisation Adjustment Fund (EGF2020)',
'funding_program-eidhr2020': 'European Instrument for Democracy and Human Rights (EIDHR2020)',
'funding_program-emff2020': 'European Maritime and Fisheries Fund (EMFF2020)',
'funding_program-eni': 'European Neighbourhood Instrument (ENI)',
'funding_program-erdf': 'European Regional Development Fund (ERDF)',
'funding_program-esc': 'European Solidarity Corps (ESC)',
'funding_program-esf': 'European Social Fund (ESF)',
'funding_program-esp2017': 'European statistical programme (ESP2017)',
'funding_program-esp2020': 'European statistical programme (ESP2020)',
'funding_program-euav': 'EU Aid Volunteers initiative (EUAV)',
'funding_program-euratom': 'Euratom research and training programme (EURATOM)',
'funding_program-eurodac2020': 'Comparison of fingerprints for the effective application of the Dublin Convention (EURODAC2020)',
'funding_program-eusf2020': 'European Union Solidarity Fund (EUSF2020)',
'funding_program-eusf_h3': 'European Union Solidarity Fund (EUSF) — Member States (EUSF_H3)',
'funding_program-eusf_h4': 'European Union Solidarity Fund (EUSF) — Countries negotiating for accession (EUSF_H4)',
'funding_program-fead': 'Fund for European Aid to the Most Deprived (FEAD)',
'funding_program-ff2020': 'Food and feed (FF2020)',
'funding_program-finser2020': 'Specific activities in the field of financial reporting and auditing (FINSER2020)',
'funding_program-fisc2020': 'Action programme for taxation in the European Union (FISC2020)',
'funding_program-gal2014': 'Implementation and exploitation of European satellite navigation systems (EGNOS and Galileo) (GAL2014)',
'funding_program-grld2020': 'EU cooperation with Greenland (GRLD2020)',
'funding_program-h2020': 'The framework programme for research and innovation (H2020)',
'funding_program-health': "Union's action in the field of health (Health programme) (HEALTH)",
'funding_program-herc3': "Programme to promote activities in the field of the protection of the European Union's financial interests (HERC3)",
'funding_program-hfr2015': 'Supplementary high flux reactor (HFR) programmes (HFR2015)',
'funding_program-huma2020': 'Humanitarian aid (HUMA2020)',
'funding_program-icfs': 'Enhancing consumers involvement in EU policy making in the field of financial services (ICFS)',
'funding_program-ies': 'Instrument for emergency support within the Union (IES)',
'funding_program-ifs2020': 'Instrument contributing to Stability and Peace (IFS2020)',
'funding_program-insc2020': 'Instrument for Nuclear Safety Cooperation (INSC2020)',
'funding_program-ipa2': 'Instrument for Pre-accession Assistance (IPA2)',
'funding_program-isa2015': 'Interoperability Solutions for European Public Administrations (ISA2015)',
'funding_program-isa2020': 'Interoperability Solutions for European public administrations, businesses and citizens (ISA2020)',
'funding_program-isf': 'Internal Security Fund (ISF)',
'funding_program-iter': 'International thermonuclear experimental reactor (ITER)',
'funding_program-just': 'Justice programme (JUST)',
'funding_program-life2020': 'Programme for the Environment and Climate Action (LIFE2020)',
'funding_program-loan2020': 'Guarantee Fund for external actions (LOAN2020)',
'funding_program-mfa': 'Macro financial assistance (MFA)',
'funding_program-nd': 'Nuclear decommissioning assistance programmes in Bulgaria, Lithuania and Slovakia (ND)',
'funding_program-other': 'Other',
'funding_program-outreg': 'Outermost and sparsely populated regions (OUTREG)',
'funding_program-peri2020': 'Exchange, assistance and training programme for the protection of the euro against counterfeiting (PERI2020)',
'funding_program-pi': 'Partnership instrument for cooperation with third countries (PI)',
'funding_program-psci': 'European Union programme for employment and social innovation (PSCI)',
'funding_program-regconv': 'Regional convergence (REGCONV)',
'funding_program-rfmos': 'Compulsory contributions to regional fisheries management organisations (RFMOs) and to other international organisations',
'funding_program-sfpas': 'Sustainable Fisheries Partnership Agreements (SFPAs)',
'funding_program-sis2020': 'Schengen Information System (SIS2020)',
'funding_program-ta_ia': 'Technical assistance and innovative actions (TA_IA)',
'funding_program-tcc': 'Instrument of financial support for encouraging the economic development of the Turkish Cypriot community (TCC)',
'funding_program-terrcoop': 'European territorial cooperation (TERRCOOP)',
'funding_program-transreg': 'Transition regions (TRANSREG)',
'funding_program-vis2020': 'Visa Information System (VIS2020)',
'funding_program-yei': 'Youth employment initiative (specific top-up allocation) (YEI)',
'funding_program-lripmeys': 'Large Research Infrastructures Programme of the MEYS, Czech Republic',
'funding_program-ddoict': 'Development, deployment and operation of ICT-based e-infrastructures',
'funding_program-nucleu': 'NUCLEU Programme (Romania)',
'funding_program-driltah': 'LINDAT/CLARIAH-CZ Digital Research Infrastructure for the Language Technologies, Arts and Humanities (LM2018101)',
'funding_program-esaeoep': 'ESA EO Exploitation Platforms initiative'},
'order_type': {'order_type-fully_open_access': 'Fully Open Access', 'order_type-open_access': 'Open Access',
'order_type-order_required': 'Order Required', 'order_type-other': 'Other'}, 'related_resource': {},
'related_resources': {}}

23
airflow/dags/dag_utils.py Normal file
View File

@ -0,0 +1,23 @@
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
def get_opensearch_client(kwargs) -> OpenSearch:
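# Build an OpenSearch client from the Airflow connection named by the OPENSEARCH_CONN_ID DAG param; TLS certificate verification is disabled.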
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
return OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
def get_bucket_name(context: dict, hook: S3Hook, param_name: str):
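# Resolve the target bucket: use the DAG param when set, otherwise fall back to the bucket_name configured on the S3 connection.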
bucket_name = context["params"][param_name]
if not bucket_name:
bucket_name = hook.extra_args['bucket_name']
return bucket_name

View File

@ -0,0 +1,43 @@
import os
from datetime import timedelta
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"url": "File to download",
"dst_key": "key containing the file",
"dst_bucket": "bucket that will contain file"
},
tags=["s3"],
)
def download_to_s3():
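# Single-task DAG: stream the file at params["url"] straight into s3://<dst_bucket>/<dst_key> without buffering it to local disk.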
@task
def download(**context):
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
with requests.get(context["params"]["url"], stream=True) as r:
r.raise_for_status()
hook.load_file_obj(r.raw, context["params"]["dst_key"], bucket_name=context["params"]["dst_bucket"], replace=True, encrypt=False)
download()
download_to_s3()

View File

@ -0,0 +1,218 @@
from __future__ import annotations
import os
from datetime import timedelta
import opensearchpy
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.utils.helpers import chain
from opensearchpy import OpenSearch, helpers
from catalogue.RawCatalogOpensearch import RawCatalogOpensearch
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
dag_id="import_Catalogue",
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"SHARDS": 3,
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
tags=["lot1"]
)
def import_catalogue_entities():
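# Pipeline: drop any leftover suffixed indices, harvest the raw catalogue records, map them to the search schema, then publish them via aliases.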
@task
def create_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
for entity in RawCatalogOpensearch.entities:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if client.indices.exists(indexname):
client.indices.delete(indexname)
@task
def harvest_indexes(**kwargs):
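# Page through the eic-registry REST API for each entity and bulk-index the raw records into the suffixed index.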
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
session = requests.session()
for entity in RawCatalogOpensearch.entities:
indexname = catalog.get_index(entity)
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
callurl = f"{baseurl}/{entity}"
params = {"draft": "false", "active": "true", "suspended": "false"}
if client.indices.exists(indexname):
client.indices.delete(indexname)
while True:
reply = session.get(url=callurl, params=params)
reply.raise_for_status()
content = reply.json()
if 'results' not in content:
break
results = content['results']
if len(results) <= 0:
break
def streamed_results():
for r in results:
yield {"_index": indexname, "_id": r['id'], "_source": r}
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
# end of stream conditions
if content['to'] >= content['total']:
break
params['from'] = content['to']
client.indices.refresh(indexname)
@task
def map_indexes(**kwargs):
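# Scan every raw record and convert it with the entity-specific mapper from RawCatalogOpensearch, writing the result into the mapped index.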
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
for entity in RawCatalogOpensearch.mapped_entities:
mapped_index = catalog.get_mapped_index(entity)
if client.indices.exists(mapped_index):
client.indices.delete(mapped_index)
def streamed_results():
for hit in opensearchpy.helpers.scan(client,
index=catalog.get_index(entity),
query={"query": {"match_all": {}}}):
r = hit['_source']
doc = None
match entity:
case "interoperability-records":
doc = catalog.map_interoperability(r)
case "training-resources":
doc = catalog.map_training(r)
case "services":
doc = catalog.map_service(r)
yield {"_index": mapped_index, "_id": doc['id'], "_source": doc}
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
print(f"Entity: {entity} succes: {success} error: {failed}")
@task
def close_indexes(**kwargs):
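# Refresh the new indices, restore replica and refresh-interval settings, and repoint the per-entity and "allresources" aliases.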
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
def refresh_index(index_name):
if index_name is not None:
client.indices.refresh(index_name)
client.indices.put_settings(index=index_name, body={
"index": {
"number_of_replicas": 1,
"refresh_interval": "60s",
}
})
def update_aliases(index_name, alias_name):
if index_name is not None and alias_name is not None:
client.indices.update_aliases(
body={"actions": [
{"remove": {"index": f"{alias_name}_*", "alias": alias_name}},
{"add": {"index": index_name, "alias": alias_name}},
]}
)
for entity in RawCatalogOpensearch.entities:
refresh_index(catalog.get_index(entity))
refresh_index(catalog.get_mapped_index(entity))
update_aliases(catalog.get_index(entity), catalog.get_alias(entity))
update_aliases(catalog.get_mapped_index(entity), catalog.get_mapped_alias(entity))
# update "allresources" alias with mapped indices
actions = []
for entity in RawCatalogOpensearch.mapped_entities:
index_name = catalog.get_mapped_index(entity)
entity_alias = catalog.get_mapped_alias(entity)
actions.append({"remove": {"index": f"{entity_alias}_*", "alias": "allresources"}})
actions.append({"add": {"index": index_name, "alias": "allresources"}})
if len(actions) > 0:
client.indices.update_aliases(
body={"actions": actions}
)
chain(
create_indexes.override(task_id="create_indexes")(),
harvest_indexes.override(task_id="harvest_indexes")(),
map_indexes.override(task_id="map_indexes")(),
close_indexes.override(task_id="close_indexes")()
)
import_catalogue_entities()

View File

@ -0,0 +1,317 @@
from __future__ import annotations
import codecs
import gzip
import io
import json
import logging
import os
from datetime import timedelta
from airflow.exceptions import AirflowException
from kubernetes.client import models as k8s
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers
from EOSC_indexes import mappings
from EOSC_entity_trasform import filter_entities, transform_entities
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
configs = {
"all": {"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues", "interoperability", "services", "training"]},
"skg-if": {"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues"]},
"catalogue": {"ENTITIES": ["interoperability", "services", "training"]},
}
for config_name, config in configs.items():
dag_id = f"import_EOSC_{config_name}"
@dag(
dag_id=dag_id,
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"S3_CONN_ID": "s3_conn",
"OPENSEARCH_CONN_ID": "opensearch_default",
"KEY_PREFIX": "/",
"EOSC_CATALOG_BUCKET": "eosc-portal-import",
"BATCH_LOADERS_NUM": 10,
"ENTITIES": config["ENTITIES"],
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
tags=["lot1"]
)
def import_EOSC_entities():
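# Recreate the suffixed indices, pre-merge curation statuses, bulk-load the S3 dumps in parallel batches,
# drop curation-only stubs, then publish the new indices by switching aliases.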
@task
def create_indexes(**kwargs):
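# Recreate each suffixed index with bulk-load friendly settings (no replicas, refresh disabled, zstd codec, segment replication) and the entity mapping.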
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
client.cluster.put_settings(body={
"persistent": {
"cluster.routing.allocation.balance.prefer_primary": True,
"segrep.pressure.enabled": True
}
})
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if client.indices.exists(indexname):
client.indices.delete(indexname)
client.indices.create(indexname, {
"settings": {
"index": {
"number_of_shards": 40,
"number_of_replicas": 0,
"refresh_interval": -1,
"translog.flush_threshold_size": "2048MB",
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
}
},
"mappings": mappings[entity]
})
def compute_batches(ds=None, **kwargs):
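# List the input objects per entity on S3, remove stale .PROCESSED markers, and split the (entity, key) pairs into batches for the parallel loaders.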
hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
pieces = []
for entity in kwargs["params"]["ENTITIES"]:
s3_path = os.path.normpath(kwargs["params"]["KEY_PREFIX"] + "/" + entity + "/")
keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=s3_path)
to_delete = list(filter(lambda key: key.endswith('.PROCESSED'), keys))
for obj in to_delete:
hook.get_conn().delete_object(Bucket=kwargs["params"]["EOSC_CATALOG_BUCKET"], Key=obj)
for key in keys:
if key.endswith(('.json.gz', '.json')):
pieces.append((entity, key))
def split_list(list_a, chunk_size):
for i in range(0, len(list_a), chunk_size):
yield {"files": list_a[i:i + chunk_size]}
if len(pieces) <= 0:
print("Nothing found in: " + kwargs["params"]["KEY_PREFIX"])
return list()
num_batches = len(pieces)//kwargs["params"]["BATCH_LOADERS_NUM"]
if num_batches > 0:
return list(split_list(pieces, num_batches))
return list(split_list(pieces, len(pieces)))
@task(executor_config={
"pod_override": k8s.V1Pod(
spec=k8s.V1PodSpec(
containers=[
k8s.V1Container(
name="base",
resources=k8s.V1ResourceRequirements(
requests={
"cpu": "550m",
"memory": "256Mi"
}
)
)
]
)
)
})
def bulk_load(files: list[tuple[str, str]], **kwargs):
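# For each (entity, key) pair: stream the (optionally gzipped) JSON-lines file from S3, apply per-entity transforms
# and filters, bulk-upsert into the suffixed index, then mark the key with a .PROCESSED object; failed files are retried up to 5 times.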
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180,
request_timeout=5*60
)
hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
retries = 0
while len(files) > 0 and retries < 5:
retries += 1
retry_files = []
for (entity, key) in files:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if hook.check_for_key(key=f"{key}.PROCESSED", bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"]):
print(f'Skipping {entity}: {key}')
continue
print(f'Processing {indexname}: {key}')
s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
with gzip.GzipFile(fileobj=s3_obj.get()["Body"], mode='rb') if key.endswith(".gz") else codecs.getreader('utf-8')(s3_obj.get()["Body"]) as s3file:
def _generate_data():
for line in s3file:
data: dict = json.loads(line)
if entity in transform_entities:
data = transform_entities[entity](data)
if entity in filter_entities:
if filter_entities[entity](data):
print(data["local_identifier"] + " does not meet inclusion policies")
continue
index = {"update": {"_index": indexname, "_id": data.pop("_id")}}
yield index, {"doc": data, "doc_as_upsert": True}
# silence per-request success logging from the opensearch client
logging.getLogger("opensearch").setLevel(logging.WARN)
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
expand_action_callback=lambda arg: arg,
raise_on_exception=False,
raise_on_error=False,
chunk_size=5000,
max_chunk_bytes=50 * 1024 * 1024,
timeout=5*60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
print(f"Bulk report: inserted {succeeded} items, {failed} failures, {retries} tentative")
if failed > 0:
retry_files.append((entity, key))
else:
hook.load_string(
"",
f"{key}.PROCESSED",
bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"],
replace=False
)
files = retry_files # retry files with errors
# Files that still failed after all retry attempts abort the task
if len(files) > 0:
raise AirflowException("ERROR could not import all items from: " + str(files))
@task
def merge_curation_db(**kwargs):
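# Copy the curation "status" field into the freshly loaded products index with a server-side reindex (only when products are among the imported entities).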
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
if "products" in kwargs["params"]["ENTITIES"]:
products_index = f'products_{kwargs["params"]["SUFFIX"]}'
curationdb_index = 'curation'
if client.indices.exists(curationdb_index):
client.reindex(body={
"source": {
"index": curationdb_index,
"_source": ["status"]
},
"dest": {
"index": products_index
}
},
refresh=False,
requests_per_second=-1,
scroll="4h",
slices="auto",
timeout=60*60*4,
wait_for_completion=True)
@task
def delete_missing_curated(**kwargs):
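# Remove documents that came only from the curation index: they carry no local_identifier from the import.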
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
if "products" in kwargs["params"]["ENTITIES"]:
products_index = f'products_{kwargs["params"]["SUFFIX"]}'
client.indices.refresh(products_index)
client.delete_by_query(index=products_index,
body={"query": {"bool": {"must_not": {"exists": {"field": "local_identifier"}}}}},
refresh=True
)
@task
def close_indexes(**kwargs):
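# Refresh the suffixed indices and switch the per-entity and "allresources" aliases to point at them.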
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.refresh(indexname)
# update aliases
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.update_aliases(
body={"actions": [
{"remove": {"index": f"{entity}_*", "alias": entity}},
{"add": {"index": indexname, "alias": entity}},
]}
)
# update "allresources" alias
actions = []
for entity in kwargs["params"]["ENTITIES"]:
if entity in ['products', 'services', 'training', 'interoperability']:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
actions.append({"add": {"index": indexname, "alias": "allresources"}})
if len(actions) > 0:
client.indices.update_aliases(
body={"actions": actions}
)
parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)
chain(
create_indexes.override(task_id="create_indexes")(),
merge_curation_db.override(task_id="merge_curation_db")(),
parallel_batches,
bulk_load.expand_kwargs(parallel_batches.output),
delete_missing_curated.override(task_id="delete_missing_curated_recs")(),
close_indexes.override(task_id="close_indexes")()
)
import_EOSC_entities()

View File

@ -0,0 +1,67 @@
import requests
def init_ams(endpoint: str, project: str, token: str, reset: bool):
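# Create (and, when reset=True, first delete) the messaging-service (AMS) topics and subscriptions used by the curation and graph pipelines.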
session = requests.session()
def delete_topic(topic):
print(f"Deleting projects/{project}/topics/{topic}", flush=True)
reply = session.delete(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/topics/{topic}"
)
if not (200 <= reply.status_code < 500 or reply.status_code == 504):
reply.raise_for_status()
def delete_subscription(subscription):
print(f"Deleting projects/{project}/subscriptions/{subscription}", flush=True)
reply = session.delete(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/subscriptions/{subscription}"
)
if not (200 <= reply.status_code < 500 or reply.status_code == 504):
reply.raise_for_status()
def create_topic(topic):
print(f"Creating projects/{project}/topics/{topic}", flush=True)
reply = session.put(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/topics/{topic}",
json={
"maxMessages": "1",
"returnImmediately": "false"
}
)
if not (200 <= reply.status_code < 300 or reply.status_code == 409 or reply.status_code == 504):
reply.raise_for_status()
def create_subscription(topic, subscription):
print(f"Creating projects/{project}/subscriptions/{subscription}", flush=True)
reply = session.put(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/subscriptions/{subscription}",
json={
"topic": f"projects/{project}/topics/{topic}",
"ackDeadlineSeconds": 600
}
)
if not (200 <= reply.status_code < 300 or reply.status_code == 409):
reply.raise_for_status()
subscriptions = {
'curation_requests': ['curation_requests_debug', 'curation_requests_dispatcher'],
'curation_replies': ['curation_replies_rest_debug', 'curation_replies_rest'],
'curation_spam_candidates': ['curation_spam_candidates_debug', 'curation_spam_candidates_dispatcher'],
'graph_requests': ['graph_requests_debug', 'graph_requests_indexer']
}
for topic in ['curation_requests', 'curation_replies', 'curation_spam_candidates', 'graph_requests']:
if reset:
for sub in subscriptions[topic]:
delete_subscription(sub)
delete_topic(topic)
create_topic(topic)
for sub in subscriptions[topic]:
create_subscription(topic, sub)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,141 @@
import os
import time
from datetime import timedelta
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
def delete_pending_multipart_uploads(s3_client, bucket, key):
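# Abort any incomplete multipart uploads for this key so a new upload can start from a clean state.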
multipart_uploads = s3_client.list_multipart_uploads(Bucket=bucket)
if 'Uploads' in multipart_uploads:
for upload in multipart_uploads['Uploads']:
if upload['Key'] == key:
upload_id = upload['UploadId']
s3_client.abort_multipart_upload(
Bucket=bucket,
Key=key,
UploadId=upload_id
)
print(f"Aborted multipart upload {upload_id} for key {key}")
else:
print("No pending multipart uploads found")
def download_uri(session: requests.Session, url: str, s3_client, bucket, key, max_retries: int = 10):
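# Stream the remote file into S3 as a multipart upload; on errors the download resumes from the last uploaded byte via an HTTP Range request.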
parts = []
total_size = 0
current_size = 0
part_number = 1
chunk_size = 0
response = s3_client.create_multipart_upload(Bucket=bucket,
Key=key)
upload_id = response['UploadId']
tries = 0
while tries < max_retries:
try:
with session.get(url,
headers={'Range': 'bytes=%d-' % current_size},
stream=True) as r:
if total_size == 0:
total_size = int(r.headers['Content-length'])
chunk_size = max(total_size // (10000 - 1), 15 * 1024 * 1024)
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
response = s3_client.upload_part(
Body=chunk,
Bucket=bucket,
Key=key,
PartNumber=part_number,
UploadId=upload_id
)
parts.append({'PartNumber': part_number, 'ETag': response['ETag']})
current_size += len(chunk)
print(f"Read {current_size} of {total_size} part no {part_number}")
part_number += 1
tries = 0
break  # exit the retry loop once the whole stream has been read
except Exception as e:
tries += 1
if tries < max_retries:
print(e)
print("Resume in 60 seconds...")
time.sleep(60)
continue
else:
print(f"ABORT: failed after {max_retries} tentatives")
s3_client.abort_multipart_upload(
Bucket=bucket,
Key=key,
UploadId=upload_id
)
raise
s3_client.complete_multipart_upload(
Bucket=bucket,
Key=key,
UploadId=upload_id,
MultipartUpload={'Parts': parts}
)
@dag(
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"file": "File to download",
"dst_bucket": "bucket that will contain file",
"max_retries": 10
},
tags=["s3"],
)
def openaire_to_s3():
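# Fetch a graph dump file from the OpenAIRE download endpoint (HTTP basic auth) and store it in the destination bucket, cleaning up any previous object or pending multipart upload first.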
@task
def download(**context):
http_conn = BaseHook.get_connection("openaire_default")
max_retries = context["params"]["max_retries"]
url = "https://" + http_conn.host + "/data/graph/" + context["params"]["file"]
bucket_name = context["params"]["dst_bucket"]
s3_key = "/data/graph/" + context["params"]["file"]
session = requests.Session()
session.headers['Connection'] = 'close'
session.auth = (http_conn.login, http_conn.password)
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
# Cleanup file and pending uploads
delete_pending_multipart_uploads(s3_client=hook.get_conn(), bucket=bucket_name, key=s3_key)
hook.delete_objects(bucket=bucket_name,
keys=[s3_key])
download_uri(session=session,
url=url,
s3_client=hook.get_conn(),
bucket=bucket_name,
key=s3_key,
max_retries=max_retries)
download()
openaire_to_s3()

View File

@ -0,0 +1,80 @@
import json
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import get_current_context
from dag_utils import get_opensearch_client
# Define default arguments
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5),
}
managed_indexes = {'catalog_datasources', 'catalog_interoperability-records', 'catalog_providers',
'catalog_resource-interoperability-records', 'catalog_services', 'catalog_training-resources',
'datasource', 'grants', 'interoperability',
'organizations', 'persons', 'products',
'services', 'topics', 'training', 'venues'
}
@dag(
dag_id="remove_old_indexes",
# dag_display_name="Remove outdated MKG indexes",
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
},
tags=["opensearch", "maintenance"],
)
def remove_old_indexes():
@task
def remove_indexes():
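# For every managed base name, list its timestamp-suffixed indexes and select all but the most recent one;
# indexes still referenced by an alias are never candidates for removal.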
context = get_current_context()
client = get_opensearch_client(context)
indexes = client.cat.indices(format="json")
aliases = client.cat.aliases(format="json")
print(json.dumps(aliases))
print(json.dumps(indexes))
# indexes currently referenced by aliases
alias_index_names = {alias['index'] for alias in aliases}
# indexes ordered by timestamp
index_dict = {}
for index in indexes:
index_name = index['index']
if '_' in index_name:
base_name = '_'.join(index_name.split('_')[:-1])
timestamp = index_name.split('_')[-1]
if not (base_name in managed_indexes and timestamp.isdigit()):
continue
if base_name not in index_dict:
index_dict[base_name] = []
index_dict[base_name].append((index_name, timestamp))
for base_name, index_list in index_dict.items():
index_list.sort(key=lambda x: x[1], reverse=True)
most_recent_index = index_list[0][0]
for index_name, timestamp in index_list:
if index_name != most_recent_index and index_name not in alias_index_names:
# client.indices.delete(index_name)  # actual deletion is left disabled (dry run)
print(f'Would delete index: {index_name}')
remove_indexes()
remove_old_indexes()

111
airflow/dags/test_dag.py Normal file
View File

@ -0,0 +1,111 @@
from __future__ import annotations
import os
from datetime import timedelta
import pendulum
from airflow.decorators import dag, task_group
from airflow.decorators import task
from airflow.exceptions import AirflowSkipException
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import get_current_context
from airflow.utils.helpers import chain
from kubernetes.client import models as k8s
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
dag_id="test_s3_openaire_dump",
# dag_display_name="(Test) Import OpenAIRE entities from S3",
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"S3_CONN_ID": "s3_conn",
"OPENSEARCH_CONN_ID": "opensearch_default",
"KEY_PREFIX": "/",
"S3_BUCKET": "kg-1",
"BATCH_LOADERS_NUM": 10,
"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues"],
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
tags=["openaire", "lot1", "mkg"]
)
def import_s3_openaire_dump():
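# Skeleton DAG used to exercise the task structure (mapped batches, task groups, trigger rules) without touching S3 or OpenSearch.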
@task
def create_indexes():
kwargs = get_current_context()
print(kwargs["params"]["ENTITIES"])
@task_group
def load_and_map_entity(entity: str):
@task(trigger_rule="none_failed")
def compute_batches():
nonlocal entity
kwargs = get_current_context()
if entity not in kwargs["params"]["ENTITIES"]:
raise AirflowSkipException(f"Skipping {entity}")
return [[(entity, '1'), (entity, '2')], [], []]
@task(executor_config={
"pod_override": k8s.V1Pod(
spec=k8s.V1PodSpec(
containers=[
k8s.V1Container(
name="base",
resources=k8s.V1ResourceRequirements(
requests={
"cpu": "550m",
"memory": "256Mi"
}
)
)
]
)
)
})
def parallel_load(files: list[tuple[str, str]], **kwargs):
kwargs = get_current_context()
print(files)
parallel_load.expand(files=compute_batches())
@task(trigger_rule="none_failed")
def merge_curation_db(**kwargs):
pass
@task(trigger_rule="none_failed")
def delete_missing_curated(**kwargs):
pass
@task(trigger_rule="none_failed")
def close_indexes(**kwargs):
pass
chain(
create_indexes(),
# todo get checkpoint
merge_curation_db(),
load_and_map_entity("datasource"),
load_and_map_entity("grants"),
load_and_map_entity("organizations"),
load_and_map_entity("persons"),
load_and_map_entity("products"),
load_and_map_entity("topics"),
load_and_map_entity("venues"),
delete_missing_curated(),
close_indexes()
# todo ask resync
)
import_s3_openaire_dump()

View File

@ -1,27 +0,0 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: openaire-data-platform
nodes:
- role: control-plane
image: kindest/node:v1.28.6@sha256:e9e59d321795595d0eed0de48ef9fbda50388dc8bd4a9b23fb9bd869f370ec7e
kubeadmConfigPatches:
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "ingress-ready=true"
authorization-mode: "AlwaysAllow"
extraPortMappings:
- containerPort: 80
hostPort: 80
protocol: TCP
- containerPort: 443
hostPort: 443
protocol: TCP
containerdConfigPatches:
- |-
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/certs.d"

View File

@ -1,32 +0,0 @@
#!/bin/sh
set -o errexit
# Script Origin: https://kind.sigs.k8s.io/docs/user/local-registry/
# create registry container unless it already exists
reg_name='kind-registry'
reg_port='5001'
if [ "$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" != 'true' ]; then
docker run \
-d --restart=always -p "127.0.0.1:${reg_port}:5000" --name "${reg_name}" \
registry:2
fi
# connect the registry to the cluster network if not already connected
if [ "$(docker inspect -f='{{json .NetworkSettings.Networks.kind}}' "${reg_name}")" = 'null' ]; then
docker network connect "kind" "${reg_name}"
fi
# Document the local registry
# https://github.com/kubernetes/enhancements/tree/master/keps/sig-cluster-lifecycle/generic/1755-communicating-a-local-registry
cat <<EOF | kubectl apply --context $1 -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: local-registry-hosting
namespace: kube-public
data:
localRegistryHosting.v1: |
host: "localhost:${reg_port}"
help: "https://kind.sigs.k8s.io/docs/user/local-registry/"
EOF

View File

@ -1,671 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
name: ingress-nginx
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx
namespace: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- apiGroups:
- ""
resources:
- configmaps
- pods
- secrets
- endpoints
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- networking.k8s.io
resources:
- ingressclasses
verbs:
- get
- list
- watch
- apiGroups:
- coordination.k8s.io
resourceNames:
- ingress-nginx-leader
resources:
- leases
verbs:
- get
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission
namespace: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- create
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- configmaps
- endpoints
- nodes
- pods
- secrets
- namespaces
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- networking.k8s.io
resources:
- ingressclasses
verbs:
- get
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission
rules:
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx
namespace: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ingress-nginx
subjects:
- kind: ServiceAccount
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission
namespace: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ingress-nginx
subjects:
- kind: ServiceAccount
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: v1
data:
allow-snippet-annotations: "false"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-controller
namespace: ingress-nginx
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-controller
namespace: ingress-nginx
spec:
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- appProtocol: http
name: http
port: 80
protocol: TCP
targetPort: http
- appProtocol: https
name: https
port: 443
protocol: TCP
targetPort: https
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
type: NodePort
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-controller-admission
namespace: ingress-nginx
spec:
ports:
- appProtocol: https
name: https-webhook
port: 443
targetPort: webhook
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-controller
namespace: ingress-nginx
spec:
minReadySeconds: 0
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
strategy:
rollingUpdate:
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
spec:
containers:
- args:
- /nginx-ingress-controller
- --election-id=ingress-nginx-leader
- --controller-class=k8s.io/ingress-nginx
- --ingress-class=nginx
- --configmap=$(POD_NAMESPACE)/ingress-nginx-controller
- --validating-webhook=:8443
- --validating-webhook-certificate=/usr/local/certificates/cert
- --validating-webhook-key=/usr/local/certificates/key
- --watch-ingress-without-class=true
- --publish-status-address=localhost
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: LD_PRELOAD
value: /usr/local/lib/libmimalloc.so
image: registry.k8s.io/ingress-nginx/controller:v1.9.6@sha256:1405cc613bd95b2c6edd8b2a152510ae91c7e62aea4698500d23b2145960ab9c
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
exec:
command:
- /wait-shutdown
livenessProbe:
failureThreshold: 5
httpGet:
path: /healthz
port: 10254
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: controller
ports:
- containerPort: 80
hostPort: 80
name: http
protocol: TCP
- containerPort: 443
hostPort: 443
name: https
protocol: TCP
- containerPort: 8443
name: webhook
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10254
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
requests:
cpu: 100m
memory: 90Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- ALL
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 101
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /usr/local/certificates/
name: webhook-cert
readOnly: true
dnsPolicy: ClusterFirst
nodeSelector:
ingress-ready: "true"
kubernetes.io/os: linux
serviceAccountName: ingress-nginx
terminationGracePeriodSeconds: 0
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
volumes:
- name: webhook-cert
secret:
secretName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission-create
namespace: ingress-nginx
spec:
template:
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission-create
spec:
containers:
- args:
- create
- --host=ingress-nginx-controller-admission,ingress-nginx-controller-admission.$(POD_NAMESPACE).svc
- --namespace=$(POD_NAMESPACE)
- --secret-name=ingress-nginx-admission
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20231226-1a7112e06@sha256:25d6a5f11211cc5c3f9f2bf552b585374af287b4debf693cacbe2da47daa5084
imagePullPolicy: IfNotPresent
name: create
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 65532
seccompProfile:
type: RuntimeDefault
nodeSelector:
kubernetes.io/os: linux
restartPolicy: OnFailure
serviceAccountName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission-patch
namespace: ingress-nginx
spec:
template:
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission-patch
spec:
containers:
- args:
- patch
- --webhook-name=ingress-nginx-admission
- --namespace=$(POD_NAMESPACE)
- --patch-mutating=false
- --secret-name=ingress-nginx-admission
- --patch-failure-policy=Fail
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20231226-1a7112e06@sha256:25d6a5f11211cc5c3f9f2bf552b585374af287b4debf693cacbe2da47daa5084
imagePullPolicy: IfNotPresent
name: patch
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 65532
seccompProfile:
type: RuntimeDefault
nodeSelector:
kubernetes.io/os: linux
restartPolicy: OnFailure
serviceAccountName: ingress-nginx-admission
---
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: nginx
spec:
controller: k8s.io/ingress-nginx
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.9.6
name: ingress-nginx-admission
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: ingress-nginx-controller-admission
namespace: ingress-nginx
path: /networking/v1/ingresses
failurePolicy: Fail
matchPolicy: Equivalent
name: validate.nginx.ingress.kubernetes.io
rules:
- apiGroups:
- networking.k8s.io
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- ingresses
sideEffects: None

View File

@ -0,0 +1,12 @@
#!/usr/bin/env -S docker build . --tag=gbloisi/curation:1.0.0 --platform linux/amd64 --push --network=host --file
FROM python:3.12-slim-bullseye
COPY requirements.txt /
RUN python -m pip install --upgrade -r /requirements.txt
COPY antispam-batch.py blacklist.txt curation-rest.py /
# Run the server
CMD python3 /curation-rest.py

View File

@ -0,0 +1,255 @@
import json
import sys
import traceback
from typing import Any, Dict, List, Optional
from jsonargparse import ArgumentParser
from openai import AsyncOpenAI
import asyncio
import enum
import instructor
from pydantic import BaseModel, Field, SecretStr
from datetime import datetime
from opensearchpy import OpenSearch, helpers, AsyncOpenSearch
class Topics(str, enum.Enum):
"""Correctly assign one of the predefined topic to the content"""
SPAM = "SPAM, advertisement, promotional"
SALES = "direct sales of goods or services"
EXPLICIT_CONTENT = "porn, violence or Harmful content"
RESEARCH = "description of a scientific research"
DATASET = "description of a scientific dataset "
OBJECT = "scientific description of an object"
BIBLIOGRAPHIC = "bibliographic record"
NA = "not available"
class ProductInfo(BaseModel):
"""
Your task is to identify SPAM content among research product descriptions.
"""
language: str = Field(description="The language of the content")
topic: Topics
reason: str = Field(description="explain why the topic was chosen")
spam_words: list[str] = Field(description="content's spam words", min_length=0, max_length=3)
main_model_schema = ProductInfo.model_json_schema()
response_schema = json.dumps(main_model_schema, indent=None)
parser = ArgumentParser(env_prefix="CURATION", default_env=True)
parser.add_argument("--opensearch.host", default='opensearch-cluster.local-dataplatform')
parser.add_argument("--opensearch.port", default=443, type=int)
parser.add_argument("--opensearch.user", default="admin", type=SecretStr)
parser.add_argument("--opensearch.password", default="admin", type=SecretStr)
parser.add_argument("--openai.host", default='localhost')
parser.add_argument("--openai.port", default=8000, type=int)
parser.add_argument("--openai.api_key", default='api_key')
parser.add_argument("--parallelism", default=36, type=int)
cfg = parser.parse_args()
with open("/blacklist.txt", "r") as text_file:
blacklist = [line.rstrip().lower() for line in text_file.readlines()]
client = AsyncOpenSearch(
hosts=[{'host': cfg.get("opensearch.host"), 'port': cfg.get("opensearch.port")}],
http_auth=(cfg.get("opensearch.user").get_secret_value(), cfg.get("opensearch.password").get_secret_value()),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20
)
oai = instructor.patch(AsyncOpenAI(base_url="http://" + cfg.get("openai.host") + ":" + str(cfg.get("openai.port")) + "/v1",
api_key=cfg.get("openai.api_key"),
timeout=2400.0*6.0),
mode=instructor.Mode.JSON_SCHEMA)
def source_txt_value(data: Dict[str, Any], labels: List[str]) -> Optional[Any]:
if len(labels) <= 0:
return None
current_value = data['_source']
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
if current_value is None:
return None
if isinstance(current_value, list):
if len(current_value) > 0:
return current_value[0]
else:
return None
return str(current_value)
async def eval_spam_candidate(hit: dict) -> dict:
response = await oai.chat.completions.create(
model="suzume-multilingual",
response_model=ProductInfo,
messages=[
{
"role": "user",
"content": hit['title']
}
],
extra_body={
"cache_prompt": True,
"json_schema": response_schema
},
temperature=0.0,
max_retries=5,
stream=False
)
return response.model_dump()
async def evaluate_hit(hit: dict):
obj = await eval_spam_candidate(hit)
if obj['topic'] in [Topics.SPAM, Topics.EXPLICIT_CONTENT, Topics.SALES]:
print("SPAM detected: " + hit['local_identifier'], flush=True)
print("AI Reponse:" + str(obj) + " for: " + hit['title'], flush=True)
obj['local_identifier'] = hit['local_identifier']
obj['trigger_word'] = hit['found']
obj['abstract'] = hit['title']
obj['timestamp'] = datetime.now().isoformat()
await client.index(
index='spam',
body=obj,
id=hit['local_identifier'],
refresh=True
)
return obj
async def get_potential_spam() -> Any:
count = 0
resume_from = 0
async for hit in helpers.async_scan(client, index="products", query={"query": {"match_all": {}}}, scroll='1d'):
count = count + 1
if count < resume_from:
continue
local_identifier = source_txt_value(hit, ["local_identifier"])
print(f"{count}:\t{local_identifier}")
title = source_txt_value(hit, ["titles", "none"])
description = source_txt_value(hit, ['abstracts', 'none'])
if title is None:
if description is None:
print("No description! {local_identifier}", flush=True)
continue
title = ""
if description is not None:
title = title + " " + description
utf8_title = title.encode('utf-8')
if len(utf8_title) > 2048:
title = utf8_title[0:2048].decode('utf-8', 'ignore')
test_string = title.lower()
split_string = test_string.split()
found = None
for badword in blacklist:
if badword in test_string:
if len(badword) == 1 or ' ' in badword or badword in split_string:
found = badword
break
if found is None:
continue
if await client.exists(index="spam", id=local_identifier):
print("cached")
continue
yield {"local_identifier": local_identifier, "title": title, "found": found}
async def worker(name, queue):
try:
while True:
# Get a "work item" out of the queue.
hit = await queue.get()
# Evaluate the hit and index it in the 'spam' index when classified as spam.
await evaluate_hit(hit)
# Notify the queue that the "work item" has been processed.
queue.task_done()
except Exception as e:
print(traceback.format_exc())
sys.exit(-1)
async def main():
#if await client.indices.exists("spam"):
# await client.indices.delete("spam")
if not await client.indices.exists("spam"):
await client.indices.create("spam", {
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 0,
"replication.type": "SEGMENT"
}
},
"mappings": {
"properties": {
"local_identifier": {
"type": "keyword"
},
"language": {
"type": "keyword"
},
"topic": {
"type": "keyword"
},
"abstract": {
"type": "text",
"index": False,
},
"reason": {
"type": "text",
"index": False,
},
"spam_words": {
"type": "keyword"
},
"trigger_word": {
"type": "keyword"
},
"timestamp": {
"type": "date",
"format": "date_hour_minute_second_fraction"
}
}
}
})
parallelism = cfg.get("parallelism")
queue = asyncio.Queue(parallelism)
tasks = []
for i in range(parallelism):
task = asyncio.create_task(worker(f'worker-{i}', queue))
tasks.append(task)
async for hit in get_potential_spam():
await queue.put(hit)
await queue.join()
# Cancel our worker tasks.
for task in tasks:
task.cancel()
# Wait until all worker tasks are cancelled.
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())
loop.close()
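A minimal review sketch, not part of the repository: once the batch above has populated the spam index, the flagged records can be listed with a plain synchronous client. The host and credentials below are placeholders for the same CURATION_OPENSEARCH_* settings used by the script.
import json
from opensearchpy import OpenSearch, helpers
client = OpenSearch(
    hosts=[{"host": "opensearch-cluster.local-dataplatform", "port": 443}],
    http_auth=("admin", "admin"),  # placeholder credentials
    use_ssl=True,
    verify_certs=False,
    ssl_show_warn=False,
)
# Scan every record the batch wrote and print the fields useful for manual review.
for hit in helpers.scan(client, index="spam", query={"query": {"match_all": {}}}):
    src = hit["_source"]
    print(json.dumps({
        "local_identifier": src.get("local_identifier"),
        "topic": src.get("topic"),
        "trigger_word": src.get("trigger_word"),
        "reason": src.get("reason"),
    }))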

File diff suppressed because it is too large

View File

@ -0,0 +1,298 @@
from datetime import datetime
from enum import Enum
from flask_openapi3 import Info, Tag
from flask_openapi3 import OpenAPI
from jsonargparse import ArgumentParser
from opensearchpy import OpenSearch, NotFoundError, helpers
from pydantic import BaseModel, SecretStr
import logging
parser = ArgumentParser(env_prefix="CURATION", default_env=True)
parser.add_argument("--opensearch.host", default='opensearch-cluster.local-dataplatform')
parser.add_argument("--opensearch.port", default=443, type=int)
parser.add_argument("--opensearch.user", default="admin", type=SecretStr)
parser.add_argument("--opensearch.password", default="admin", type=SecretStr)
parser.add_argument("--debug", default=False, type=bool)
cfg = parser.parse_args()
print(cfg.as_dict())
client = OpenSearch(
hosts=[{'host': cfg.get("opensearch.host"), 'port': cfg.get("opensearch.port")}],
http_auth=(cfg.get("opensearch.user").get_secret_value(), cfg.get("opensearch.password").get_secret_value()),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
)
# if client.indices.exists("curation"):
# client.indices.delete("curation")
if not client.indices.exists("curation"):
client.indices.create("curation", {
"settings": {
"index": {
"number_of_shards": 10,
"number_of_replicas": 0,
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
},
},
"mappings": {
"dynamic": "strict",
"properties": {
"local_identifier": {
"type": "keyword"
},
"timestamp": {
"type": "date",
"format": "date_hour_minute_second_fraction"
},
"creator": {
"type": "keyword"
},
"status": {
"type": "keyword"
},
"note": {
"index": False,
"type": "text"
},
"log": {
"type": "object",
"properties": {
"timestamp": {
"format": "date_hour_minute_second_fraction",
"type": "date"
},
"creator": {
"type": "keyword"
},
"status": {
"index": False,
"type": "keyword"
},
"note": {
"index": False,
"type": "text"
},
}
}
}
}
})
info = Info(title="Curator API", version="1.0.0")
app = OpenAPI(__name__, info=info)
curation_tag = Tag(name="curation", description="Curator API")
class CurationStatus(str, Enum):
valid = "valid"
withdrawn = "withdrawn"
alert = "alert"
restore = "restore"
reset = "reset"
class CurationRequest(BaseModel):
local_identifier: str
creator: str
status: CurationStatus
note: str
class LogEntry(BaseModel):
timestamp: str
creator: str
status: CurationStatus
note: str
class CurationResponse(BaseModel):
local_identifier: str
timestamp: str
creator: str
status: CurationStatus
note: str
log: list[LogEntry]
@app.route('/health')
def health_check():
if all_required_services_are_running():
return 'OK', 200
else:
return 'Service Unavailable', 500
def all_required_services_are_running():
os_health = client.cluster.health()
return os_health['status'] in ['green', 'yellow'] and os_health['number_of_nodes'] > 0
@app.post("/curation", summary="set curation",
responses={200: CurationResponse},
tags=[curation_tag])
def post_curation(query: CurationRequest):
"""
set curation status
"""
curation = dict()
try:
hit = client.get(index="curation", id=query.local_identifier)
curation = hit['_source']
if query.status.name == curation['status']:
return {"msg": "status is not changed"}, 403
# move current status in history
annotations = curation['log'] if 'log' in curation else list()
if isinstance(annotations, dict):
annotations = [annotations]
annotations.insert(0, {
"timestamp": curation['timestamp'],
"creator": curation['creator'],
"status": curation['status'],
"note": curation['note'],
})
annotations = annotations[0:100]
curation['log'] = annotations
curation['timestamp'] = datetime.now().isoformat()
curation['creator'] = query.creator
curation['note'] = query.note
print(curation)
# todo check status transition
match query.status.name:
case "valid":
if curation['status'] not in ('restore', 'reset'):
return {"msg": "status cannot be updated to 'valid'"}, 403
curation['status'] = query.status.name
case "withdrawn":
curation['status'] = query.status.name
case "alert":
curation['status'] = query.status.name
case "restore":
if curation['status'] != "withdrawn":
return {"msg": "only withdrawn records can be restored'"}, 403
curation['status'] = query.status.name
case "reset":
curation['status'] = query.status.name
#TODO transactionality in case of failure?
client.index(
index='curation',
id=query.local_identifier,
body=curation,
refresh=True,
if_primary_term=hit['_primary_term'],
if_seq_no=hit['_seq_no']
)
metadata_status = curation['status']
if metadata_status == 'reset':
client.update(
index='products',
id=query.local_identifier,
body={
"script": {"source": "ctx._source.remove(\"status\")"}
},
refresh=True
)
else:
if metadata_status == "restore":
metadata_status = 'valid'
client.update(
index='products',
id=query.local_identifier,
body={
"doc": {"status": metadata_status}
},
refresh=True
)
except NotFoundError:
curation['local_identifier'] = query.local_identifier
curation['timestamp'] = datetime.now().isoformat()
curation['status'] = query.status.name
curation['creator'] = query.creator
curation['note'] = query.note
match query.status.name:
case "restore":
return {"msg": "cannot restore: status does not exist'"}, 403
case "reset":
return {"msg": "cannot reset: status does not exist'"}, 403
client.index(
index='curation',
id=query.local_identifier,
body=curation,
refresh=True,
op_type='create'
)
client.update(
index='products',
id=query.local_identifier,
body={
"doc": {"status": curation['status']}
},
refresh=True
)
return curation
@app.get("/curation", summary="get curation", tags=[curation_tag])
def get_curation(local_identifier: str):
"""
to get a curation record
"""
try:
hit = client.get(index="curation", id=local_identifier)
return {
"code": 0,
"message": "ok",
"data": hit['_source']
}
except NotFoundError:
return {"msg": f"Cannot fetch: '{local_identifier}' does not exist'"}, 403
@app.get("/alerts", summary="get curation in alert status", tags=[curation_tag])
def get_alerts():
"""
to get the curation records currently in alert status
"""
query = {
"query": {
"terms": {
"status": [CurationStatus.alert]
}
}
}
return {
"code": 0,
"message": "ok",
"data": list(helpers.scan(client, index="curation", query=query))
}
if __name__ == "__main__":
debug = cfg.get("debug")
if debug:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
app.run(debug=True)
else:
from waitress import serve
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
serve(app, host="0.0.0.0", port=5000)
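A minimal client-side sketch, not part of the repository, for exercising the service above. It assumes the API is reachable on port 5000 (the waitress port) and uses the requests package, which is not listed in requirements.txt. Since flask-openapi3 binds a handler argument named query to the query string, the CurationRequest fields are sent as URL parameters rather than as a JSON body.
import requests
BASE_URL = "http://localhost:5000"  # assumption: the waitress port configured above
# Mark a (hypothetical) record as withdrawn.
resp = requests.post(f"{BASE_URL}/curation", params={
    "local_identifier": "example::record-1",
    "creator": "curator@example.org",
    "status": "withdrawn",
    "note": "flagged by the antispam batch",
})
print(resp.status_code, resp.json())
# List all records currently in 'alert' status.
print(requests.get(f"{BASE_URL}/alerts").json())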

View File

@ -0,0 +1,45 @@
{
"type": "object",
"required": [
"language",
"topic",
"reason"
],
"properties": {
"language": {
"type": "string"
},
"topic": {
"enum": [
"Other",
"Natural and life Sciences",
"Engineering And Technology",
"Computer Science",
"Medical And Health Sciences",
"Agricultural And Veterinary Sciences",
"Social Sciences",
"Humanities And The Arts",
"Archaeology",
"Bibliographic record",
"Porn, Violence or Harmful content",
"Direct sales of goods or services",
"SPAM, advertisement, promotional"
],
"type": "string"
},
"general_subject": {
"type": "string"
},
"reason": {
"description": "reason of the classification",
"type": "string"
},
"spam_words": {
"items": {
"type": "string"
},
"type": "array",
"maxItems": 3
}
}
}
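A minimal validation sketch, not part of the repository: a model response can be checked against the schema above with the jsonschema package (an extra dependency, not listed in requirements.txt); the file name and the candidate document are hypothetical.
import json
from jsonschema import ValidationError, validate
with open("response_schema.json") as fh:  # hypothetical path for the schema above
    schema = json.load(fh)
candidate = {
    "language": "English",
    "topic": "SPAM, advertisement, promotional",
    "reason": "The text advertises discounted products unrelated to research.",
    "spam_words": ["discount", "buy now"],
}
try:
    validate(instance=candidate, schema=schema)
    print("response conforms to the schema")
except ValidationError as err:
    print("invalid response:", err.message)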

View File

@ -0,0 +1,11 @@
langchain
langchain-community
langchain-core
instructor
pydantic
openai
opensearch-py
jsonargparse
flask
flask-openapi3
flask-waitress

View File

@ -1,7 +0,0 @@
#!/usr/bin/env -S docker build . --tag=gbloisi/airflow:2.8.3rc1-python3.11 --platform linux/arm64/v8,linux/amd64 --push --network=host --file
FROM apache/airflow:2.8.3rc1-python3.11
COPY requirements.txt /
RUN pip install --no-cache-dir "apache-airflow==${AIRFLOW_VERSION}" -r /requirements.txt

View File

@ -1,14 +0,0 @@
apache-airflow-providers-amazon
apache-airflow-providers-apache-spark
apache-airflow-providers-cncf-kubernetes
apache-airflow-providers-opensearch
apache-airflow-providers-postgres
apache-airflow-providers-sftp
apache-airflow[google]
msgspec
opensearch-py
opensearch-py-ml
smart_open[all]

View File

@ -1,2 +0,0 @@
docker tag openaire/airflow:2.8.2 localhost:5001/airflow:2.8.1
docker push localhost:5001/airflow:2.8.1

View File

@ -1,69 +0,0 @@
#
#
#
useStandardNaming: true
createUserJob:
useHelmHooks: false
applyCustomEnv: false
migrateDatabaseJob:
useHelmHooks: false
applyCustomEnv: false
# Airflow executor
executor: "KubernetesExecutor"
# Secrets for all airflow containers
secret:
# - envName: ""
# secretName: ""
# secretKey: ""
#- envName: "AIRFLOW_CONN_S3"
# secretName: "minio"
# secretKey: "s3connection"
- envName: "AIRFLOW_CONN_S3_CONN"
secretName: "s3-conn-secrets"
secretKey: "AIRFLOW_CONN_S3_CONN"
dags:
persistence:
enabled: true
gitSync:
enabled: true
repo: "https://code-repo.d4science.org/giambattista.bloisi/lot1-kickoff.git"
branch: "airflow"
subPath: "airflow/dags"
config:
webserver:
expose_config: 'True' # by default this is 'False'
#base_url: "http://localhost/"
logging:
remote_logging: "True"
logging_level: "INFO"
remote_base_log_folder: "s3://lot1-airflow/logs"
remote_log_conn_id: "s3_conn"
encrypt_s3_logs: "False"
ingress:
enabled: true
## WARNING: set as "networking.k8s.io/v1beta1" for Kubernetes 1.18 and earlier
apiVersion: networking.k8s.io/v1
## airflow webserver ingress configs
web:
annotations: {}
host: "localhost"
path: "/"
## WARNING: requires Kubernetes 1.18 or later, use "kubernetes.io/ingress.class" annotation for older versions
ingressClassName: "nginx"
## flower ingress configs
flower:
annotations: {}
host: "localhost"
path: "/flower"
## WARNING: requires Kubernetes 1.18 or later, use "kubernetes.io/ingress.class" annotation for older versions
ingressClassName: "nginx"

View File

@ -1,161 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: local-path-storage
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: local-path-provisioner-service-account
namespace: local-path-storage
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: local-path-provisioner-role
namespace: local-path-storage
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: local-path-provisioner-role
rules:
- apiGroups: [""]
resources: ["nodes", "persistentvolumeclaims", "configmaps", "pods", "pods/log"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "patch"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: local-path-provisioner-bind
namespace: local-path-storage
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: local-path-provisioner-role
subjects:
- kind: ServiceAccount
name: local-path-provisioner-service-account
namespace: local-path-storage
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: local-path-provisioner-bind
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: local-path-provisioner-role
subjects:
- kind: ServiceAccount
name: local-path-provisioner-service-account
namespace: local-path-storage
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: local-path-provisioner
namespace: local-path-storage
spec:
replicas: 1
selector:
matchLabels:
app: local-path-provisioner
template:
metadata:
labels:
app: local-path-provisioner
spec:
serviceAccountName: local-path-provisioner-service-account
containers:
- name: local-path-provisioner
image: rancher/local-path-provisioner:v0.0.26
imagePullPolicy: IfNotPresent
command:
- local-path-provisioner
- --debug
- start
- --config
- /etc/config/config.json
volumeMounts:
- name: config-volume
mountPath: /etc/config/
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumes:
- name: config-volume
configMap:
name: local-path-config
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: local-path
annotations:
storageclass.kubernetes.io/is-default-class: "true"
provisioner: rancher.io/local-path
volumeBindingMode: WaitForFirstConsumer
reclaimPolicy: Delete
---
kind: ConfigMap
apiVersion: v1
metadata:
name: local-path-config
namespace: local-path-storage
data:
config.json: |-
{
"nodePathMap":[
{
"node":"DEFAULT_PATH_FOR_NON_LISTED_NODES",
"paths":["/opt/local-path-provisioner"]
}
]
}
setup: |-
#!/bin/sh
set -eu
mkdir -m 0777 -p "$VOL_DIR"
teardown: |-
#!/bin/sh
set -eu
rm -rf "$VOL_DIR"
helperPod.yaml: |-
apiVersion: v1
kind: Pod
metadata:
name: helper-pod
spec:
priorityClassName: system-node-critical
tolerations:
- key: node.kubernetes.io/disk-pressure
operator: Exists
effect: NoSchedule
containers:
- name: helper-pod
image: busybox
imagePullPolicy: IfNotPresent

View File

@ -1,22 +0,0 @@
apiVersion: helm.cattle.io/v1
kind: HelmChartConfig
metadata:
name: rke2-ingress-nginx
namespace: kube-system
spec:
valuesContent: |-
controller:
ingressClassResource:
controllerValue: "k8s.io/carriershipper-ingress-nginx"
config:
use-forwarded-headers: "true"
enable-real-ip: "true"
proxy-buffer-size: "256k"
proxy-buffer-number: "4"
large-client-header-buffers: "4 16k"
metrics:
enabled: true
serviceMonitor:
enabled: true
additionalLabels:
cluster: nonproduction

View File

@ -1,63 +0,0 @@
opensearchCluster:
enabled: true
general:
serviceName: opensearch-cluster
version: 2.12.0
security:
config:
adminCredentialsSecret:
name: admin-credentials-secret # The secret with the admin credentials for the operator to use
securityConfigSecret:
name: securityconfig-secret # The secret containing your customized securityconfig
adminSecret:
name: opensearch-admin-certs
tls:
transport:
generate: false
perNode: false
secret:
name: opensearch-certs
nodesDn: ["CN=Opensearch_Node", ]
adminDn: ["CN=OpenSearch_Admin", ]
http:
generate: false
secret:
name: opensearch-certs
dashboards:
opensearchCredentialsSecret:
name: admin-credentials-secret # This is the name of your secret that contains the credentials for Dashboards to use
enable: true
version: 2.12.0
replicas: 1
resources:
requests:
memory: "512Mi"
cpu: "200m"
limits:
memory: "512Mi"
cpu: "200m"
tls:
enable: true
generate: false
secret:
name: opensearch-dashboards-certs
nodePools:
- component: nodes
replicas: 3
diskSize: "250Gi"
nodeSelector:
jvm: -Xmx12G -Xms12G -XX:ActiveProcessorCount=8
resources:
requests:
memory: "24Gi"
cpu: "4000m"
limits:
memory: "24Gi"
cpu: "8000m"
roles:
- "cluster_manager"
- "data"
persistence:
pvc:
accessModes: # You can change the accessMode
- ReadWriteOnce

View File

@ -1,22 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: opensearch-ingress
namespace: oa-opensearch
annotations:
# nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
# nginx.ingress.kubernetes.io/rewrite-target: /$1
kubernetes.io/ingress.class: nginx
spec:
ingressClassName: nginx
rules:
- host: "dashboard.opensearch.lot1.xyz"
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: opensearch-cluster-dashboards
port:
number: 5601

View File

@ -1,13 +0,0 @@
webapp:
ingress:
enabled: true
className: "nginx"
annotations:
kubernetes.io/ingress.class: nginx
hosts:
- host: localhost
paths:
- path: /
pathType: ImplementationSpecific
tls: []

View File

@ -1,69 +0,0 @@
#
#
#
useStandardNaming: true
createUserJob:
useHelmHooks: false
applyCustomEnv: false
migrateDatabaseJob:
useHelmHooks: false
applyCustomEnv: false
# Airflow executor
executor: "KubernetesExecutor"
# Secrets for all airflow containers
secret:
# - envName: ""
# secretName: ""
# secretKey: ""
#- envName: "AIRFLOW_CONN_S3"
# secretName: "minio"
# secretKey: "s3connection"
- envName: "AIRFLOW_CONN_S3_CONN"
secretName: "s3-conn-secrets"
secretKey: "AIRFLOW_CONN_S3_CONN"
dags:
persistence:
enabled: true
gitSync:
enabled: true
repo: "https://code-repo.d4science.org/giambattista.bloisi/lot1-kickoff.git"
branch: "airflow"
subPath: "airflow/dags"
config:
webserver:
expose_config: 'True' # by default this is 'False'
#base_url: "http://localhost/"
logging:
remote_logging: "True"
logging_level: "INFO"
remote_base_log_folder: "s3://lot1-airflow/logs"
remote_log_conn_id: "s3_conn"
encrypt_s3_logs: "False"
ingress:
enabled: true
## WARNING: set as "networking.k8s.io/v1beta1" for Kubernetes 1.18 and earlier
apiVersion: networking.k8s.io/v1
## airflow webserver ingress configs
web:
annotations: {}
host: "localhost"
path: "/"
## WARNING: requires Kubernetes 1.18 or later, use "kubernetes.io/ingress.class" annotation for older versions
ingressClassName: "nginx"
## flower ingress configs
flower:
annotations: {}
host: "localhost"
path: "/flower"
## WARNING: requires Kubernetes 1.18 or later, use "kubernetes.io/ingress.class" annotation for older versions
ingressClassName: "nginx"

View File

@ -1,457 +0,0 @@
###
# Root key for dynamically creating a secret for use with configuring root MinIO User
# Specify the ``name`` and then a list of environment variables.
#
# .. important::
#
# Do not use this in production environments.
# This field is intended for use with rapid development or testing only.
#
# For example:
#
# .. code-block:: yaml
#
# name: myminio-env-configuration
# accessKey: minio
# secretKey: minio123
#
secrets:
name: myminio-env-configuration
accessKey: minio
secretKey: minio123
###
# The name of an existing Kubernetes secret to import to the MinIO Tenant
# The secret must contain a key ``config.env``.
# The values should be a series of export statements to set environment variables for the Tenant.
# For example:
#
# .. code-block:: shell
#
# stringData:
# config.env: |-
# export MINIO_ROOT_USER=ROOTUSERNAME
# export MINIO_ROOT_PASSWORD=ROOTUSERPASSWORD
#
#existingSecret:
# name: myminio-env-configuration
###
# Root key for MinIO Tenant Chart
tenant:
###
# The Tenant name
#
# Change this to match your preferred MinIO Tenant name.
name: myminio
###
# Specify the Operator container image to use for the deployment.
# ``image.tag``
# For example, the following sets the image to the ``quay.io/minio/operator`` repo and the v5.0.12 tag.
# The container pulls the image if not already present:
#
# .. code-block:: yaml
#
# image:
# repository: quay.io/minio/minio
# tag: RELEASE.2024-02-09T21-25-16Z
# pullPolicy: IfNotPresent
#
# The chart also supports specifying an image based on digest value:
#
# .. code-block:: yaml
#
# image:
# repository: quay.io/minio/minio@sha256
# digest: 28c80b379c75242c6fe793dfbf212f43c602140a0de5ebe3d9c2a3a7b9f9f983
# pullPolicy: IfNotPresent
#
#
image:
repository: quay.io/minio/minio
tag: RELEASE.2024-02-09T21-25-16Z
pullPolicy: IfNotPresent
###
#
# An array of Kubernetes secrets to use for pulling images from a private ``image.repository``.
# Only one array element is supported at this time.
imagePullSecret: { }
###
# The Kubernetes `Scheduler <https://kubernetes.io/docs/concepts/scheduling-eviction/kube-scheduler/>`__ to use for dispatching Tenant pods.
#
# Specify an empty dictionary ``{}`` to dispatch pods with the default scheduler.
scheduler: { }
###
# The Kubernetes secret name that contains MinIO environment variable configurations.
# The secret is expected to have a key named config.env containing environment variables exports.
configuration:
name: myminio-env-configuration
###
# Top level key for configuring MinIO Pool(s) in this Tenant.
#
# See `Operator CRD: Pools <https://min.io/docs/minio/kubernetes/upstream/reference/operator-crd.html#pool>`__ for more information on all subfields.
pools:
###
# The number of MinIO Tenant Pods / Servers in this pool.
# For standalone mode, supply 1. For distributed mode, supply 4 or more.
# Note that the operator does not support upgrading from standalone to distributed mode.
- servers: 1
###
# Custom name for the pool
name: pool-0
###
# The number of volumes attached per MinIO Tenant Pod / Server.
volumesPerServer: 4
###
# The capacity per volume requested per MinIO Tenant Pod.
size: 1Gi
###
# The `storageClass <https://kubernetes.io/docs/concepts/storage/storage-classes/>`__ to associate with volumes generated for this pool.
#
# If using Amazon Elastic Block Store (EBS) CSI driver
# Please make sure to set xfs for "csi.storage.k8s.io/fstype" parameter under StorageClass.parameters.
# Docs: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/parameters.md
# storageClassName: standard
###
# Specify `storageAnnotations <https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/>`__ to associate to PVCs.
storageAnnotations: { }
###
# Specify `annotations <https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/>`__ to associate to Tenant pods.
annotations: { }
###
# Specify `labels <https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/>`__ to associate to Tenant pods.
labels: { }
###
#
# An array of `Toleration labels <https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/>`__ to associate to Tenant pods.
#
# These settings determine the distribution of pods across worker nodes.
tolerations: [ ]
###
# Any `Node Selectors <https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/>`__ to apply to Tenant pods.
#
# The Kubernetes scheduler uses these selectors to determine which worker nodes onto which it can deploy Tenant pods.
#
# If no worker nodes match the specified selectors, the Tenant deployment will fail.
nodeSelector: { }
###
#
# The `affinity <https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/>`__ or anti-affinity settings to apply to Tenant pods.
#
# These settings determine the distribution of pods across worker nodes and can help prevent or allow colocating pods onto the same worker nodes.
affinity: { }
###
#
# The `Requests or Limits <https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/>`__ for resources to associate to Tenant pods.
#
# These settings can control the minimum and maximum resources requested for each pod.
# If no worker nodes can meet the specified requests, the Operator may fail to deploy.
resources: { }
###
# The Kubernetes `SecurityContext <https://kubernetes.io/docs/tasks/configure-pod-container/security-context/>`__ to use for deploying Tenant resources.
#
# You may need to modify these values to meet your cluster's security and access settings.
#
# We recommend disabling recursive permission changes by setting ``fsGroupChangePolicy`` to ``OnRootMismatch`` as those operations can be expensive for certain workloads (e.g. large volumes with many small files).
securityContext:
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
fsGroupChangePolicy: "OnRootMismatch"
runAsNonRoot: true
###
# The Kubernetes `SecurityContext <https://kubernetes.io/docs/tasks/configure-pod-container/security-context/>`__ to use for deploying Tenant containers.
# You may need to modify these values to meet your cluster's security and access settings.
containerSecurityContext:
runAsUser: 1000
runAsGroup: 1000
runAsNonRoot: true
###
#
# An array of `Topology Spread Constraints <https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/>`__ to associate to Operator Console pods.
#
# These settings determine the distribution of pods across worker nodes.
topologySpreadConstraints: [ ]
###
#
# The name of a custom `Container Runtime <https://kubernetes.io/docs/concepts/containers/runtime-class/>`__ to use for the Operator Console pods.
# runtimeClassName: ""
###
# The mount path where Persistent Volumes are mounted inside Tenant container(s).
mountPath: /export
###
# The Sub path inside Mount path where MinIO stores data.
#
# .. warning::
#
# Treat the ``mountPath`` and ``subPath`` values as immutable once you deploy the Tenant.
# If you change these values post-deployment, then you may have different paths for new and pre-existing data.
# This can vastly increase operational complexity and may result in unpredictable data states.
subPath: /data
###
# Configures a Prometheus-compatible scraping endpoint at the specified port.
metrics:
enabled: false
port: 9000
protocol: http
###
# Configures external certificate settings for the Tenant.
certificate:
###
# Specify an array of Kubernetes TLS secrets, where each entry corresponds to a secret the TLS private key and public certificate pair.
#
# This is used by MinIO to verify TLS connections from clients using those CAs
# If you omit this and have clients using TLS certificates minted by an external CA, those connections may fail with warnings around certificate verification.
# See `Operator CRD: TenantSpec <https://min.io/docs/minio/kubernetes/upstream/reference/operator-crd.html#tenantspec>`__.
externalCaCertSecret: [ ]
###
# Specify an array of Kubernetes secrets, where each entry corresponds to a secret contains the TLS private key and public certificate pair.
#
# Omit this to use only the MinIO Operator autogenerated certificates.
#
# If you omit this field *and* set ``requestAutoCert`` to false, the Tenant starts without TLS.
#
# See `Operator CRD: TenantSpec <https://min.io/docs/minio/kubernetes/upstream/reference/operator-crd.html#tenantspec>`__.
#
# .. important::
#
# The MinIO Operator may output TLS connectivity errors if it cannot trust the Certificate Authority (CA) which minted the custom certificates.
#
# You can pass the CA to the Operator to allow it to trust that cert.
# See `Self-Signed, Internal, and Private Certificates <https://min.io/docs/minio/kubernetes/upstream/operations/network-encryption.html#self-signed-internal-and-private-certificates>`__ for more information.
# This step may also be necessary for globally trusted CAs where you must provide intermediate certificates to the Operator to help build the full chain of trust.
externalCertSecret: [ ]
###
# Enable automatic Kubernetes based `certificate generation and signing <https://kubernetes.io/docs/tasks/tls/managing-tls-in-a-cluster>`__
requestAutoCert: true
###
# This field is used only when ``requestAutoCert: true``.
# Use this field to set CommonName for the auto-generated certificate.
# MinIO defaults to using the internal Kubernetes DNS name for the pod
# The default DNS name format is typically ``*.minio.default.svc.cluster.local``.
#
# See `Operator CRD: CertificateConfig <https://min.io/docs/minio/kubernetes/upstream/reference/operator-crd.html#certificateconfig>`__
certConfig: { }
###
# MinIO features to enable or disable in the MinIO Tenant
# See `Operator CRD: Features <https://min.io/docs/minio/kubernetes/upstream/reference/operator-crd.html#features>`__.
features:
bucketDNS: false
domains: { }
enableSFTP: false
###
# Array of objects describing one or more buckets to create during tenant provisioning.
# Example:
#
# .. code-block:: yaml
#
# - name: my-minio-bucket
# objectLock: false # optional
# region: us-east-1 # optional
buckets: [ ]
###
# Array of Kubernetes secrets from which the Operator generates MinIO users during tenant provisioning.
#
# Each secret should specify the ``CONSOLE_ACCESS_KEY`` and ``CONSOLE_SECRET_KEY`` as the access key and secret key for that user.
users: [ ]
###
# The `PodManagement <https://kubernetes.io/docs/tutorials/stateful-application/basic-stateful-set/#pod-management-policy>`__ policy for MinIO Tenant Pods.
# Can be "OrderedReady" or "Parallel"
podManagementPolicy: Parallel
# The `Liveness Probe <https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes>`__ for monitoring Tenant pod liveness.
# Tenant pods will be restarted if the probe fails.
liveness: { }
###
# `Readiness Probe <https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/>`__ for monitoring Tenant container readiness.
# Tenant pods will be removed from service endpoints if the probe fails.
readiness: { }
###
# `Startup Probe <https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/>`__ for monitoring container startup.
# Tenant pods will be restarted if the probe fails.
# Refer
startup: { }
###
# Directs the Operator to deploy the MinIO S3 API and Console services as LoadBalancer objects.
#
# If the Kubernetes cluster has a configured LoadBalancer, it can attempt to route traffic to those services automatically.
#
# - Specify ``minio: true`` to expose the MinIO S3 API.
# - Specify ``console: true`` to expose the Console.
#
# Both fields default to ``false``.
exposeServices: { }
###
# The `Kubernetes Service Account <https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/>`__ associated with the Tenant.
serviceAccountName: ""
###
# Directs the Operator to add the Tenant's metric scrape configuration to an existing Kubernetes Prometheus deployment managed by the Prometheus Operator.
prometheusOperator: false
###
# Configure pod logging configuration for the MinIO Tenant.
#
# - Specify ``json`` for JSON-formatted logs.
# - Specify ``anonymous`` for anonymized logs.
# - Specify ``quiet`` to suppress logging.
#
# An example of JSON-formatted logs is as follows:
#
# .. code-block:: shell
#
# $ k logs myminio-pool-0-0 -n default
# {"level":"INFO","errKind":"","time":"2022-04-07T21:49:33.740058549Z","message":"All MinIO sub-systems initialized successfully"}
logging: { }
###
# serviceMetadata allows passing additional labels and annotations to MinIO and Console specific
# services created by the operator.
serviceMetadata: { }
###
# Add environment variables to be set in MinIO container (https://github.com/minio/minio/tree/master/docs/config)
env: [ ]
###
# PriorityClassName indicates the Pod priority and hence importance of a Pod relative to other Pods.
# This is applied to MinIO pods only.
# Refer Kubernetes documentation for details https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#priorityclass/
priorityClassName: ""
###
# An array of `Volumes <https://kubernetes.io/docs/concepts/storage/volumes/>`__ which the Operator can mount to Tenant pods.
#
# The volumes must exist *and* be accessible to the Tenant pods.
additionalVolumes: [ ]
###
# An array of volume mount points associated to each Tenant container.
#
# Specify each item in the array as follows:
#
# .. code-block:: yaml
#
# volumeMounts:
# - name: volumename
# mountPath: /path/to/mount
#
# The ``name`` field must correspond to an entry in the ``additionalVolumes`` array.
additionalVolumeMounts: [ ]
# Define configuration for KES (stateless and distributed key-management system)
# Refer https://github.com/minio/kes
#kes:
# ## Image field:
# # Image from tag (original behavior), for example:
# # image:
# # repository: quay.io/minio/kes
# # tag: 2024-01-11T13-09-29Z
# # Image from digest (added after original behavior), for example:
# # image:
# # repository: quay.io/minio/kes@sha256
# # digest: fb15af611149892f357a8a99d1bcd8bf5dae713bd64c15e6eb27fbdb88fc208b
# image:
# repository: quay.io/minio/kes
# tag: 2024-01-11T13-09-29Z
# pullPolicy: IfNotPresent
# env: [ ]
# replicas: 2
# configuration: |-
# address: :7373
# tls:
# key: /tmp/kes/server.key # Path to the TLS private key
# cert: /tmp/kes/server.crt # Path to the TLS certificate
# proxy:
# identities: []
# header:
# cert: X-Tls-Client-Cert
# admin:
# identity: ${MINIO_KES_IDENTITY}
# cache:
# expiry:
# any: 5m0s
# unused: 20s
# log:
# error: on
# audit: off
# keystore:
# # KES configured with fs (File System mode) doesn't work in Kubernetes environments and is not recommended
# # use a real KMS
# # fs:
# # path: "./keys" # Path to directory. Keys will be stored as files. Not Recommended for Production.
# vault:
# endpoint: "http://vault.default.svc.cluster.local:8200" # The Vault endpoint
# namespace: "" # An optional Vault namespace. See: https://www.vaultproject.io/docs/enterprise/namespaces/index.html
# prefix: "my-minio" # An optional K/V prefix. The server will store keys under this prefix.
# approle: # AppRole credentials. See: https://www.vaultproject.io/docs/auth/approle.html
# id: "<YOUR APPROLE ID HERE>" # Your AppRole Role ID
# secret: "<YOUR APPROLE SECRET ID HERE>" # Your AppRole Secret ID
# retry: 15s # Duration until the server tries to re-authenticate after connection loss.
# tls: # The Vault client TLS configuration for mTLS authentication and certificate verification
# key: "" # Path to the TLS client private key for mTLS authentication to Vault
# cert: "" # Path to the TLS client certificate for mTLS authentication to Vault
# ca: "" # Path to one or multiple PEM root CA certificates
# status: # Vault status configuration. The server will periodically reach out to Vault to check its status.
# ping: 10s # Duration until the server checks Vault's status again.
# # aws:
# # # The AWS SecretsManager key store. The server will store
# # # secret keys at the AWS SecretsManager encrypted with
# # # AWS-KMS. See: https://aws.amazon.com/secrets-manager
# # secretsmanager:
# # endpoint: "" # The AWS SecretsManager endpoint - e.g.: secretsmanager.us-east-2.amazonaws.com
# # region: "" # The AWS region of the SecretsManager - e.g.: us-east-2
# # kmskey: "" # The AWS-KMS key ID used to en/decrypt secrets at the SecretsManager. By default (if not set) the default AWS-KMS key will be used.
# # credentials: # The AWS credentials for accessing secrets at the AWS SecretsManager.
# # accesskey: "" # Your AWS Access Key
# # secretkey: "" # Your AWS Secret Key
# # token: "" # Your AWS session token (usually optional)
# imagePullPolicy: "IfNotPresent"
# externalCertSecret: null
# clientCertSecret: null
# # Key name to be created on the KMS, default is "my-minio-key"
# keyName: ""
# resources: { }
# nodeSelector: { }
# affinity:
# nodeAffinity: { }
# podAffinity: { }
# podAntiAffinity: { }
# tolerations: [ ]
# annotations: { }
# labels: { }
# serviceAccountName: ""
# securityContext:
# runAsUser: 1000
# runAsGroup: 1000
# runAsNonRoot: true
# fsGroup: 1000
###
# Configures `Ingress <https://kubernetes.io/docs/concepts/services-networking/ingress/>`__ for the Tenant S3 API and Console.
#
# Set the keys to conform to the Ingress controller and configuration of your choice.
ingress:
api:
enabled: true
ingressClassName: "nginx"
labels: { }
annotations:
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
tls: [ ]
host: minio.local
path: /
pathType: Prefix
console:
enabled: true
ingressClassName: "nginx"
labels: { }
annotations:
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
tls: [ ]
host: minio-console.local
path: /
pathType: Prefix
# Use an extraResources template section to include additional Kubernetes resources
# with the Helm deployment.
#extraResources:
# - |
# apiVersion: v1
# kind: Secret
# type: Opaque
# metadata:
# name: {{ dig "secrets" "existingSecret" "" (.Values | merge (dict)) }}
# stringData:
# config.env: |-
# export MINIO_ROOT_USER='minio'
# export MINIO_ROOT_PASSWORD='minio123'

View File

@ -1,57 +0,0 @@
opensearchCluster:
enabled: true
general:
serviceName: opensearch-cluster
version: 2.12.0
security:
config:
adminSecret:
name: opensearch-admin-certs
tls:
transport:
generate: false
perNode: false
secret:
name: opensearch-certs
nodesDn: ["CN=Opensearch_Node", ]
adminDn: ["CN=OpenSearch_Admin", ]
http:
generate: false
secret:
name: opensearch-certs
dashboards:
enable: true
version: 2.12.0
replicas: 1
resources:
requests:
memory: "512Mi"
cpu: "200m"
limits:
memory: "512Mi"
cpu: "200m"
tls:
enable: true
generate: false
secret:
name: opensearch-dashboards-certs
nodePools:
- component: nodes
replicas: 3
diskSize: "5Gi"
nodeSelector:
jvm: -Xmx1024M -Xms1024M
resources:
requests:
memory: "2Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "500m"
roles:
- "cluster_manager"
- "data"
persistence:
pvc:
accessModes: # You can change the accessMode
- ReadWriteOnce

View File

@ -1,6 +0,0 @@
clusterName: opensearch-cluster
extraEnvs:
- name: DISABLE_INSTALL_DEMO_CONFIG
value: "true"

View File

@ -1,11 +0,0 @@
env = "gcp"
kube_context= "rke2-cluster-0"
domain = "openaire.duckdns.org"
admin_user = "admin"
admin_password = "admin"
admin_hash = "$2y$10$Wd.mnnrDG01KJ42aVtC89.FdXOvyRm4RNfDfZ5F8k4r/fmSZgrIEq" # generate with htpasswd -bnBC 10 "" <admin_password>
s3_endpoint = "https://storage.googleapis.com"
s3_key= "google key"
s3_secret = "google secret"
# bucket skgif-openaire-eu

View File

@ -1,23 +0,0 @@
env = "local"
kube_context= "kind-local-dataplatform"
domain = "local-dataplatform"
admin_user = "admin"
admin_password = "admin"
admin_hash = "$2y$10$Wd.mnnrDG01KJ42aVtC89.FdXOvyRm4RNfDfZ5F8k4r/fmSZgrIEq" # generate with htpasswd -bnBC 10 "" <admin_password>
s3_endpoint = "https://minio.lot1-minio-tenant.svc.cluster.local"
s3_key= "minio"
s3_secret = "minio123"
/*
{
"type": "s3",
"settings": {
"bucket": "opensearch-repo",
"base_path": "lot1",
"endpoint": "https://minio.lot1-minio-tenant.svc.cluster.local",
"access_key": "minio",
"secret_key": "minio123"
}
}
*/

main.tf
View File

@ -1,28 +0,0 @@
/*module "minio" {
source = "./modules/minio"
kube_context = "kind-openaire-data-platform"
}*/
module "opensearch-cluster" {
source = "./modules/opensearch"
kube_context = var.kube_context
admin_user = var.admin_user
admin_password = var.admin_password
admin_hash = var.admin_hash
env = var.env
domain = var.domain
}
module "airflow" {
source = "./modules/airflow"
kube_context = var.kube_context
admin_user = var.admin_user
admin_password = var.admin_password
admin_hash = var.admin_hash
env = var.env
domain = var.domain
s3_endpoint = var.s3_endpoint
s3_key = var.s3_key
s3_secret = var.s3_secret
}

View File

@ -1,211 +0,0 @@
resource "kubernetes_namespace" "spark_jobs_namespace" {
metadata {
name = "${var.namespace_prefix}spark-jobs"
}
}
resource "kubernetes_service_account_v1" "spark_sa" {
metadata {
name = "spark"
namespace = "${var.namespace_prefix}spark-jobs"
}
}
resource "kubernetes_role" "airflow_spark_role" {
metadata {
name = "airflow-spark-role"
namespace = "${var.namespace_prefix}spark-jobs"
}
rule {
api_groups = ["sparkoperator.k8s.io"]
resources = ["sparkapplications", "sparkapplications/status",
"scheduledsparkapplications", "scheduledsparkapplications/status"]
verbs = ["*"]
}
rule {
api_groups = [""]
resources = ["pods/log"]
verbs = ["*"]
}
}
resource "kubernetes_role_binding_v1" "airflow_spark_role_binding" {
metadata {
name = "airflow-spark-role-binding"
namespace = "${var.namespace_prefix}spark-jobs"
}
subject {
kind = "ServiceAccount"
name = "airflow-worker"
namespace = "${var.namespace_prefix}airflow"
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "Role"
name = "airflow-spark-role"
}
}
resource "kubernetes_role_binding_v1" "airflow_spark_role_binding2" {
metadata {
name = "airflow-spark-role-binding2"
namespace = "${var.namespace_prefix}spark-jobs"
}
subject {
kind = "ServiceAccount"
name = "airflow-worker"
namespace = "${var.namespace_prefix}airflow"
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "Role"
name = "spark-role"
}
}
resource "kubernetes_role_binding_v1" "spark_role_binding" {
metadata {
name = "spark-role-binding"
namespace = "${var.namespace_prefix}spark-jobs"
}
subject {
kind = "ServiceAccount"
name = "spark"
namespace = "${var.namespace_prefix}spark-jobs"
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "Role"
name = "spark-role"
}
}
resource "helm_release" "gcp_spark_operator" {
depends_on = [kubernetes_namespace.spark_jobs_namespace]
name = "gcp-spark-operator"
chart = "spark-operator"
repository = "https://kubeflow.github.io/spark-operator"
create_namespace = "true"
namespace = "${var.namespace_prefix}gcp-spark-operator"
dependency_update = "true"
version = "1.1.27"
set {
name = "sparkJobNamespace"
value = "${var.namespace_prefix}spark-jobs"
}
set {
name = "enableWebhook"
value = "true"
}
set {
name = "ingressUrlFormat"
value = "\\{\\{$appName\\}\\}.\\{\\{$appNamespace\\}\\}.${var.domain}"
type = "string"
}
}
resource "kubernetes_namespace" "airflow" {
metadata {
name = "${var.namespace_prefix}airflow"
}
}
resource "kubernetes_secret" "s3_conn_secrets" {
depends_on = [kubernetes_namespace.airflow]
metadata {
name = "s3-conn-secrets"
namespace = "${var.namespace_prefix}airflow"
}
data = {
username = var.s3_key
password = var.s3_secret
AIRFLOW_CONN_S3_CONN = <<EOT
{
"conn_type": "aws",
"extra": {
"aws_access_key_id": "${var.s3_key}",
"aws_secret_access_key": "${var.s3_secret}",
"endpoint_url": "${var.s3_endpoint}",
"verify": false
}
}
EOT
}
type = "Opaque"
}
resource "helm_release" "airflow" {
depends_on = [kubernetes_secret.s3_conn_secrets]
name = "airflow"
chart = "airflow"
repository = "https://airflow.apache.org"
namespace = "${var.namespace_prefix}airflow"
dependency_update = "true"
version = "1.13.0"
values = [
file("./envs/${var.env}/airflow.yaml")
]
set {
name = "fernetkey"
value = "TG9mVjJvVEpoREVYdmdTRWlHdENXQ05zOU5OU2VGY0U="
}
set {
name = "webserver.defaultUser.password"
value = var.admin_password
}
set {
name = "spec.values.env"
value = yamlencode([
{
name = "AIRFLOW__WEBSERVER__BASE_URL",
value = "https://airflow.${var.domain}"
},
{
name = "AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX",
value = "True"
}
])
}
set {
name = "images.airflow.repository"
value = "gbloisi/airflow"
}
set {
name = "images.airflow.tag"
value = "2.8.3rc1-python3.11"
}
set {
name = "ingress.web.host"
value = "airflow.${var.domain}"
}
set {
name = "ingress.flower.host"
value = "airflow.${var.domain}"
}
}
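A minimal DAG sketch, not part of the repository, showing how the s3_conn connection injected above through AIRFLOW_CONN_S3_CONN could be exercised from a DAG placed under airflow/dags in the git-synced repository; the DAG id is a placeholder.
from datetime import datetime
from airflow.decorators import dag, task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

@dag(schedule=None, start_date=datetime(2024, 1, 1), catchup=False, tags=["smoke-test"])
def s3_conn_smoke_test():
    @task
    def list_buckets():
        # Uses the connection id configured via the AIRFLOW_CONN_S3_CONN secret above.
        hook = S3Hook(aws_conn_id="s3_conn")
        print(hook.get_conn().list_buckets())
    list_buckets()

s3_conn_smoke_test()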

View File

@ -1,12 +0,0 @@
provider "helm" {
# Several Kubernetes authentication methods are possible: https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs#authentication
kubernetes {
config_path = pathexpand(var.kube_config)
config_context = var.kube_context
}
}
provider "kubernetes" {
config_path = pathexpand(var.kube_config)
config_context = var.kube_context
}

View File

@ -1,51 +0,0 @@
variable "env" {
type = string
default = "local"
}
variable "kube_config" {
type = string
default = "~/.kube/config"
}
variable "kube_context" {
type = string
default = "default"
}
variable "namespace_prefix" {
type = string
default = "lot1-"
}
variable "domain" {
type = string
default = "local-dataplatform"
}
variable "s3_endpoint" {
type = string
default = "https://minio.lot1-minio-tenant.svc.cluster.local"
}
variable "s3_key" {
type = string
default = "minio"
}
variable "s3_secret" {
type = string
default = "minio123"
}
variable "admin_user" {
type = string
}
variable "admin_password" {
type = string
}
variable "admin_hash" {
type = string
}

View File

@ -1,34 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: create-bucket
namespace: block-storage
spec:
template:
spec:
containers:
- name: createbucket
image: amazon/aws-cli
command: ["aws"]
args:
- s3api
- create-bucket
- --bucket
- postgres
- --endpoint-url
- http://minio:80
env:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: minio-secret
key: accesskey
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: minio-secret
key: secretkey
restartPolicy: Never
backoffLimit: 1
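An equivalent sketch in Python, not part of the repository: the same bucket can be created with boto3 (an extra dependency) against the MinIO endpoint used by the Job above; the access and secret key placeholders stand in for the values stored in the minio-secret.
import boto3
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:80",        # same endpoint as the Job above
    aws_access_key_id="<accesskey>",       # placeholder for minio-secret/accesskey
    aws_secret_access_key="<secretkey>",   # placeholder for minio-secret/secretkey
)
s3.create_bucket(Bucket="postgres")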

View File

@ -1,9 +0,0 @@
resource "helm_release" "minio_operator" {
name = "minio-operator"
chart = "operator"
repository = "https://operator.min.io/"
create_namespace = "true"
namespace = "minio-operator"
dependency_update = "true"
version = "5.0.12"
}

View File

@ -1,59 +0,0 @@
resource "helm_release" "minio_tenant" {
name = "minio-tenant"
chart = "tenant"
repository = "https://operator.min.io/"
create_namespace = "true"
namespace = "${var.namespace_prefix}minio-tenant"
dependency_update = "true"
version = "5.0.12"
values = [
file("./envs/${var.env}/minio-tenant.yaml")
]
set {
name = "ingress.api.host"
value = "minio.${var.domain}"
}
set {
name = "ingress.console.host"
value = "console-minio.${var.domain}"
}
}
/*
resource "kubernetes_manifest" "minio_ingress" {
manifest = yamldecode(<<YAML
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: ingress-minio
namespace: block-storage
annotations:
kubernetes.io/ingress.class: "nginx"
## Remove if using CA signed certificate
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
nginx.ingress.kubernetes.io/rewrite-target: /
nginx.ingress.kubernetes.io/proxy-body-size: "0"
spec:
ingressClassName: nginx
tls:
- hosts:
- minio.${var.domain}
secretName: nginx-tls
rules:
- host: minio.${var.domain}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: minio
port:
number: 443
YAML
)
}*/

View File

@ -1,12 +0,0 @@
provider "helm" {
# Several Kubernetes authentication methods are possible: https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs#authentication
kubernetes {
config_path = pathexpand(var.kube_config)
config_context = var.kube_context
}
}
provider "kubernetes" {
config_path = pathexpand(var.kube_config)
config_context = var.kube_context
}

View File

@ -1,24 +0,0 @@
variable "env" {
type = string
default = "local"
}
variable "kube_config" {
type = string
default = "~/.kube/config"
}
variable "kube_context" {
type = string
default = "default"
}
variable "namespace_prefix" {
type = string
default = "lot1-"
}
variable "domain" {
type = string
default = "local-dataplatform"
}

View File

@ -1,152 +0,0 @@
resource "kubernetes_manifest" "opensearch_issuer" {
depends_on = [kubernetes_namespace.opensearch_cluster]
manifest = yamldecode(<<YAML
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
name: selfsigned-issuer
namespace: "${var.namespace_prefix}opensearch-cluster"
spec:
selfSigned: {}
YAML
)
}
resource "kubernetes_manifest" "opensearch_ca_certificate" {
depends_on = [kubernetes_namespace.opensearch_cluster]
manifest = yamldecode(<<YAML
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: ca-certificate
namespace: "${var.namespace_prefix}opensearch-cluster"
spec:
secretName: ca-cert
duration: 9000h # ~1year
renewBefore: 360h # 15d
commonName: Test CA
isCA: true
privateKey:
size: 2048
usages:
- digital signature
- key encipherment
issuerRef:
name: selfsigned-issuer
YAML
)
}
resource "kubernetes_manifest" "opensearch_ca_issuer" {
depends_on = [kubernetes_namespace.opensearch_cluster]
manifest = yamldecode(<<YAML
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
name: ca-issuer
namespace: "${var.namespace_prefix}opensearch-cluster"
spec:
ca:
secretName: ca-cert
YAML
)
}
resource "kubernetes_manifest" "opensearch_cluster_certificate" {
depends_on = [kubernetes_namespace.opensearch_cluster]
manifest = yamldecode(<<YAML
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: opensearch-certs
namespace: "${var.namespace_prefix}opensearch-cluster"
spec:
secretName: opensearch-certs
duration: 9000h # ~1year
renewBefore: 360h # 15d
isCA: false
privateKey:
size: 2048
algorithm: RSA
encoding: PKCS8
dnsNames:
- opensearch-cluster.${var.domain}
- opensearch-cluster
- opensearch-cluster-masters-0
- opensearch-cluster-masters-1
- opensearch-cluster-masters-2
- opensearch-cluster-bootstrap-0
usages:
- signing
- key encipherment
- server auth
- client auth
commonName: Opensearch_Node
issuerRef:
name: ca-issuer
YAML
)
}
resource "kubernetes_manifest" "opensearch_admin_certificate" {
depends_on = [kubernetes_namespace.opensearch_cluster]
manifest = yamldecode(<<YAML
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: opensearch-admin-certs
namespace: "${var.namespace_prefix}opensearch-cluster"
spec:
secretName: opensearch-admin-certs
duration: 9000h # ~1year
renewBefore: 360h # 15d
isCA: false
privateKey:
size: 2048
algorithm: RSA
encoding: PKCS8
commonName: OpenSearch_Admin
usages:
- signing
- key encipherment
- server auth
- client auth
issuerRef:
name: ca-issuer
YAML
)
}
resource "kubernetes_manifest" "opensearch_dashboard_certificate" {
depends_on = [kubernetes_namespace.opensearch_cluster]
manifest = yamldecode(<<YAML
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: opensearch-dashboards-certs
namespace: "${var.namespace_prefix}opensearch-cluster"
spec:
secretName: opensearch-dashboards-certs
duration: 9000h # ~1year
renewBefore: 360h # 15d
isCA: false
privateKey:
size: 2048
algorithm: RSA
encoding: PKCS8
dnsNames:
- opensearch-cluster-dashboards
usages:
- signing
- key encipherment
- server auth
- client auth
issuerRef:
name: ca-issuer
YAML
)
}

View File

@ -1,358 +0,0 @@
resource "kubernetes_manifest" "opensearch_securityconfig_secret" {
depends_on = [kubernetes_namespace.opensearch_cluster]
computed_fields = ["stringData"]
manifest = yamldecode(<<YAML
apiVersion: v1
kind: Secret
metadata:
name: securityconfig-secret
namespace: "${var.namespace_prefix}opensearch-cluster"
type: Opaque
stringData:
action_groups.yml: |-
_meta:
type: "actiongroups"
config_version: 2
internal_users.yml: |-
_meta:
type: "internalusers"
config_version: 2
admin:
hash: "${var.admin_hash}"
reserved: true
backend_roles:
- "admin"
description: "Demo admin user"
dashboarduser:
hash: "${var.admin_hash}"
reserved: true
description: "Demo OpenSearch Dashboards user"
nodes_dn.yml: |-
_meta:
type: "nodesdn"
config_version: 2
whitelist.yml: |-
_meta:
type: "whitelist"
config_version: 2
tenants.yml: |-
_meta:
type: "tenants"
config_version: 2
roles_mapping.yml: |-
_meta:
type: "rolesmapping"
config_version: 2
all_access:
reserved: false
backend_roles:
- "admin"
description: "Maps admin to all_access"
own_index:
reserved: false
users:
- "*"
description: "Allow full access to an index named like the username"
readall:
reserved: false
backend_roles:
- "readall"
manage_snapshots:
reserved: false
backend_roles:
- "snapshotrestore"
dashboard_server:
reserved: true
users:
- "dashboarduser"
roles.yml: |-
_meta:
type: "roles"
config_version: 2
dashboard_read_only:
reserved: true
security_rest_api_access:
reserved: true
# Allows users to view monitors, destinations and alerts
alerting_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/alerting/alerts/get'
- 'cluster:admin/opendistro/alerting/destination/get'
- 'cluster:admin/opendistro/alerting/monitor/get'
- 'cluster:admin/opendistro/alerting/monitor/search'
# Allows users to view and acknowledge alerts
alerting_ack_alerts:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/alerting/alerts/*'
# Allows users to use all alerting functionality
alerting_full_access:
reserved: true
cluster_permissions:
- 'cluster_monitor'
- 'cluster:admin/opendistro/alerting/*'
index_permissions:
- index_patterns:
- '*'
allowed_actions:
- 'indices_monitor'
- 'indices:admin/aliases/get'
- 'indices:admin/mappings/get'
# Allow users to read Anomaly Detection detectors and results
anomaly_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/ad/detector/info'
- 'cluster:admin/opendistro/ad/detector/search'
- 'cluster:admin/opendistro/ad/detectors/get'
- 'cluster:admin/opendistro/ad/result/search'
- 'cluster:admin/opendistro/ad/tasks/search'
- 'cluster:admin/opendistro/ad/detector/validate'
- 'cluster:admin/opendistro/ad/result/topAnomalies'
# Allows users to use all Anomaly Detection functionality
anomaly_full_access:
reserved: true
cluster_permissions:
- 'cluster_monitor'
- 'cluster:admin/opendistro/ad/*'
index_permissions:
- index_patterns:
- '*'
allowed_actions:
- 'indices_monitor'
- 'indices:admin/aliases/get'
- 'indices:admin/mappings/get'
# Allows users to read Notebooks
notebooks_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/notebooks/list'
- 'cluster:admin/opendistro/notebooks/get'
# Allows users to all Notebooks functionality
notebooks_full_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/notebooks/create'
- 'cluster:admin/opendistro/notebooks/update'
- 'cluster:admin/opendistro/notebooks/delete'
- 'cluster:admin/opendistro/notebooks/get'
- 'cluster:admin/opendistro/notebooks/list'
# Allows users to read observability objects
observability_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opensearch/observability/get'
# Allows users to all Observability functionality
observability_full_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opensearch/observability/create'
- 'cluster:admin/opensearch/observability/update'
- 'cluster:admin/opensearch/observability/delete'
- 'cluster:admin/opensearch/observability/get'
# Allows users to read and download Reports
reports_instances_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/reports/instance/list'
- 'cluster:admin/opendistro/reports/instance/get'
- 'cluster:admin/opendistro/reports/menu/download'
# Allows users to read and download Reports and Report-definitions
reports_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/reports/definition/get'
- 'cluster:admin/opendistro/reports/definition/list'
- 'cluster:admin/opendistro/reports/instance/list'
- 'cluster:admin/opendistro/reports/instance/get'
- 'cluster:admin/opendistro/reports/menu/download'
# Allows users to all Reports functionality
reports_full_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/reports/definition/create'
- 'cluster:admin/opendistro/reports/definition/update'
- 'cluster:admin/opendistro/reports/definition/on_demand'
- 'cluster:admin/opendistro/reports/definition/delete'
- 'cluster:admin/opendistro/reports/definition/get'
- 'cluster:admin/opendistro/reports/definition/list'
- 'cluster:admin/opendistro/reports/instance/list'
- 'cluster:admin/opendistro/reports/instance/get'
- 'cluster:admin/opendistro/reports/menu/download'
# Allows users to use all asynchronous-search functionality
asynchronous_search_full_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/asynchronous_search/*'
index_permissions:
- index_patterns:
- '*'
allowed_actions:
- 'indices:data/read/search*'
# Allows users to read stored asynchronous-search results
asynchronous_search_read_access:
reserved: true
cluster_permissions:
- 'cluster:admin/opendistro/asynchronous_search/get'
# Allows users to use all index_management actions - ISM policies, rollups, transforms
index_management_full_access:
reserved: true
cluster_permissions:
- "cluster:admin/opendistro/ism/*"
- "cluster:admin/opendistro/rollup/*"
- "cluster:admin/opendistro/transform/*"
index_permissions:
- index_patterns:
- '*'
allowed_actions:
- 'indices:admin/opensearch/ism/*'
# Allows users to use all cross cluster replication functionality at leader cluster
cross_cluster_replication_leader_full_access:
reserved: true
index_permissions:
- index_patterns:
- '*'
allowed_actions:
- "indices:admin/plugins/replication/index/setup/validate"
- "indices:data/read/plugins/replication/changes"
- "indices:data/read/plugins/replication/file_chunk"
# Allows users to use all cross cluster replication functionality at follower cluster
cross_cluster_replication_follower_full_access:
reserved: true
cluster_permissions:
- "cluster:admin/plugins/replication/autofollow/update"
index_permissions:
- index_patterns:
- '*'
allowed_actions:
- "indices:admin/plugins/replication/index/setup/validate"
- "indices:data/write/plugins/replication/changes"
- "indices:admin/plugins/replication/index/start"
- "indices:admin/plugins/replication/index/pause"
- "indices:admin/plugins/replication/index/resume"
- "indices:admin/plugins/replication/index/stop"
- "indices:admin/plugins/replication/index/update"
- "indices:admin/plugins/replication/index/status_check"
config.yml: |-
_meta:
type: "config"
config_version: "2"
config:
dynamic:
http:
anonymous_auth_enabled: false
authc:
basic_internal_auth_domain:
http_enabled: true
transport_enabled: true
order: "4"
http_authenticator:
type: basic
challenge: true
authentication_backend:
type: intern
YAML
)
}
resource "kubernetes_secret" "opensearch_admin_credential_secrets" {
depends_on = [kubernetes_namespace.opensearch_cluster]
metadata {
name = "admin-credentials-secret"
namespace = "${var.namespace_prefix}opensearch-cluster"
}
data = {
username = "admin"
password = var.admin_password
}
type = "Opaque"
}
resource "helm_release" "opensearch-cluster" {
depends_on = [helm_release.opensearch-operator, kubernetes_namespace.opensearch_cluster, kubernetes_manifest.opensearch_cluster_certificate, kubernetes_manifest.opensearch_dashboard_certificate]
chart = "opensearch-cluster"
name = "opensearch-cluster"
namespace = "${var.namespace_prefix}opensearch-cluster"
create_namespace = false
repository = "https://opensearch-project.github.io/opensearch-k8s-operator/"
version = "2.5.1"
values = [
file("./envs/${var.env}/opensearch-cluster.yaml")
]
}
resource "kubernetes_manifest" "opensearch_dashboard_ingress" {
depends_on = [helm_release.opensearch-cluster]
manifest = yamldecode(<<YAML
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: opensearch-dashboard-ingress
namespace: "${var.namespace_prefix}opensearch-cluster"
annotations:
kubernetes.io/ingress.class: nginx
cert-manager.io/cluster-issuer: cert-manager-webhook-duckdns-staging
spec:
ingressClassName: nginx
tls:
- hosts:
- "opensearch-cluster-dashboards.${var.domain}"
secretName: "opensearch-cluster-dashboards-tls-secret-staging"
rules:
- host: "opensearch-cluster-dashboards.${var.domain}"
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: opensearch-cluster-dashboards
port:
number: 5601
YAML
)
}
resource "kubernetes_manifest" "opensearch_ingress" {
depends_on = [helm_release.opensearch-cluster]
manifest = yamldecode(<<YAML
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: opensearch-ingress
namespace: "${var.namespace_prefix}opensearch-cluster"
annotations:
nginx.ingress.kubernetes.io/proxy-ssl-verify: "false"
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
kubernetes.io/ingress.class: nginx
cert-manager.io/cluster-issuer: cert-manager-webhook-duckdns-staging
spec:
ingressClassName: nginx
tls:
- hosts:
- "opensearch-cluster.${var.domain}"
secretName: "opensearch-cluster-tls-secret-staging"
rules:
- host: "opensearch-cluster.${var.domain}"
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: opensearch-cluster
port:
number: 9200
YAML
)
}

View File

@@ -1,37 +0,0 @@
resource "kubernetes_namespace" "opensearch_operator" {
metadata {
name = "${var.namespace_prefix}opensearch-operator"
}
}
resource "kubernetes_namespace" "opensearch_cluster" {
metadata {
name = "${var.namespace_prefix}opensearch-cluster"
}
}
resource "helm_release" "opensearch-operator" {
depends_on = [kubernetes_namespace.opensearch_operator, kubernetes_namespace.opensearch_cluster]
chart = "opensearch-operator"
name = "opensearch-operator"
namespace = "${var.namespace_prefix}opensearch-operator"
create_namespace = false
repository = "https://opensearch-project.github.io/opensearch-k8s-operator/"
version = "2.5.1"
set {
name = "manager.watchNamespace"
value = "${var.namespace_prefix}opensearch-cluster"
}
# You can provide a map of values using yamlencode. Don't forget to escape the dot before the last element in the name (an alternative using the values argument is sketched at the end of this file)
set {
name = "manager\\.extraEnv"
value = yamlencode({
name = "SKIP_INIT_CONTAINER",
value = "true"
})
}
}
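# Sketch referenced in the comment above. It is an assumption, not part of the original
# deployment: instead of per-key set blocks with an escaped dot, the same nested values
# could be passed through the "values" argument, letting yamlencode build the YAML
# document for the chart. It assumes the chart reads manager.watchNamespace and
# manager.extraEnv (as a list of env-var objects), and it is shown as an illustrative
# replacement for the resource above, not something to apply alongside it.
resource "helm_release" "opensearch-operator-values-sketch" {
  chart      = "opensearch-operator"
  name       = "opensearch-operator"
  namespace  = "${var.namespace_prefix}opensearch-operator"
  repository = "https://opensearch-project.github.io/opensearch-k8s-operator/"
  version    = "2.5.1"
  values = [
    yamlencode({
      manager = {
        watchNamespace = "${var.namespace_prefix}opensearch-cluster"
        extraEnv = [
          {
            name  = "SKIP_INIT_CONTAINER"
            value = "true"
          }
        ]
      }
    })
  ]
}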

View File

@@ -1,24 +0,0 @@
terraform {
required_providers {
bcrypt = {
source = "viktorradnai/bcrypt"
version = "0.1.2"
}
}
}
provider "bcrypt" {
}
provider "helm" {
# Several Kubernetes authentication methods are possible: https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs#authentication (a token-based sketch follows this block)
kubernetes {
config_path = pathexpand(var.kube_config)
config_context = var.kube_context
}
}
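# Sketch referenced in the comment above: a minimal token-based configuration, shown
# as an illustrative replacement for the kubeconfig-based block, not as a second
# provider to declare alongside it. The variables cluster_endpoint,
# cluster_ca_certificate and cluster_token are hypothetical and are not defined in
# this repository.
provider "helm" {
  kubernetes {
    host                   = var.cluster_endpoint
    cluster_ca_certificate = base64decode(var.cluster_ca_certificate)
    token                  = var.cluster_token
  }
}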
provider "kubernetes" {
config_path = pathexpand(var.kube_config)
config_context = var.kube_context
}

View File

@@ -1,37 +0,0 @@
variable "env" {
type = string
default = "local"
}
variable "kube_config" {
type = string
default = "~/.kube/config"
}
variable "kube_context" {
type = string
default = "default"
}
variable "namespace_prefix" {
type = string
default = "lot1-"
}
variable "domain" {
type = string
default = "local-dataplatform"
}
variable "admin_user" {
type = string
}
variable "admin_password" {
type = string
}
variable "admin_hash" {
type = string
}

View File

@@ -1,46 +0,0 @@
variable "env" {
type = string
default = "local"
}
variable "kube_config" {
type = string
default = "~/.kube/config"
}
variable "kube_context" {
type = string
default = "default"
}
variable "namespace_prefix" {
type = string
default = "lot1-"
}
variable "domain" {
type = string
default = "local-dataplatform"
}
variable "admin_user" {
type = string
}
variable "admin_password" {
type = string
}
variable "admin_hash" {
type = string
}
variable "s3_endpoint" {
default = "https://minio.lot1-minio-tenant.svc.cluster.local"
}
variable "s3_key" {
default = "minio"
}
variable "s3_secret" {
default = "minio123"
}