lot1-kickoff/airflow/dags/opensearch_indexes.py

594 lines
15 KiB
Python
Raw Normal View History

# Registry of explicit field mappings, keyed by index/entity name.
# NOTE(review): presumably consumed as OpenSearch index mappings (going by the
# file name) -- the code that reads this dict is not visible here; confirm.
mappings = {}

# Mapping for 'datasources' documents.
mappings['datasources'] = {
    "properties": {
        "data_source_classification": {"type": "keyword"},
        "jurisdiction": {"type": "keyword"},
        "local_identifier": {"type": "keyword"},
        # Text field that also exposes a "keyword" sub-field.
        "name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "research_product_type": {"type": "keyword"},
        "thematic": {"type": "boolean"},
        "version_control": {"type": "boolean"},

        # The fields below were not inferred automatically; they are declared
        # by hand.
        "submission_policy_url": {"type": "text"},
        "preservation_policy_url": {"type": "text"},
        "persistent_identity_systems": {
            "type": "nested",
            "properties": {
                "product_type": {"type": "keyword"},
                "pid_schemes": {"type": "keyword"},
            },
        },
        "research_product_license": {
            "type": "nested",
            "properties": {
                "name": {"type": "text"},
                "url": {"type": "text"},
            },
        },
        "research_product_access_policy": {"type": "keyword"},
        "research_metadata_license": {
            "type": "nested",
            "properties": {
                "name": {"type": "text"},
                "url": {"type": "text"},
            },
        },
        "research_metadata_access_policy": {"type": "keyword"},
    }
}
# Mapping for 'venues' documents.
mappings['venues'] = {
    "properties": {
        # List of (scheme, value) identifier pairs, mapped as nested objects.
        "identifiers": {
            "type": "nested",
            "properties": {
                "scheme": {"type": "keyword"},
                "value": {"type": "keyword"},
            },
        },
        "local_identifier": {"type": "keyword"},
        # Text fields that also expose a "keyword" sub-field.
        "name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "publisher": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "type": {"type": "keyword"},
    }
}
# Mapping for 'topics' documents.
mappings['topics'] = {
    "properties": {
        # List of (scheme, value) identifier pairs, mapped as nested objects.
        "identifiers": {
            "type": "nested",
            "properties": {
                "scheme": {"type": "keyword"},
                "value": {"type": "keyword"},
            },
        },
        "local_identifier": {"type": "keyword"},
        # Text field that also exposes a "keyword" sub-field.
        "name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
    }
}
# Mapping for 'persons' documents.
mappings['persons'] = {
    "properties": {
        # The three name fields are text, each with a "keyword" sub-field.
        "family_name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "full_name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "given_name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        # List of (scheme, value) identifier pairs, mapped as nested objects.
        "identifiers": {
            "type": "nested",
            "properties": {
                "scheme": {"type": "keyword"},
                "value": {"type": "keyword"},
            },
        },
        "local_identifier": {"type": "keyword"},
    }
}
# Mapping for 'organizations' documents.
mappings['organizations'] = {
    "properties": {
        "country": {"type": "keyword"},
        # List of (scheme, value) identifier pairs, mapped as nested objects.
        "identifiers": {
            "type": "nested",
            "properties": {
                "scheme": {"type": "keyword"},
                "value": {"type": "keyword"},
            },
        },
        "local_identifier": {"type": "keyword"},
        # The name variants are text, each with a "keyword" sub-field.
        "name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "other_names": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "short_name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "type": {"type": "keyword"},
    }
}
# Mapping for 'grants' documents.
mappings['grants'] = {
    "properties": {
        # TODO: could be keyword only??
        "acronym": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "currency": {"type": "keyword"},
        # ignore_malformed is a boolean mapping parameter, so pass a real
        # bool rather than the string "true".
        "end_date": {
            "type": "date",
            "ignore_malformed": True,
        },
        "funded_amount": {"type": "double"},
        "funder": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "funding_stream": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        # NOTE(review): camelCase, unlike the snake_case sibling fields --
        # presumably mirrors the key used in the indexed documents; confirm.
        "grantCode": {"type": "keyword"},
        # List of (scheme, value) identifier pairs, mapped as nested objects.
        "identifiers": {
            "type": "nested",
            "properties": {
                "scheme": {"type": "keyword"},
                "value": {"type": "keyword"},
            },
        },
        "local_identifier": {"type": "keyword"},
        "start_date": {
            "type": "date",
            "ignore_malformed": True,
        },
        "summary": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "title": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "website": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
    }
}
# Mapping for 'products' documents.
mappings['products'] = {
    "properties": {
        "abstracts": {
            "type": "object",
            "properties": {
                # TODO: other languages via index templates
                "none": {"type": "text"},
            },
        },
        "contributions": {
            "type": "nested",
            "properties": {
                "person": {
                    "type": "nested",
                    "properties": {
                        "full_name": {
                            "type": "text",
                            "fields": {"keyword": {"type": "keyword"}},
                        },
                        "local_identifier": {"type": "keyword"},
                        "orcid": {"type": "keyword"},
                    },
                },
                # TODO: confirm with Miriam -- not mapped automatically
                "declared_affiliations": {"type": "keyword"},
                "rank": {"type": "long"},
                # TODO: confirm with Miriam -- not mapped automatically
                "roles": {"type": "keyword"},
            },
        },
        "funding": {
            "type": "nested",
            "properties": {
                "code": {"type": "keyword"},
                "funder": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword"}},
                },
                "local_identifier": {"type": "keyword"},
                "title": {"type": "text"},
            },
        },
        # List of (scheme, value) identifier pairs, mapped as nested objects.
        "identifiers": {
            "type": "nested",
            "properties": {
                "scheme": {"type": "keyword"},
                "value": {"type": "keyword"},
            },
        },
        "local_identifier": {"type": "keyword"},
        "manifestations": {
            "type": "object",
            "properties": {
                "access_right": {"type": "keyword"},
                "biblio": {
                    "type": "object",
                    "properties": {
                        "end_page": {"type": "text"},
                        "publisher": {"type": "text"},
                        "start_page": {"type": "text"},
                        "volume": {"type": "text"},
                        # TODO: unmapped values
                        "issue": {"type": "text"},
                        "edition": {"type": "text"},
                        "number": {"type": "text"},
                        "venue": {"type": "text"},
                        "hosting_data_source": {"type": "text"},
                    },
                },
                # NOTE(review): unlike the grants date fields, "value" has no
                # ignore_malformed setting -- confirm this is intentional.
                "dates": {
                    "type": "nested",
                    "properties": {
                        "type": {"type": "text"},
                        "value": {"type": "date"},
                    },
                },
                # TODO: should be in biblio ???
                "hosting_datasource": {
                    "type": "object",
                    "properties": {
                        "local_identifier": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256,
                                },
                            },
                        },
                        "name": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256,
                                },
                            },
                        },
                    },
                },
                "licence": {"type": "text"},
                "metadata_curation": {"type": "keyword"},
                "peer_review": {"type": "keyword"},
                "pid": {"type": "keyword"},
                "product_local_type": {"type": "text"},
                "product_local_type_schema": {"type": "text"},
                "url": {"type": "text"},
                "venue": {
                    "type": "nested",
                    "properties": {
                        "local_identifier": {"type": "keyword"},
                        "name": {"type": "text"},
                    },
                },
            },
        },
        "product_type": {"type": "keyword"},
        "related_products": {
            "type": "nested",
            "properties": {
                "product_list": {
                    "type": "nested",
                    "properties": {
                        "doi": {"type": "keyword"},
                        "local_identifier": {"type": "keyword"},
                        "title": {"type": "text"},
                    },
                },
                "relation_type": {"type": "keyword"},
            },
        },
        "relevant_organizations": {
            "type": "nested",
            "properties": {
                "isni": {"type": "keyword"},
                "local_identifier": {"type": "keyword"},
                "name": {"type": "text"},
                "ror": {"type": "keyword"},
                "wikidata": {"type": "keyword"},
            },
        },
        "titles": {
            "type": "object",
            # TODO: other languages ??
            "properties": {
                "none": {"type": "text"},
            },
        },
        "topics": {
            "type": "nested",
            "properties": {
                "provenance": {
                    "type": "nested",
                    "properties": {
                        "trust": {"type": "double"},
                        "type": {"type": "keyword"},
                    },
                },
                "topic": {
                    "type": "nested",
                    "properties": {
                        "local_identifier": {"type": "keyword"},
                        # TODO: name of the topic???
                        "value": {"type": "keyword"},
                    },
                },
            },
        },
    }
}