Compare commits
2 Commits
main
...
d4science_
Author | SHA1 | Date |
---|---|---|
Alessio Fabrizio | 925db5d411 | |
Alessio Fabrizio | 5c483d60a6 |
|
@ -13,14 +13,31 @@ from ckan.plugins.core import SingletonPlugin
|
|||
from ckanext.spatial.lib.csw_client import CswService
|
||||
from ckanext.spatial.harvesters.csw import CSWHarvester
|
||||
|
||||
from ckanext.spatial.harvested_metadata import ISODocument
|
||||
from ckanext.spatial.harvested_metadata import ISOElement
|
||||
#from ckanext.spatial.harvested_metadata import ISODocument
|
||||
#from ckanext.spatial.harvested_metadata import ISOElement
|
||||
#in spatial branch 2.10-support ISOObjects are located Here
|
||||
from ckanext.spatial.model.harvested_metadata import ISODocument
|
||||
from ckanext.spatial.model.harvested_metadata import ISOElement
|
||||
|
||||
from ckan.logic import ValidationError, NotFound, get_action
|
||||
|
||||
from ckan.common import config
|
||||
from datetime import datetime
|
||||
|
||||
#add dependencies updated to python3
|
||||
import xml.etree.ElementTree as ElementTree
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from ckanext.spatial.model.harvested_metadata import ISOResourceLocator
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Keys of the GeoNetwork-specific "extras" attached to each harvested package.
GIS_GEONETWORK_METADATA_SOURCE = 'gis_geonetwork:GN_Metadata_Source'

GIS_GEONETWORK_METADATA_SHOW = 'gis_geonetwork:GN_Metadata_Show'

GIS_GEONETWORK_GN_URL = 'gis_geonetwork:GN_URL'

# Module-level logger, shared by every class/function in this module.
log = logging.getLogger(__name__)

# Extend the ISODocument definitions by adding some more useful elements
|
@ -35,6 +52,72 @@ ISODocument.elements.append(
|
|||
multiplicity="*",
|
||||
))
|
||||
|
||||
# D4S MAPPING FOR AUTHOR IS THE ORGANIZATION NAME
log.info('GeoNetwork harvester: extending ISODocument with organisation-name-responsible-party')
ISODocument.elements.append(
    ISOElement(
        # Organisation names of the metadata contact; consumed by
        # infer_authors() to fill the package "author" field.
        name="organisation-name-responsible-party",
        search_paths=[
            "gmd:contact/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString/text()",
        ],
        multiplicity="*",
    ))
|
||||
|
||||
# D4S MAPPING FOR MAINTAINERS
log.info('GeoNetwork harvester: extending ISODocument with identification-info-responsible-party')
ISODocument.elements.append(
    ISOElement(
        # Point-of-contact of the identification section; consumed by
        # infer_point_of_contacts() to build name/email extras.
        name="identification-info-responsible-party",
        search_paths=[
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:pointOfContact/gmd:CI_ResponsibleParty",
        ],
        multiplicity="*",
        elements=[
            # Person name(s) of the responsible party.
            ISOElement(
                name="individualName",
                search_paths=[
                    "gmd:individualName/gco:CharacterString/text()",
                ],
                multiplicity="*",
            ),
            # Contact e-mail address(es).
            ISOElement(
                name="email",
                search_paths=[
                    "gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString/text()",
                ],
                multiplicity="*",
            ),
            # Online resource locator(s) of the contact.
            ISOResourceLocator(
                name="online-resource",
                search_paths=[
                    "gmd:contactInfo/gmd:CI_Contact/gmd:onlineResource/gmd:CI_OnlineResource",
                ],
                multiplicity="*",
            ),
        ]
    ))
|
||||
|
||||
# D4S MAPPING FOR MD_FeatureCatalogueDescription
ISODocument.elements.append(
    ISOElement(
        # ISO 19110 feature catalogue citation parties; consumed by
        # infer_featurecataloguedescription_iso110() to build CKAN resources.
        name="featurecataloguedescription-responsible-party",
        search_paths=[
            "gmd:contentInfo/gmd:MD_FeatureCatalogueDescription/gmd:featureCatalogueCitation/gmd:CI_Citation/gmd:citedResponsibleParty/gmd:CI_ResponsibleParty",
        ],
        multiplicity="*",
        elements=[
            # Online resource locator(s) of the cited responsible party.
            ISOResourceLocator(
                name="online-resource",
                search_paths=[
                    "gmd:contactInfo/gmd:CI_Contact/gmd:onlineResource/gmd:CI_OnlineResource",
                ],
                multiplicity="*",
            ),
        ]
    ))
|
||||
|
||||
# Some old GN instances still uses the old GML URL
|
||||
# We'll add more xpath for addressing this issue
|
||||
log.info('GeoNetwork harvester: adding old GML URI')
|
||||
|
@ -52,9 +135,78 @@ for element in ISODocument.elements:
|
|||
element.search_paths.append(newpath)
|
||||
log.info("Added old URI for gml to %s", element.name)
|
||||
|
||||
# custom classes
class Harvest_Session(dict):
    """A dict that answers None for absent keys instead of raising KeyError.

    Used as a scratch cache shared across one harvest job run.
    """

    def __missing__(self, _key):
        # Invoked by dict.__getitem__ on a missing key: report "no value"
        # without storing anything.
        return None
|
||||
|
||||
class D4S_HTTP_Request_Util:
    """Small best-effort helpers around urllib for HTTP calls."""

    # Returns body response as string if the request is ok, None otherwise
    @staticmethod
    def get_response_body(uri, data=None, headers=None):
        """Perform an HTTP request and return the decoded body, or None on error.

        :param uri: target URL.
        :param data: optional request body. A str is encoded as UTF-8 because
            urllib.request requires a bytes body (the original code passed a
            str straight through, which raises TypeError on POST).
        :param headers: optional dict of request headers.
        :returns: response body decoded as UTF-8, or None on HTTP/URL errors.
        """
        if headers is None:
            # was a mutable default argument ({}), shared across calls
            headers = {}
        if isinstance(data, str):
            data = data.encode("utf-8")
        log.debug("Performing request to uri: %s" % uri)
        log.debug("headers are: %s" % headers)
        log.debug("data passed as body are: %s" % data)
        req = urllib.request.Request(uri, data=data, headers=headers)
        try:
            # context manager closes the socket (the original leaked it)
            with urllib.request.urlopen(req, timeout=2) as resp:
                body = resp.read()
        except urllib.error.HTTPError as e:
            log.error("Error on contacting URI: %s" % uri)
            log.error("HTTPError: %d" % e.code)
            return None
        except urllib.error.URLError:
            log.error("URLError - Input URI: %s is not valid!!" % uri)
            return None
        return body.decode("utf-8")  # decode to obtain a (UTF-8) string

    # Returns True if status of the http request is Successful 2xx, False otherwise
    @staticmethod
    def check_url(uri):
        """Return True when a GET on *uri* answers with a 2xx success status.

        Any network, URL or unexpected error is logged and reported as False
        (deliberately best-effort: callers only need a yes/no).
        """
        try:
            with urllib.request.urlopen(uri) as resp:
                status_code = resp.getcode()
            if 200 <= status_code <= 206:
                return True

            log.error("Error on contacting URI: %s" % uri)
            return False
        except urllib.error.HTTPError as e:
            log.error("Error on contacting URI: %s" % uri)
            log.error("HTTPError: %d" % e.code)
            return False
        except Exception:
            log.error("check_url fail on: %s " % uri)
            return False
|
||||
|
||||
class D4S_IS_Resource_Discovery:
    """Fetches a gCube IS resource (XML) through the IC proxy service."""

    # query-string parameter name carrying the gCube authorization token
    gcubeTokenParam = "gcube-token"

    def __init__(self, urlICProxy, resourceID, gcubeToken):
        """Store the IC proxy base URL, the resource id and the gCube token."""
        self.urlICProxy = urlICProxy
        self.resourceID = resourceID
        self.gcubeToken = gcubeToken

    def performRequest(self):
        """Fetch and parse the resource; return an Element, or None on failure."""
        uri = ""
        try:
            token_param = D4S_IS_Resource_Discovery.gcubeTokenParam
            uri = f"{self.urlICProxy}/{self.resourceID}?{token_param}={self.gcubeToken}"
            log.debug("Contacting URL: %s" % uri)
            theResource = D4S_HTTP_Request_Util.get_response_body(uri)
            log.debug("Resource returned %s " % theResource)
            return ElementTree.XML(theResource) if theResource else None
        except Exception:
            # best-effort: any failure (network, parse) degrades to None
            log.error("Error on performing the request from uri: {}".format(uri))
            log.debug("Returning None")
            return None
|
||||
|
||||
class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
|
||||
|
||||
catalogue_resolver = None
|
||||
harvest_session = Harvest_Session()
|
||||
|
||||
def info(self):
|
||||
return {
|
||||
'name': 'geonetwork',
|
||||
|
@ -63,6 +215,408 @@ class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
|
|||
'form_config_interface': 'Text'
|
||||
}
|
||||
|
||||
#custom class methods
|
||||
def add_geonetwork_informations_to_package(self, gn_localized_url, package_dict, harvest_object, harvest_session):
    """Add GeoNetwork-specific extras (GN URL, metadata show/source links) to
    the package, discovering the GeoNetwork base URL via a CSW GetCapabilities
    request and caching the result per harvest job in *harvest_session*.

    Returns *package_dict*, possibly unchanged when discovery fails.
    """
    namespaces = {'ows': "http://www.opengis.net/ows", "xlink": "http://www.w3.org/1999/xlink"}

    gn_url_session_key = self._get_session_key(harvest_object, "the_geonetwork_url")

    # Fast path: the GeoNetwork URL was already resolved earlier in this job.
    if harvest_session.get(gn_url_session_key):
        log.info('Harvest session for current job, the_geonetwork_url: %s, is_geonetwork_3: %s',
                 harvest_session.get(gn_url_session_key),
                 harvest_session.get(self._get_session_key(harvest_object, "is_geonetwork_3")))
        return self._add_geoentworks_links(harvest_session.get(gn_url_session_key), package_dict, harvest_object,
                                           harvest_session)

    if gn_localized_url:
        # CSW GetCapabilities: the ows:ProviderSite element advertises the
        # GeoNetwork base URL.
        get_capabilities_request = "request=GetCapabilities&service=CSW&acceptVersions=2.0.2&acceptFormats=application%2Fxml"
        resp = D4S_HTTP_Request_Util.get_response_body(gn_localized_url + "?" + get_capabilities_request)

        if resp:
            try:
                # Decode the content if necessary
                if isinstance(resp, bytes):
                    resp = resp.decode("utf-8")

                root = ElementTree.fromstring(resp)

                geonetwork_url = None
                # NOTE(review): with several ProviderSite elements the last
                # one wins — presumably there is only ever one; confirm.
                for provideSites in root.findall('.//ows:ProviderSite', namespaces):
                    geonetwork_url = provideSites.get('{http://www.w3.org/1999/xlink}href')
                    log.info("Read geonetwork_url from GetCapabilities: %s", geonetwork_url)
                    if geonetwork_url:
                        parsedUrl = urlparse(geonetwork_url)
                        # Drop explicit default-ish ports 80 or 8080
                        if parsedUrl.port in {80, 8080}:
                            geonetwork_url = f"{parsedUrl.scheme}://{parsedUrl.hostname}{parsedUrl.path}"
                            log.info("Removed port 80 or 8080 from geonetwork_url: %s", geonetwork_url)

                if geonetwork_url:
                    # Cache for the rest of the job and probe whether the
                    # instance answers at the GN3 UI path.
                    harvest_session[gn_url_session_key] = geonetwork_url
                    is_gn3_session_key = self._get_session_key(harvest_object, "is_geonetwork_3")
                    harvest_session[is_gn3_session_key] = D4S_HTTP_Request_Util.check_url(
                        geonetwork_url + "/srv/eng/catalog.search#/home")
                    log.info('Updated Harvest session for current job %s', harvest_session)
                    package_dict = self._add_geoentworks_links(geonetwork_url, package_dict, harvest_object, harvest_session)

            except Exception as err:
                # best-effort: a broken GetCapabilities must not abort harvesting
                log.warning("No Geonetwork informations added!! Error on parsing the get capabilities: %s", err)
                return package_dict

        return package_dict

    return package_dict
|
||||
|
||||
def _add_geoentworks_links(self, geonetwork_url, package_dict, harvest_object, harvest_session):
    """Append the three GeoNetwork link extras (GN URL, metadata show,
    metadata source) to the package, choosing the GN2 or GN3 "show" URL
    according to the cached is_geonetwork_3 flag.
    """
    geonetwork_service_url = geonetwork_url + '/srv/en'

    # TODO check if package_dict['extras'] is corret/works
    is_gn3 = harvest_session.get(self._get_session_key(harvest_object, "is_geonetwork_3"))
    if is_gn3:
        log.debug('GN3 Service URL is %s', geonetwork_service_url)
        show_url = geonetwork_service_url + '/catalog.search#/metadata/' + harvest_object.guid
    else:
        log.debug('GN2 Service URL is %s', geonetwork_service_url)
        show_url = geonetwork_service_url + '/metadata.show?uuid=' + harvest_object.guid

    extras = package_dict['extras']
    extras.append({'key': GIS_GEONETWORK_GN_URL, 'value': geonetwork_url})
    extras.append({'key': GIS_GEONETWORK_METADATA_SHOW, 'value': show_url})
    extras.append({'key': GIS_GEONETWORK_METADATA_SOURCE,
                   'value': geonetwork_service_url + '/xml.metadata.get?uuid=' + harvest_object.guid})

    return package_dict
|
||||
|
||||
def _get_session_key(self, harvest_object, key):
|
||||
'''Returns a session key for the harvest job running'''
|
||||
return harvest_object.job.id + "_key_" + key
|
||||
|
||||
# Added by Francesco Mangiacrapa
|
||||
def add_item_url_to_package(self, gn_localized_url, package_dict):
    """Add the D4Science 'Item URL' extra, resolving it through the catalogue
    resolver service discovered on the gCube IS (via the IC proxy).

    :param gn_localized_url: harvest source URL (may embed the gCube scope).
    :param package_dict: package dict being built; returned, possibly extended.
    """
    log.debug("add_item_url_to_package")

    set_item_url = config.get('ckan.d4science_theme.harvesting_set_item_url')

    # explicit False in configuration disables the feature entirely
    if set_item_url is not None and not set_item_url:
        log.info("set item url configuration is False, returning")
        return package_dict

    if not self.catalogue_resolver:
        urlICProxy = config.get('ckan.d4science_theme.ic_proxy_url')  # e.g. "https://registry.d4science.org/icproxy/gcube/service"
        resourceID = config.get('ckan.d4science_theme.ckandatacatalogue_resourceid')  # CkanDataCatalogue GR
        if not resourceID:
            # NOTE: the original code assigned a DEV id and immediately
            # overwrote it; the PROD id was the effective fallback, kept here.
            resourceID = "2e067010-3d97-11e8-bcb7-f39deee66c72"  # PROD CkanDataCatalogue GR
            log.warning("'ckan.d4science_theme.ckandatacatalogue_resourceid' not found into configuration. Hard-cabling CkanDataCatalogue GR resourceID: " + resourceID)

        gcubeToken = config.get('ckan.d4science_theme.application_token')  # The gCube Token

        log.debug("urlICProxy: " + urlICProxy)
        log.debug("resourceID: " + resourceID)
        log.debug("gcubeToken: " + gcubeToken)

        disc = D4S_IS_Resource_Discovery(urlICProxy, resourceID, gcubeToken)
        response = disc.performRequest()
        # print ElementTree.tostring(response)

        # BUG FIX: an Element with no children is falsy, so `if response:`
        # would wrongly skip valid answers — test against None instead.
        if response is not None:
            # BUG FIX: xml.etree Elements have no .xpath() (lxml-only).
            # Emulate the original XPath
            # '/Resource/Profile/AccessPoint/Properties/Property/Name[text()="URL_RESOLVER"]/../Value'
            # with ElementTree-compatible find calls (response is the
            # <Resource> root, so the path is relative).
            the_end_points = [
                prop.find('Value')
                for prop in response.findall('Profile/AccessPoint/Properties/Property')
                if prop.findtext('Name') == 'URL_RESOLVER' and prop.find('Value') is not None
            ]

            if the_end_points:
                self.catalogue_resolver = the_end_points[0].text
                log.info("Found catalogue_resolver: %s" % self.catalogue_resolver)

    if self.catalogue_resolver:
        url_split = gn_localized_url.split("/")

        the_scope = None
        # ONLY IF THE URL IS A GEONETWORK RESOLVER LINK, I TRY TO GET THE (GCUBE) SCOPE
        # BUG FIX: index 4 needs at least 5 parts (was `>= 4`: IndexError)
        if len(url_split) >= 5 and 'csw' not in url_split:
            the_scope = url_split[4]
            the_scope = the_scope.replace("|", "/")
            the_scope = the_scope.replace("%7C", "/")
            the_scope = the_scope if the_scope.startswith("/") else "/" + the_scope

        log.debug("Found the scope: %s" % the_scope)

        the_item_url = None

        if the_scope:
            query = '{"gcube_scope" : "' + the_scope + '","entity_context" : "dataset", "entity_name" : "' + package_dict["name"] + '"}'
            headers = {"Content-Type": "application/json"}
            # encode: urllib.request requires a bytes body for POST data
            the_item_url = D4S_HTTP_Request_Util.get_response_body(self.catalogue_resolver, query.encode("utf-8"), headers)

            try:
                # Python 3 change: use urllib.request.urlopen instead of urllib2.urlopen, check extras
                with urllib.request.urlopen(the_item_url) as response:
                    if response.getcode() == 200:
                        package_dict['extras'].append({'key': 'Item URL', 'value': the_item_url})
                        log.info("Added Item URL: %s" % the_item_url)
            except Exception:
                log.warning(u"No Item URL added!! Error on performing the request from uri: {}".format(the_item_url))
                return package_dict

    return package_dict
|
||||
|
||||
def add_license_to_package(self, package_dict):
    # Added by Francesco Mangiacrapa
    """Fill license_id/license_title when the harvested 'licence' extra is empty.

    Resolution order for both fields: harvester source config, then CKAN
    config (production.ini), then hard-coded CC-BY-SA-4.0 defaults.
    """
    k_license_id = 'license_id'
    k_license_title = 'license_title'
    k_harvest_license_id = 'ckan.d4science_theme.harvest_license_id'
    k_harvest_license_title = 'ckan.d4science_theme.harvest_license_title'

    try:
        # --- license id: source config -> production.ini -> default
        v_license_id = self.source_config.get(k_license_id)
        log.debug(f'Read {k_license_id} as {v_license_id} from input configuration parameter')
        if v_license_id is None:
            v_license_id = config.get(k_harvest_license_id)
            log.debug(f'Read {k_harvest_license_id} as {v_license_id} from production.ini')
        if v_license_id is None:
            v_license_id = 'CC-BY-SA-4.0'
            log.debug(f'Using default {k_license_id} {v_license_id}')

        # --- license title: source config -> production.ini -> default
        v_license_title = self.source_config.get(k_license_title)
        log.debug(f'Read {k_license_title} as {v_license_title} from input configuration parameter')
        if v_license_title is None:
            v_license_title = config.get(k_harvest_license_title)
            log.debug(f'Read {k_harvest_license_title} as {v_license_title} from production.ini')
        if v_license_title is None:
            v_license_title = ('Creative Commons Attribution Share-Alike 4.0'
                               if v_license_id == 'CC-BY-SA-4.0'
                               else 'Unknown License Title')
            log.debug(f'Using default {k_license_title} {v_license_title}')

        # pick the harvested 'licence' extra, if any
        licence_v = None
        for extra in package_dict['extras']:
            if extra.get('key') == 'licence':
                licence_v = extra['value']
                log.debug(f'licence value in extra field has value {licence_v}')
                break

        # only override when the harvested licence is empty/absent
        if not licence_v or licence_v == '[]':
            package_dict[k_license_id] = v_license_id
            package_dict[k_license_title] = v_license_title
            log.debug(f'license_id has value: {package_dict[k_license_id]}')
            log.debug(f'license_title has value: {package_dict[k_license_title]}')

    except Exception as inst:
        log.warning(f"Impossible to add the license_id: {str(inst)}")
        return package_dict

    return package_dict
|
||||
|
||||
def add_systemtype_to_package(self, package_dict):
    # ADDED BY FRANCESCO MANGIACRAPRA
    # Task #8726
    """Append the system-type extra to the package.

    The field name falls back: source config -> CKAN config -> 'system:type'.
    The value falls back: source config -> harvested 'resource-type' extra ->
    CKAN config -> 'Dataset'; it is always capitalized before being stored.
    """
    type_key = self.source_config.get('systemtypefield')
    type_value = self.source_config.get('systemtypevalue')

    log.debug(f'Read systemtypefield {type_key} from input configuration parameter')
    if type_key is None:
        type_key = config.get('ckan.d4science_theme.systemtypefield')
        log.debug(f'Read ckan.d4science_theme.systemtypefield {type_key} from production.ini')
    if type_key is None:
        type_key = 'system:type'
        log.debug(f'Using default systemtypefield {type_key}')

    log.debug(f'Read systemtypevalue {type_value} from input configuration parameter')
    if type_value is None:
        # Task #9281
        # Fall back to the harvested "resource-type" extra, which carries the
        # "gmd:hierarchyLevel/gmd:MD_ScopeCode" codelist code. ISO 19139 (GMX
        # codelist) values include: attribute, attributeType,
        # collectionHardware, collectionSession, dataset, series,
        # nonGeographicDataset, dimensionGroup, feature, featureType,
        # propertyType, fieldSession, software, service, model, tile,
        # initiative, stereomate, sensor, platformSeries, sensorSeries,
        # productionSeries, transferAggregate, otherAggregate.
        for extra in package_dict['extras']:
            if extra.get('key') == 'resource-type':
                type_value = extra['value']
                log.debug(f'resource-type value in extra field has value {type_value}')
                break

        if not type_value:
            type_value = config.get('ckan.d4science_theme.harvestingsystemtypevalue')
            log.debug(f'Read ckan.d4science_theme.harvestingsystemtypevalue {type_value} from production.ini')
        if not type_value:
            type_value = 'Dataset'
            log.debug(f'Using hard-coded value for system:type {type_value}')

    type_value = type_value.capitalize()
    log.debug(f"adding key: {type_key} capitalized value: {type_value}")
    package_dict['extras'].append({'key': type_key, 'value': type_value})
    return package_dict
|
||||
|
||||
# Added by Francesco Mangiacrapa
|
||||
def infer_authors(self, values):
    """Return the unique organisation names found as responsible parties.

    :param values: ISO values dict; may lack the
        'organisation-name-responsible-party' key entirely (robustness fix:
        the original raised KeyError in that case).
    :returns: list of author names, first-occurrence order preserved.
    """
    names = values.get("organisation-name-responsible-party") or []
    # dict.fromkeys de-duplicates while preserving insertion order
    authors = list(dict.fromkeys(names))

    log.info(f"{len(authors)} AUTHOR found as OrganisationName in ResponsibleParty")
    return authors
|
||||
def infer_point_of_contacts(self, values):
    """Collect a name/email dict for every identification point of contact."""
    contacts = []
    for party in values["identification-info-responsible-party"]:
        contact = {}
        log.debug(f"responsible_party: {party}")

        # both sub-elements are lists (multiplicity "*"); join them when present
        names = party.get('individualName')
        if names:
            contact['name'] = ', '.join(names)

        emails = party.get('email')
        if emails:
            contact['email'] = ', '.join(emails)

        log.debug(f"Adding point_of_contact: {contact}")
        contacts.append(contact)

    log.info(f"{len(contacts)} Point of Contact/s found as Contact-Info in ResponsibleParty")
    return contacts
|
||||
|
||||
# Added by Francesco Mangiacrapa
|
||||
def infer_featurecataloguedescription_iso110(self, values):
    """Build one url/description/name dict per ISO 19110 feature catalogue entry."""
    descriptions = []
    for catalogue in values["featurecataloguedescription-responsible-party"]:
        entry = {}
        log.debug(f"Feature catalogue description: {catalogue}")

        if 'online-resource' in catalogue:
            locators = catalogue['online-resource']
            log.debug(f"Feature resource list is: {locators}")

            for locator in locators:
                log.debug(f"Feature resource is: {locator}")
                if 'url' in locator:
                    entry['url'] = locator['url']
                if 'description' in locator:
                    entry['description'] = locator['description']
                # NOTE: a later locator without a name resets a previous name
                entry['name'] = locator['name'] if 'name' in locator else "Unnamed Resource"

        log.debug(f"Adding feature catalogue description: {entry}")
        descriptions.append(entry)

    log.info(f"{len(descriptions)} Feature Catalogue description added")
    return descriptions
|
||||
|
||||
def add_topic_category_to_group(self, iso_values, package_dict, the_user='harvest'):
    """Store ISO topic categories as a 'topic_category' extra and, when a CKAN
    group with the same (lowercased) name already exists, add the dataset to it.

    Disabled when the source config sets add_topic_category_to_group to "False".
    Returns *package_dict*.
    """
    add_iso_categories = self.source_config.get('add_topic_category_to_group')
    if add_iso_categories == "False":
        return package_dict

    if 'topic-category' in iso_values:
        log.debug('iso_values contains topic category')
        topic_category_values = ""
        if len(iso_values['topic-category']) > 0:
            topic_category_values = ", ".join(iso_values['topic-category'])

        package_dict['extras'].append({'key': 'topic_category', 'value': topic_category_values})
        log.debug(f'topic_category: {topic_category_values}')

        # Adding to groups if the group as iso-category already exists
        for iso_cat in iso_values['topic-category']:
            log.debug(f"Adding topic_category: {iso_cat} as group")
            isocat_name_lower = iso_cat.lower()
            if 'groups' in package_dict:
                # case-insensitive membership check against current groups
                group_found = False
                for group in package_dict['groups']:
                    if group['name'].lower() == isocat_name_lower:
                        group_found = True
                        break

                if not group_found:
                    # NOTE(review): 'model' and 'Session' are assumed to be
                    # imported elsewhere in this module — confirm.
                    context = {'model': model, 'session': Session, 'user': the_user}
                    try:
                        # group_show raises NotFound when no such group exists;
                        # only then is the dataset NOT added to it.
                        data_dict = {'id': isocat_name_lower}
                        get_action('group_show')(context, data_dict)
                        package_dict['groups'].append({'name': isocat_name_lower})
                        log.info(f'Dataset added to group: {isocat_name_lower}')
                    except NotFound as e:
                        log.warning(f'Group {iso_cat} from category {iso_cat} is not available: {e}')
            else:
                log.debug(f'creating groups into dictionary and assigning to group: {iso_cat}')
                package_dict['groups'] = [{'name': isocat_name_lower}]
    else:
        log.info('iso_values does not contain topic category')

    if 'groups' in package_dict:
        log.info(f'groups are: {package_dict["groups"]}')

    return package_dict
|
||||
|
||||
def add_as_resources(self, package_dict, resource_list):
|
||||
if not resource_list:
|
||||
return package_dict
|
||||
|
||||
for resource in resource_list:
|
||||
the_resource = {}
|
||||
if 'name' in resource:
|
||||
the_resource['name'] = resource['name']
|
||||
if 'description' in resource:
|
||||
the_resource['description'] = resource['description']
|
||||
if 'url' in resource:
|
||||
the_resource['url'] = resource['url']
|
||||
if 'format' in resource:
|
||||
the_resource['format'] = resource['format']
|
||||
|
||||
package_dict['resources'].append(the_resource)
|
||||
log.info(f'Added resource {the_resource} to resources')
|
||||
|
||||
return package_dict
|
||||
|
||||
# Added by Francesco Mangiacrapa
|
||||
def add_as_tag(self, package_dict, tag_list):
|
||||
if not tag_list:
|
||||
return package_dict
|
||||
|
||||
for tag_name in tag_list:
|
||||
package_dict['tags'].append({'name': tag_name})
|
||||
log.info(f'Added tag {tag_name}')
|
||||
|
||||
return package_dict
|
||||
|
||||
# Added by Francesco Mangiacrapa
|
||||
def validate_tags(self, package_dict):
|
||||
new_tags = []
|
||||
# log.info("New Tags working")
|
||||
if package_dict['tags']:
|
||||
# log.info("Tags are: %s"%package_dict['tags'])
|
||||
check_duplicated_keys = {}
|
||||
for tag in package_dict['tags']:
|
||||
try:
|
||||
if tag['name'] and tag['name'] not in check_duplicated_keys:
|
||||
check_duplicated_keys[tag['name']] = 1
|
||||
purged_tag_name = re.sub('[^A-Za-z0-9 ._-]+', ' ', tag['name'])
|
||||
purged_tag_name = purged_tag_name.strip()
|
||||
new_tags.append({'name': purged_tag_name})
|
||||
# log.info(f"New Tag: {purged_tag_name}")
|
||||
else:
|
||||
log.info(f"Skipping None or Duplicated Tag: {tag['name']}")
|
||||
|
||||
except Exception as e: # Use 'as' for exception handling in Python 3
|
||||
log.error(f"error validating tag: {e}")
|
||||
|
||||
if len(new_tags) > 0:
|
||||
package_dict['tags'] = new_tags
|
||||
# log.info(f"New Tags are: {package_dict['tags']}")
|
||||
|
||||
return package_dict
|
||||
|
||||
def get_package_dict(self, iso_values, harvest_object):
|
||||
|
||||
package_dict = super(GeoNetworkHarvester, self).get_package_dict(iso_values, harvest_object)
|
||||
|
@ -92,21 +646,81 @@ class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
|
|||
harvest_job_id=str(harvest_object.job.id),
|
||||
harvest_object_id=str(harvest_object.id),
|
||||
guid=str(harvest_object.guid))
|
||||
package_dict['extras'].append({'key': key, 'value': value})
|
||||
#add check
|
||||
if key != "contact-email":
|
||||
package_dict['extras'].append({'key': key, 'value': value})
|
||||
else:
|
||||
log.debug('Skipping existing extra %s', key)
|
||||
|
||||
# Add GeoNetwork specific extras
|
||||
gn_localized_url = harvest_object.job.source.url.strip('/')
|
||||
|
||||
if gn_localized_url[-3:] == 'csw':
|
||||
gn_localized_url = gn_localized_url[:-3]
|
||||
# this code is not needed(?)
|
||||
#if gn_localized_url[-3:] == 'csw':
|
||||
# gn_localized_url = gn_localized_url[:-3]
|
||||
|
||||
log.debug('GN localized URL %s', gn_localized_url)
|
||||
#log.debug('Package dict is %r ', package_dict['extras'])
|
||||
|
||||
package_dict['extras'].append({'key': 'gn_view_metadata_url', 'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid})
|
||||
package_dict['extras'].append({'key': 'gn_localized_url', 'value': gn_localized_url})
|
||||
# not in d4science but in geonetwork 2.10
|
||||
#package_dict['extras'].append({'key': 'gn_view_metadata_url', 'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid})
|
||||
#package_dict['extras'].append({'key': 'gn_localized_url', 'value': gn_localized_url})
|
||||
|
||||
#d4science code
|
||||
package_dict = self.add_geonetwork_informations_to_package(gn_localized_url, package_dict, harvest_object, self.harvest_session)
|
||||
package_dict = self.add_item_url_to_package(gn_localized_url, package_dict)
|
||||
authors = self.infer_authors(iso_values)
|
||||
|
||||
# Adding Authors
|
||||
if authors:
|
||||
package_dict['author'] = ",".join(authors)
|
||||
log.debug(f'Author/s are: {package_dict["author"]}')
|
||||
|
||||
# Adding Logged User as Maintainer
|
||||
user_logged = super(GeoNetworkHarvester, self)._get_user_name()
|
||||
if user_logged:
|
||||
package_dict['maintainer'] = user_logged
|
||||
main_mail = config.get('ckan.admin_email')
|
||||
if main_mail:
|
||||
package_dict['maintainer_email'] = main_mail
|
||||
|
||||
# Adding Point of Contacts
|
||||
point_of_contacts = self.infer_point_of_contacts(iso_values)
|
||||
for idx, item in enumerate(point_of_contacts):
|
||||
log.debug(f"point_of_contact: {item}")
|
||||
|
||||
poc_value = ''
|
||||
if 'name' in item:
|
||||
poc_value = item['name']
|
||||
|
||||
if 'email' in item:
|
||||
if len(poc_value) > 1:
|
||||
poc_value += f', {item["email"]}'
|
||||
else:
|
||||
poc_value += item['email']
|
||||
|
||||
poc_key = 'point_of_contact'
|
||||
|
||||
if len(point_of_contacts) > 1:
|
||||
poc_key += f' {idx + 1}'
|
||||
|
||||
if poc_value and len(poc_value) > 1:
|
||||
package_dict['extras'].append({'key': poc_key, 'value': poc_value})
|
||||
log.debug(f'Added point of contact: {poc_key} {poc_value}')
|
||||
|
||||
package_dict = self.add_topic_category_to_group(iso_values, package_dict, user_logged)
|
||||
package_dict = self.add_systemtype_to_package(package_dict)
|
||||
package_dict = self.add_license_to_package(package_dict)
|
||||
fc_descriptions = self.infer_featurecataloguedescription_iso110(iso_values)
|
||||
|
||||
# Adding List of Feature Catalogue Description as CKAN RESOURCES
|
||||
package_dict = self.add_as_resources(package_dict, fc_descriptions)
|
||||
|
||||
# If the previous list is not empty, It adds the label 'Feature Catalogue Resource' as CKAN TAG
|
||||
if len(fc_descriptions) > 0:
|
||||
package_dict = self.add_as_tag(package_dict, ['Feature Catalog Resource'])
|
||||
|
||||
package_dict = self.validate_tags(package_dict)
|
||||
|
||||
# Add other elements from ISO metadata
|
||||
time_extents = self.infer_timeinstants(iso_values)
|
||||
|
@ -193,14 +807,15 @@ class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
|
|||
|
||||
def fix_resource_type(self, resources):
|
||||
for resource in resources:
|
||||
if 'OGC:WMS' in resource['resource_locator_protocol']:
|
||||
resource['format'] = 'wms'
|
||||
if 'resource_locator_protocol' in resource: #added check
|
||||
if 'OGC:WMS' in resource['resource_locator_protocol']:
|
||||
resource['format'] = 'wms'
|
||||
|
||||
if config.get('ckanext.spatial.harvest.validate_wms', False):
|
||||
# Check if the service is a view service
|
||||
url = resource['url']
|
||||
test_url = url.split('?')[0] if '?' in url else url
|
||||
if self._is_wms(test_url):
|
||||
resource['verified'] = True
|
||||
resource['verified_date'] = datetime.now().isoformat()
|
||||
if config.get('ckanext.spatial.harvest.validate_wms', False):
|
||||
# Check if the service is a view service
|
||||
url = resource['url']
|
||||
test_url = url.split('?')[0] if '?' in url else url
|
||||
if self._is_wms(test_url):
|
||||
resource['verified'] = True
|
||||
resource['verified_date'] = datetime.now().isoformat()
|
||||
|
||||
|
|
Loading…
Reference in New Issue