From f2145778721c0c502cc742293fdfaced3e94c51b Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Tue, 13 Jan 2015 14:46:14 +0100 Subject: [PATCH 1/7] Fetch remote organization via action api Organizations used to be returned by /api/2/rest/group, this is what the old implementation used to fetch the information to create the remote organization on the local instance of CKAN. With this commit the Action API is used to fetch the same information. --- ckanext/harvest/harvesters/ckanharvester.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 835e27c..2c5c4d9 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -21,10 +21,14 @@ class CKANHarvester(HarvesterBase): config = None api_version = 2 + action_api_version = 3 def _get_rest_api_offset(self): return '/api/%d/rest' % self.api_version + def _get_action_api_offset(self): + return '/api/%d/action' % self.action_api_version + def _get_search_api_offset(self): return '/api/%d/search' % self.api_version @@ -48,6 +52,15 @@ class CKANHarvester(HarvesterBase): except Exception, e: raise e + def _get_organization(self, base_url, org_name): + url = base_url + self._get_action_api_offset() + '/organization_show?id=' + org_name + try: + content = self._get_content(url) + content_dict = json.loads(content) + return content_dict['result'] + except Exception, e: + raise e + def _set_config(self,config_str): if config_str: self.config = json.loads(config_str) @@ -324,7 +337,7 @@ class CKANHarvester(HarvesterBase): log.info('Organization %s is not available' % remote_org) if remote_orgs == 'create': try: - org = self._get_group(harvest_object.source.url, remote_org) + org = self._get_organization(harvest_object.source.url, remote_org) for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: org.pop(key, None) get_action('organization_create')(context, org) From 0fd38e0e54a8e0bb84ca5717db89e1587b6a6c79 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Wed, 14 Jan 2015 00:10:27 +0100 Subject: [PATCH 2/7] Use _get_group as a fallback for remote orgs First try to get a remote org from the remote Action API, if this fails try to use the old rest api call, which works on older CKAN versions. Only if both options fail, its currently not possible to get the remote organization. --- ckanext/harvest/harvesters/ckanharvester.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 2c5c4d9..3fb3935 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -337,7 +337,14 @@ class CKANHarvester(HarvesterBase): log.info('Organization %s is not available' % remote_org) if remote_orgs == 'create': try: - org = self._get_organization(harvest_object.source.url, remote_org) + try: + org = self._get_organization(harvest_object.source.url, remote_org) + except: + # fallback if remote CKAN exposes organizations as groups + # this especially targets older versions of CKAN + log.debug('_get_organization() failed, now trying _get_group()') + org = self._get_group(harvest_object.source.url, remote_org) + for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: org.pop(key, None) get_action('organization_create')(context, org) From ef35c21e2a587dc4136ff25acef774eabd7777fa Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 00:07:26 +0100 Subject: [PATCH 3/7] Improve exception handling with custom exception 1. Try whenever possible to catch specific exceptions 2. Raise custom exception where appropriate 3. Fix the exception handling in _get_group and _get_organization --- ckanext/harvest/harvesters/ckanharvester.py | 30 ++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 3fb3935..e6fe503 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -40,8 +40,14 @@ class CKANHarvester(HarvesterBase): api_key = self.config.get('api_key',None) if api_key: http_request.add_header('Authorization',api_key) - http_response = urllib2.urlopen(http_request) + try: + http_response = urllib2.urlopen(http_request) + except urllib2.HTTPError, e: + raise ContentFetchError( + 'Could not fetch url: %s, HTTP error code: %s' % + (url, e.code) + ) return http_response.read() def _get_group(self, base_url, group_name): @@ -49,8 +55,9 @@ class CKANHarvester(HarvesterBase): try: content = self._get_content(url) return json.loads(content) - except Exception, e: - raise e + except (ContentFetchError, ValueError): + log.debug('Could not fetch/decode remote group'); + raise RemoteResourceError('Could not fetch/decode remote group') def _get_organization(self, base_url, org_name): url = base_url + self._get_action_api_offset() + '/organization_show?id=' + org_name @@ -58,8 +65,9 @@ class CKANHarvester(HarvesterBase): content = self._get_content(url) content_dict = json.loads(content) return content_dict['result'] - except Exception, e: - raise e + except (ContentFetchError, ValueError, KeyError): + log.debug('Could not fetch/decode remote group'); + raise RemoteResourceError('Could not fetch/decode remote organization') def _set_config(self,config_str): if config_str: @@ -294,7 +302,7 @@ class CKANHarvester(HarvesterBase): if remote_groups == 'create': try: group = self._get_group(harvest_object.source.url, group_name) - except: + except RemoteResourceError: log.error('Could not get remote group %s' % group_name) continue @@ -339,10 +347,9 @@ class CKANHarvester(HarvesterBase): try: try: org = self._get_organization(harvest_object.source.url, remote_org) - except: + except RemoteResourceError: # fallback if remote CKAN exposes organizations as groups # this especially targets older versions of CKAN - log.debug('_get_organization() failed, now trying _get_group()') org = self._get_group(harvest_object.source.url, remote_org) for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: @@ -350,7 +357,7 @@ class CKANHarvester(HarvesterBase): get_action('organization_create')(context, org) log.info('Organization %s has been newly created' % remote_org) validated_org = org['id'] - except: + except (RemoteResourceError, ValidationError): log.error('Could not get remote org %s' % remote_org) package_dict['owner_org'] = validated_org or local_org @@ -425,3 +432,8 @@ class CKANHarvester(HarvesterBase): except Exception, e: self._save_object_error('%r'%e,harvest_object,'Import') +class ContentFetchError(Exception): + pass + +class RemoteResourceError(Exception): + pass From 935b9dda0100325028b01eb45a363b5730670c16 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 00:09:43 +0100 Subject: [PATCH 4/7] Munge group name before fetching remote group The API call /api/2/rest/package/ returns the display name of the group instead of its ID. To properly match the group, munge the name before calling /api/2/rest/group --- ckanext/harvest/harvesters/ckanharvester.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index e6fe503..a5b3f32 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -5,6 +5,7 @@ from ckan import model from ckan.model import Session, Package from ckan.logic import ValidationError, NotFound, get_action from ckan.lib.helpers import json +from ckan.lib.munge import munge_name from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ HarvestObjectError @@ -51,7 +52,7 @@ class CKANHarvester(HarvesterBase): return http_response.read() def _get_group(self, base_url, group_name): - url = base_url + self._get_rest_api_offset() + '/group/' + group_name + url = base_url + self._get_rest_api_offset() + '/group/' + munge_name(group_name) try: content = self._get_content(url) return json.loads(content) From b978c26e70fa61024fcac2982a37782571b720a3 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 00:49:11 +0100 Subject: [PATCH 5/7] Use ContentFetchError instead of generic Exception --- ckanext/harvest/harvesters/ckanharvester.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index a5b3f32..70018ae 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -177,7 +177,7 @@ class CKANHarvester(HarvesterBase): url = base_rest_url + '/revision/%s' % revision_id try: content = self._get_content(url) - except Exception,e: + except ContentFetchError,e: self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) continue @@ -204,7 +204,7 @@ class CKANHarvester(HarvesterBase): url = base_rest_url + '/package' try: content = self._get_content(url) - except Exception,e: + except ContentFetchError,e: self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) return None @@ -241,7 +241,7 @@ class CKANHarvester(HarvesterBase): # Get contents try: content = self._get_content(url) - except Exception,e: + except ContentFetchError,e: self._save_object_error('Unable to get content for package: %s: %r' % \ (url, e),harvest_object) return None From 191c39ce5c13cd240dd9c5545f548bac09e214e1 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 10:57:24 +0100 Subject: [PATCH 6/7] Catch the more general URLError instead of HTTPError HTTPError is a subclass of URLError, so catch URLError is enough. I think the HTTP error code is not as important in this situation, so catching the more generic error seems like the best solution. --- ckanext/harvest/harvesters/ckanharvester.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 70018ae..26251dc 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -44,10 +44,10 @@ class CKANHarvester(HarvesterBase): try: http_response = urllib2.urlopen(http_request) - except urllib2.HTTPError, e: + except urllib2.URLError, e: raise ContentFetchError( - 'Could not fetch url: %s, HTTP error code: %s' % - (url, e.code) + 'Could not fetch url: %s, error: %s' % + (url, e.reason) ) return http_response.read() From c1bcee9684b23e1910055b75b673bc4b146a3336 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 11:36:15 +0100 Subject: [PATCH 7/7] Use str() to get the error message --- ckanext/harvest/harvesters/ckanharvester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 26251dc..7575bad 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -47,7 +47,7 @@ class CKANHarvester(HarvesterBase): except urllib2.URLError, e: raise ContentFetchError( 'Could not fetch url: %s, error: %s' % - (url, e.reason) + (url, str(e)) ) return http_response.read()