Merge pull request #198 from ckan/catch-exceptions

Catch exceptions from urllib2.urlopen more comprehensively
This commit is contained in:
David Read 2015-12-02 16:22:53 +00:00
commit 121e8bd918
1 changed file with 28 additions and 17 deletions

View File

@ -1,4 +1,5 @@
import urllib2 import urllib2
import httplib
from ckan.lib.base import c from ckan.lib.base import c
from ckan import model from ckan import model
@ -35,21 +36,23 @@ class CKANHarvester(HarvesterBase):
return '/api/%d/search' % self.api_version return '/api/%d/search' % self.api_version
def _get_content(self, url): def _get_content(self, url):
http_request = urllib2.Request( http_request = urllib2.Request(url=url)
url = url,
)
api_key = self.config.get('api_key',None) api_key = self.config.get('api_key')
if api_key: if api_key:
http_request.add_header('Authorization',api_key) http_request.add_header('Authorization', api_key)
try: try:
http_response = urllib2.urlopen(http_request) http_response = urllib2.urlopen(http_request)
except urllib2.HTTPError, e:
if e.getcode() == 404:
raise ContentNotFoundError('HTTP error: %s' % e.code)
else:
raise ContentFetchError('HTTP error: %s' % e.code)
except urllib2.URLError, e: except urllib2.URLError, e:
raise ContentFetchError( raise ContentFetchError('URL error: %s' % e.reason)
'Could not fetch url: %s, error: %s' % except httplib.HTTPException, e:
(url, str(e)) raise ContentFetchError('HTTP Exception: %s' % e)
)
return http_response.read() return http_response.read()
def _get_group(self, base_url, group_name): def _get_group(self, base_url, group_name):
@ -205,16 +208,21 @@ class CKANHarvester(HarvesterBase):
revision = json.loads(content) revision = json.loads(content)
package_ids = revision['packages'] package_ids = revision['packages']
else: else:
log.info('No packages have been updated on the remote CKAN instance since the last harvest job') log.info('No revisions since last harvest %s',
last_time)
return [] return []
except urllib2.HTTPError,e: except ContentNotFoundError, e:
if e.getcode() == 400: log.info('No revisions since last harvest %s', last_time)
log.info('CKAN instance %s does not suport revision filtering' % base_url) return []
except ContentFetchError, e:
# Any other error indicates that revision filtering is not
# working for whatever reason, so fallback to just getting
# all the packages, which is expensive but reliable.
log.info('CKAN instance %s does not support revision '
'filtering: %s',
base_url, e)
get_all_packages = True get_all_packages = True
else:
self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
return None
if get_all_packages: if get_all_packages:
# Request all remote packages # Request all remote packages
@ -467,5 +475,8 @@ class CKANHarvester(HarvesterBase):
class ContentFetchError(Exception): class ContentFetchError(Exception):
pass pass
class ContentNotFoundError(ContentFetchError):
pass
class RemoteResourceError(Exception): class RemoteResourceError(Exception):
pass pass