import urllib
import urllib2

from ckan.lib.base import c
from ckan import model
from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json
from ckan.lib.munge import munge_name
from ckan.plugins import toolkit

from ckanext.harvest.model import HarvestJob, HarvestObject

import logging
log = logging.getLogger(__name__)

from base import HarvesterBase


class CKANHarvester(HarvesterBase):
    '''
    A Harvester for CKAN instances
    '''
    config = None

    api_version = 2
    action_api_version = 3
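
    # This harvester implements the usual three IHarvester stages:
    #   gather_stage()  - searches the remote CKAN and creates a
    #                     HarvestObject per dataset
    #   fetch_stage()   - a no-op here, since the package dicts are already
    #                     retrieved during gather
    #   import_stage()  - creates or updates the corresponding local dataset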
|
2011-05-17 18:26:42 +02:00
|
|
|
|
2015-01-13 14:46:14 +01:00
|
|
|
def _get_action_api_offset(self):
|
|
|
|
return '/api/%d/action' % self.action_api_version
|
|
|
|
|
2011-05-17 18:26:42 +02:00
|
|
|
def _get_search_api_offset(self):
|
2015-09-10 17:46:25 +02:00
|
|
|
return "%s/package_search" % self._get_action_api_offset()

    def _get_content(self, url):
        http_request = urllib2.Request(url=url)

        api_key = self.config.get('api_key', None)
        if api_key:
            http_request.add_header('Authorization', api_key)
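            # The API key, if set in the source config, is sent in the
            # Authorization header so the remote CKAN can authorize the
            # harvester (e.g. for datasets that are not public).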

        try:
            http_response = urllib2.urlopen(http_request)
        except urllib2.URLError, e:
            raise ContentFetchError(
                'Could not fetch url: "%s", URL error: %s' %
                (url, e)
            )
        return http_response.read()

    def _get_group(self, base_url, group_name):
        url = base_url + self._get_action_api_offset() + \
            '/group_show?id=' + munge_name(group_name)
        try:
            content = self._get_content(url)
            return json.loads(content)
        except (ContentFetchError, ValueError):
            log.debug('Could not fetch/decode remote group')
            raise RemoteResourceError('Could not fetch/decode remote group')

    def _get_organization(self, base_url, org_name):
        url = base_url + self._get_action_api_offset() + \
            '/organization_show?id=' + org_name
        try:
            content = self._get_content(url)
            content_dict = json.loads(content)
            return content_dict['result']
        except (ContentFetchError, ValueError, KeyError):
            log.debug('Could not fetch/decode remote organization')
            raise RemoteResourceError(
                'Could not fetch/decode remote organization')

    def _set_config(self, config_str):
        if config_str:
            self.config = json.loads(config_str)
            if 'api_version' in self.config:
                self.api_version = int(self.config['api_version'])

            log.debug('Using config: %r', self.config)
        else:
            self.config = {}

    def info(self):
        return {
            'name': 'ckan',
            'title': 'CKAN',
            'description': 'Harvests remote CKAN instances',
            'form_config_interface': 'Text'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'api_version' in config_obj:
                try:
                    int(config_obj['api_version'])
                except ValueError:
                    raise ValueError('api_version must be an integer')

            if 'default_tags' in config_obj:
                if not isinstance(config_obj['default_tags'], list):
                    raise ValueError('default_tags must be a list')

            if 'default_groups' in config_obj:
                if not isinstance(config_obj['default_groups'], list):
                    raise ValueError('default_groups must be a list')

                # Check if default groups exist
                context = {'model': model, 'user': c.user}
                for group_name in config_obj['default_groups']:
                    try:
                        get_action('group_show')(context, {'id': group_name})
                    except NotFound:
                        raise ValueError('Default group not found')

            if 'default_extras' in config_obj:
                if not isinstance(config_obj['default_extras'], dict):
                    raise ValueError('default_extras must be a dictionary')

            if 'user' in config_obj:
                # Check if user exists
                context = {'model': model, 'user': c.user}
                try:
                    get_action('user_show')(
                        context, {'id': config_obj.get('user')})
                except NotFound:
                    raise ValueError('User not found')

            for key in ('read_only', 'force_all'):
                if key in config_obj:
                    if not isinstance(config_obj[key], bool):
                        raise ValueError('%s must be boolean' % key)

        except ValueError, e:
            raise e

        return config
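
    # An illustrative harvest source configuration that this method would
    # accept (values are examples, not defaults; 'remote_groups',
    # 'remote_orgs' and 'override_extras' are read in import_stage rather
    # than validated here):
    #
    #   {
    #       "api_version": 2,
    #       "default_tags": ["harvested"],
    #       "default_groups": ["my-group"],
    #       "default_extras": {
    #           "source_url": "{harvest_source_url}/dataset/{dataset_id}"
    #       },
    #       "read_only": true,
    #       "force_all": false
    #   }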

    def gather_stage(self, harvest_job):
        log.debug('In CKANHarvester gather_stage (%s)',
                  harvest_job.source.url)
        toolkit.requires_ckan_version(min_version='2.0')
        get_all_packages = True

        self._set_config(harvest_job.source.config)

        # Check if there is a previous harvest, ignoring whether it was
        # successful (i.e. current=True) or not
        previous_job = \
            model.Session.query(HarvestJob) \
                 .filter(HarvestJob.source == harvest_job.source) \
                 .filter(HarvestJob.gather_finished != None) \
                 .filter(HarvestJob.id != harvest_job.id) \
                 .order_by(HarvestJob.gather_finished.desc()) \
                 .first()

        # Get source URL
        remote_ckan_base_url = harvest_job.source.url.rstrip('/')

        log.debug('Previous job: %r', previous_job)

        # Ideally we can request from the remote CKAN only those datasets
        # modified since the last harvest job
        if (previous_job and
                not previous_job.gather_errors and
                not len(previous_job.objects) == 0 and
                not self.config.get('force_all', False)):
            get_all_packages = False

            # Request only the datasets modified since the last harvest job
            last_time = previous_job.gather_started.isoformat()
            # Note: SOLR works in UTC, and gather_started is also UTC, so
            # this should work as long as local and remote clocks are
            # relatively accurate
            log.info('Searching for datasets modified since: %s UTC',
                     last_time)

            fq = 'metadata_modified:[{last_check}Z TO *]'.format(
                last_check=last_time)
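            # e.g. fq = 'metadata_modified:[2015-10-23T19:30:28.123456Z TO *]'
            # (the spaces are URL-encoded when _search_for_datasets() builds
            # the request URL)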

            try:
                pkg_dicts = self._search_for_datasets(remote_ckan_base_url,
                                                      fq)
            except SearchError, e:
                log.info('Searching for datasets changed since last time '
                         'gave an error: %s', e)
                get_all_packages = True

            if not get_all_packages and not pkg_dicts:
                log.info('No datasets have been updated on the remote '
                         'CKAN instance since the last harvest job %s',
                         last_time)
                return None

        # Fall-back option - request all the datasets from the remote CKAN
        if get_all_packages:
            # Request all remote packages
            try:
                pkg_dicts = self._search_for_datasets(remote_ckan_base_url)
            except SearchError, e:
                log.info('Searching for all datasets gave an error: %s', e)
                self._save_gather_error(
                    'Unable to search remote CKAN for datasets: %s' % e,
                    harvest_job)
                return None

            if not pkg_dicts:
                self._save_gather_error(
                    'No datasets found at CKAN: %s' % remote_ckan_base_url,
                    harvest_job)
                return None

        # Create harvest objects for each dataset
        try:
            package_ids = set()
            object_ids = []
            for pkg_dict in pkg_dicts:
                if pkg_dict['id'] in package_ids:
                    log.info('Discarding duplicate dataset %s - probably due '
                             'to datasets being changed at the same time as '
                             'when the harvester was paging through',
                             pkg_dict['id'])
                    continue
                package_ids.add(pkg_dict['id'])

                log.info('Creating HarvestObject for %s %s',
                         pkg_dict['name'], pkg_dict['id'])
                obj = HarvestObject(guid=pkg_dict['id'],
                                    job=harvest_job,
                                    content=json.dumps(pkg_dict))
                obj.save()
                object_ids.append(obj.id)

            return object_ids
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)

    def _search_for_datasets(self, remote_ckan_base_url, fq=None):
        '''Does a dataset search on a remote CKAN and returns the results.

        Deals with paging to return all the results, not just the first page.
        '''
        base_search_url = remote_ckan_base_url + self._get_search_api_offset()
        params = {'rows': '100', 'start': '0'}
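        # So the first request is of the form (parameter order may vary):
        #   <remote_ckan_base_url>/api/3/action/package_search?rows=100&start=0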
        if fq:
            params['fq'] = fq

        pkg_dicts = []
        previous_content = None
        while True:
            url = base_search_url + '?' + urllib.urlencode(params)
            log.debug('Searching for CKAN datasets: %s', url)
            try:
                content = self._get_content(url)
            except ContentFetchError, e:
                # _get_content() wraps urllib2.URLError (which includes
                # HTTPError) in ContentFetchError, so that is what must be
                # caught here
                raise SearchError('Error sending request to search remote '
                                  'CKAN instance %s using URL %r. Error: %s' %
                                  (remote_ckan_base_url, url, e))

            if previous_content and content == previous_content:
                raise SearchError('The paging doesn\'t seem to work. URL: %s' %
                                  url)
            previous_content = content
            try:
                response_dict = json.loads(content)
            except ValueError:
                raise SearchError('Response from remote CKAN was not JSON: %r'
                                  % content)
            try:
                pkg_dicts_page = response_dict.get('result', {}).get('results',
                                                                     [])
            except ValueError:
                raise SearchError('Response JSON did not contain '
                                  'result/results: %r' % response_dict)
            pkg_dicts.extend(pkg_dicts_page)

            if len(pkg_dicts_page) == 0:
                break

            params['start'] = str(int(params['start']) + int(params['rows']))

        return pkg_dicts

    def fetch_stage(self, harvest_object):
        # Nothing to do here - we got the package dict in the search in the
        # gather stage
        return True

    def import_stage(self, harvest_object):
        log.debug('In CKANHarvester import_stage')

        context = {'model': model, 'session': model.Session,
                   'user': self._get_user_name()}
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' %
                                    harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound:
                        log.info('Group %s is not available', group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s',
                                          group_name)
                                continue

                            for key in ['packages', 'created', 'users',
                                        'groups', 'tags', 'extras',
                                        'display_name']:
                                group.pop(key, None)

                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created',
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Local harvest source organization
            source_dataset = get_action('package_show')(
                context, {'id': harvest_object.source.id})
            local_org = source_dataset.get('owner_org')

            remote_orgs = self.config.get('remote_orgs', None)

            if remote_orgs not in ('only_local', 'create'):
                # Assign dataset to the source organization
                package_dict['owner_org'] = local_org
            else:
                if 'owner_org' not in package_dict:
                    package_dict['owner_org'] = None

                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict['owner_org']

                if remote_org:
                    try:
                        data_dict = {'id': remote_org}
                        org = get_action('organization_show')(context,
                                                              data_dict)
                        validated_org = org['id']
                    except NotFound:
                        log.info('Organization %s is not available',
                                 remote_org)
                        if remote_orgs == 'create':
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes
                                    # organizations as groups; this especially
                                    # targets older versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                for key in ['packages', 'created', 'users',
                                            'groups', 'tags', 'extras',
                                            'display_name', 'type']:
                                    org.pop(key, None)
                                get_action('organization_create')(context,
                                                                  org)
                                log.info('Organization %s has been newly '
                                         'created', remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                log.error('Could not get remote org %s',
                                          remote_org)

                package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend(
                    [g for g in default_groups
                     if g not in package_dict['groups']])

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})

            def get_extra(key, package_dict):
                for extra in package_dict.get('extras', []):
                    if extra['key'] == key:
                        return extra
            if default_extras:
                override_extras = self.config.get('override_extras', False)
                if 'extras' not in package_dict:
                    # extras are handled below as a list of
                    # {'key': ..., 'value': ...} dicts, so initialise a
                    # list, not a dict
                    package_dict['extras'] = []
                for key, value in default_extras.iteritems():
                    existing_extra = get_extra(key, package_dict)
                    if existing_extra and not override_extras:
                        continue  # no need for the default
                    if existing_extra:
                        package_dict['extras'].remove(existing_extra)
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=
                            harvest_object.job.source.url.strip('/'),
                            harvest_source_title=
                            harvest_object.job.source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])

                    package_dict['extras'].append({'key': key, 'value': value})

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            self._create_or_update_package(
                package_dict, harvest_object, package_dict_form='package_show')

            return True
        except ValidationError, e:
            self._save_object_error('Invalid package with GUID %s: %r' %
                                    (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
        except Exception, e:
            self._save_object_error('%s' % e, harvest_object, 'Import')


class ContentFetchError(Exception):
    pass


class RemoteResourceError(Exception):
    pass


class SearchError(Exception):
    pass
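
# Note: to use this harvester, the harvest extension's plugins need to be
# enabled in the CKAN config, e.g.:
#
#   ckan.plugins = harvest ckan_harvester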