2011-06-02 12:07:07 +02:00
|
|
|
import logging
|
2011-11-23 12:05:52 +01:00
|
|
|
import re
|
2011-06-02 12:07:07 +02:00
|
|
|
|
2012-02-01 16:50:41 +01:00
|
|
|
from sqlalchemy.sql import update,and_, bindparam
|
|
|
|
|
2011-06-02 12:07:07 +02:00
|
|
|
from ckan import model
|
|
|
|
from ckan.model import Session, Package
|
2011-10-26 18:26:18 +02:00
|
|
|
from ckan.logic import ValidationError, NotFound, get_action
|
|
|
|
|
2011-06-02 12:07:07 +02:00
|
|
|
from ckan.logic.schema import default_package_schema
|
2012-01-10 15:46:12 +01:00
|
|
|
from ckan.lib.navl.validators import ignore_missing,ignore
|
2011-11-23 12:05:52 +01:00
|
|
|
from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents
|
2011-06-02 12:07:07 +02:00
|
|
|
|
|
|
|
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
|
|
|
HarvestObjectError
|
|
|
|
|
|
|
|
from ckan.plugins.core import SingletonPlugin, implements
|
|
|
|
from ckanext.harvest.interfaces import IHarvester
|
|
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
2011-11-23 12:05:52 +01:00
|
|
|
def munge_tag(tag):
    '''
    Normalise a free-text tag into a lowercase, dash-separated string.

    The tag is first passed through ``substitute_ascii_equivalents``
    (imported from ``ckan.lib.munge``), then lowercased and stripped of
    surrounding whitespace. Every character that is not an ASCII letter,
    digit, space or dash is removed, and each remaining space is replaced
    by a dash.
    '''
    ascii_tag = substitute_ascii_equivalents(tag)
    normalised = ascii_tag.lower().strip()
    # Drop everything outside the allowed character set before turning
    # the spaces into dashes.
    cleaned = re.sub(r'[^a-zA-Z0-9 -]', '', normalised)
    return cleaned.replace(' ', '-')
|
|
|
|
|
2011-06-02 12:07:07 +02:00
|
|
|
class HarvesterBase(SingletonPlugin):
    '''
    Generic class for harvesters with helper functions
    '''
    implements(IHarvester)

    def _gen_new_name(self, title):
        '''
        Creates a URL friendly name from a title

        The title is munged with ``munge_title_to_name``, underscores are
        turned into dashes and runs of dashes are collapsed into one.
        '''
        name = munge_title_to_name(title).replace('_', '-')
        # A single replace() pass can leave '--' behind when three or more
        # dashes are adjacent, so repeat until none are left.
        while '--' in name:
            name = name.replace('--', '-')
        return name

    def _check_name(self, name):
        '''
        Checks if a package name already exists in the database, and adds
        a counter at the end if it does exist.

        Returns ``name`` itself when it is free, otherwise the first free
        name among ``name1`` .. ``name100``, or None when all of them are
        already taken.
        '''
        # Candidates in order of preference: the bare name, then name1..name100.
        candidates = [name] + [name + str(counter) for counter in range(1, 101)]

        # Ask the database about exactly these candidates. A prefix match
        # capped at 100 rows can miss the very names being tested when many
        # packages share the prefix, and so hand back a name already in use.
        pkg_query = Session.query(Package).filter(Package.name.in_(candidates))
        taken = set(pkg.name for pkg in pkg_query)

        for candidate in candidates:
            if candidate not in taken:
                return candidate
        return None

    def _save_gather_error(self, message, job):
        '''
        Helper function to create an error during the gather stage.

        Builds a HarvestGatherError for the given job, calls save() on it
        and writes the message to the module log at error level.
        '''
        err = HarvestGatherError(message=message, job=job)
        err.save()
        log.error(message)

    def _save_object_error(self, message, obj, stage=u'Fetch'):
        '''
        Helper function to create an error during the fetch or import stage.

        Builds a HarvestObjectError for the given harvest object and stage,
        calls save() on it and writes the message to the module log at
        error level.
        '''
        err = HarvestObjectError(message=message, object=obj, stage=stage)
        err.save()
        log.error(message)

    def _create_harvest_objects(self, remote_ids, harvest_job):
        '''
        Given a list of remote ids and a Harvest Job, create as many Harvest
        Objects and return a list of their ids to be passed to the fetch
        stage.

        Returns None, after recording a gather error against the job, when
        no remote ids were provided or when creating the objects fails.
        '''
        try:
            # Truthiness (rather than len()) also covers remote_ids=None.
            if not remote_ids:
                self._save_gather_error('No remote datasets could be identified', harvest_job)
                return None

            object_ids = []
            for remote_id in remote_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=remote_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        except Exception as e:
            # Report the exception itself: ``e.message`` is deprecated and is
            # empty for exceptions raised with several arguments.
            self._save_gather_error('%r' % e, harvest_job)
            return None

    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        Returns True when the package was created, updated, or found to be
        already up to date. Returns None, after recording an 'Import' error
        against the harvest object, when validation or anything else fails.
        '''
        try:
            # Change default schema
            schema = default_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            # ``config`` is not defined by this class, so read it defensively:
            # a subclass that never set it must fall back to the defaults
            # instead of failing with AttributeError.
            config = getattr(self, 'config', None)
            if config:
                api_version = config.get('api_version', '2')
                #TODO: use site user when available
                user_name = config.get('user', u'harvest')
            else:
                api_version = '2'
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            # Munge the tags and drop the duplicates munging may produce.
            tags = package_dict.get('tags', [])
            package_dict['tags'] = list(set([munge_tag(t) for t in tags]))

            # Check if package exists
            data_dict = {'id': package_dict['id']}
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated', harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)
                else:
                    log.info('Package with GUID %s not updated, skipping...', harvest_object.guid)
                    # Nothing was written, so there is no new package to flag
                    # as current. Falling through would hit an unbound
                    # ``new_package`` and record a bogus import error.
                    return True

            except NotFound:
                # Package needs to be created

                # Check if name has not already been used
                package_dict['name'] = self._check_name(package_dict['name'])

                log.info('Package with GUID %s does not exist, let\'s create it', harvest_object.guid)
                new_package = get_action('package_create_rest')(context, package_dict)
                harvest_object.package_id = new_package['id']

            # Flag the other objects linking to this package as not current anymore
            # NOTE(review): imported locally as in the original code, presumably
            # to avoid an import cycle - confirm before moving to module level.
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])
            Session.commit()

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        except Exception as e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')

        return None
|