2012-08-09 14:38:17 +02:00
import re
2011-09-08 11:27:36 +02:00
from lxml import etree
from lxml . etree import XMLSyntaxError
2011-03-25 18:01:26 +01:00
from pylons . i18n import _
2012-03-06 17:01:43 +01:00
from ckan . authz import Authorizer
2012-02-29 11:59:02 +01:00
from ckan import model
2012-03-06 17:01:43 +01:00
from ckan . model . group import Group
2012-02-29 11:59:02 +01:00
2011-03-25 18:01:26 +01:00
import ckan . lib . helpers as h , json
2011-03-09 19:56:55 +01:00
from ckan . lib . base import BaseController , c , g , request , \
2011-03-10 15:02:21 +01:00
response , session , render , config , abort , redirect
2011-03-22 18:33:58 +01:00
2011-05-13 15:17:58 +02:00
from ckan . lib . navl . dictization_functions import DataError
2012-03-01 13:02:16 +01:00
from ckan . logic import NotFound , ValidationError , get_action , NotAuthorized
2011-05-13 15:17:58 +02:00
from ckanext . harvest . logic . schema import harvest_source_form_schema
2012-03-07 16:04:50 +01:00
2012-03-07 16:03:33 +01:00
from ckan . lib . helpers import Page , pager_url
2012-08-16 10:16:34 +02:00
import ckan . plugins as p
2012-03-07 16:04:50 +01:00
2011-05-13 15:17:58 +02:00
import logging
log = logging . getLogger ( __name__ )
2011-03-09 19:56:55 +01:00
# Controller for the harvest source web pages: listing, creating, editing,
# reading and deleting sources, requesting harvest jobs and showing the raw
# content of a harvest object.
class ViewController ( BaseController ) :
2012-03-01 13:02:16 +01:00
# Message passed to abort(401, ...) by the actions of this controller whenever
# a logic function raises NotAuthorized.
not_auth_message = _ ( ' Not authorized to see this page ' )
2011-03-25 18:01:26 +01:00
2012-03-06 17:01:43 +01:00
def __before__(self, action, **params):
    '''Run the parent hook, then flag whether publisher auth is in use.

    Stores a boolean in ``c.publisher_auth``: True only when the harvest
    auth profile read from ``config`` equals the publisher profile name.
    '''
    super(ViewController, self).__before__(action, **params)

    auth_profile = config.get(' ckan.harvest.auth.profile ', None)
    c.publisher_auth = auth_profile == ' publisher '
def _get_publishers(self):
    '''Return the publisher groups to offer in the source form.

    Returns None when ``c.publisher_auth`` is not set. Otherwise returns a
    list of dicts (name, id and title of each group): every publisher
    group for a sysadmin, the user's own publisher groups for a logged in
    user, and an empty list for an anonymous user.
    '''
    if not c.publisher_auth:
        return None

    if Authorizer().is_sysadmin(c.user):
        publishers = Group.all(group_type=' publisher ')
    elif c.userobj:
        publishers = c.userobj.get_groups(' publisher ')
    else:
        # anonymous user shouldn't have access to this page anyway.
        publishers = []

    # Be explicit about which fields we make available in the template
    return [
        {
            ' name ': publisher.name,
            ' id ': publisher.id,
            ' title ': publisher.title,
        }
        for publisher in publishers
    ]
2011-05-13 15:17:58 +02:00
def index(self):
    '''Render the list of harvest sources.

    Puts the result of the source list action in ``c.sources`` (sorted by
    publisher title when ``c.publisher_auth`` is set) and the configured
    harvest status in ``c.status``. Aborts with 401 when the action
    raises NotAuthorized.
    '''
    context = {' model ': model, ' user ': c.user, ' session ': model.Session}
    list_sources = get_action(' harvest_source_list ')
    try:
        # Request all harvest sources
        c.sources = list_sources(context, {})
    except NotAuthorized:
        abort(401, self.not_auth_message)

    if c.publisher_auth:
        def publisher_title(source):
            return source[' publisher_title ']

        c.sources = sorted(c.sources, key=publisher_title)

    c.status = config.get(' ckan.harvest.status ')
    return render(' index.html ')
2011-03-10 16:32:51 +01:00
2011-05-13 15:17:58 +02:00
def new(self, data=None, errors=None, error_summary=None):
    '''Render the form used to create a harvest source.

    When the request carries the save parameter and no ``data`` was
    handed in, the submission is delegated to ``_save_new``. Otherwise
    the form is rendered with the given ``data``, ``errors`` and
    ``error_summary`` (each defaulting to an empty dict). Aborts with 401
    when the harvesters info action raises NotAuthorized.
    '''
    if (' save ' in request.params) and not data:
        return self._save_new()

    form_vars = {
        ' data ': data or {},
        ' errors ': errors or {},
        ' error_summary ': error_summary or {},
    }

    try:
        context = {' model ': model, ' user ': c.user}
        harvesters_info = get_action(' harvesters_info_show ')(context, {})
    except NotAuthorized:
        abort(401, self.not_auth_message)
    form_vars[' harvesters '] = harvesters_info

    # _make_autoform_items() reads c.groups, so it has to be set first
    c.groups = self._get_publishers()
    form_vars[' form_items '] = self._make_autoform_items(harvesters_info)

    c.form = render(' source/new_source_form.html ', extra_vars=form_vars)
    return render(' source/new.html ')
2012-08-14 19:01:29 +02:00
2011-05-13 15:17:58 +02:00
def _save_new(self):
    '''Create a harvest source, and a first job for it, from the request.

    On success a flash message is queued and the user is redirected to
    the page of the new source. Aborts with 401 when an action raises
    NotAuthorized and with 400 when the posted fields are rejected with a
    DataError. On a ValidationError the form is rendered again through
    ``new`` with the posted data and the reported errors.
    '''
    try:
        data_dict = dict(request.params)
        self._check_data_dict(data_dict)

        context = {' model ': model, ' user ': c.user,
                   ' session ': model.Session,
                   ' schema ': harvest_source_form_schema()}
        source = get_action(' harvest_source_create ')(context, data_dict)

        # Create a harvest job for the new source
        get_action(' harvest_job_create ')(context,
                                           {' source_id ': source[' id ']})

        h.flash_success(_(' New harvest source added successfully. '
                          ' A new harvest job for the source has also been created. '))
        redirect(' /harvest/ %s ' % source[' id '])
    except NotAuthorized:
        abort(401, self.not_auth_message)
    except DataError:
        # Passed through _() so the message is translated, as _save_edit
        # already does for the very same error
        abort(400, _(' Integrity Error '))
    except ValidationError as e:
        errors = e.error_dict
        error_summary = e.error_summary if hasattr(e, ' error_summary ') else None
        return self.new(data_dict, errors, error_summary)
2011-03-10 15:02:21 +01:00
2011-05-13 15:17:58 +02:00
def edit(self, id, data=None, errors=None, error_summary=None):
    '''Render the form used to edit the harvest source ``id``.

    When the request carries the save parameter and no ``data`` was
    handed in, the submission is delegated to ``_save_edit``. When no
    ``data`` is given the stored source is loaded to fill the form
    (404 if it does not exist). Aborts with 401 when an action raises
    NotAuthorized.
    '''
    if (' save ' in request.params) and not data:
        return self._save_edit(id)

    if not data:
        # Nothing was handed in: fill the form with the stored source
        try:
            context = {' model ': model, ' user ': c.user}
            old_data = get_action(' harvest_source_show ')(context,
                                                           {' id ': id})
        except NotFound:
            abort(404, _(' Harvest Source not found '))
        except NotAuthorized:
            abort(401, self.not_auth_message)
        data = old_data

    form_vars = {
        ' data ': data,
        ' errors ': errors or {},
        ' error_summary ': error_summary or {},
    }

    try:
        context = {' model ': model, ' user ': c.user}
        harvesters_info = get_action(' harvesters_info_show ')(context, {})
    except NotAuthorized:
        abort(401, self.not_auth_message)
    form_vars[' harvesters '] = harvesters_info

    # _make_autoform_items() reads c.groups, so it has to be set first
    c.groups = self._get_publishers()
    form_vars[' form_items '] = self._make_autoform_items(harvesters_info)

    c.form = render(' source/new_source_form.html ', extra_vars=form_vars)
    return render(' source/edit.html ')
def _save_edit(self, id):
    '''Update the harvest source ``id`` with the posted form data.

    On success a flash message is queued and the user is redirected to
    the page of the source. Aborts with 401 (NotAuthorized), 400
    (DataError) or 404 (NotFound). On a ValidationError the form is
    rendered again through ``edit`` with the posted data and the
    reported errors.
    '''
    try:
        data_dict = dict(request.params)
        data_dict.update({' id ': id})
        self._check_data_dict(data_dict)

        context = {
            ' model ': model,
            ' user ': c.user,
            ' session ': model.Session,
            ' schema ': harvest_source_form_schema(),
        }
        get_action(' harvest_source_update ')(context, data_dict)

        h.flash_success(_(' Harvest source edited successfully. '))
        redirect(' /harvest/ %s ' % id)
    except NotAuthorized:
        abort(401, self.not_auth_message)
    except DataError:
        abort(400, _(' Integrity Error '))
    except NotFound:
        abort(404, _(' Harvest Source not found '))
    except ValidationError as validation_error:
        errors = validation_error.error_dict
        if hasattr(validation_error, ' error_summary '):
            error_summary = validation_error.error_summary
        else:
            error_summary = None
        return self.edit(id, data_dict, errors, error_summary)
def _check_data_dict(self, data_dict):
    '''Check that the posted data holds every required form field.

    The required fields are the keys of the harvest source form schema
    minus a fixed list of keys that may be absent. ``data_dict`` is
    modified in place: when it has no user id and a user is logged in,
    that user's id is filled in. Raises DataError when a required field
    is missing.
    '''
    # TODO: remove frequency once it is added to the frontend!
    optional_keys = set([' id ', ' publisher_id ', ' user_id ',
                         ' config ', ' save ', ' frequency '])
    required_keys = set(harvest_source_form_schema().keys()) - optional_keys

    # user_id is not yet used, we'll set the logged user one for the time being
    if not data_dict.get(' user_id ', None) and c.userobj:
        data_dict[' user_id '] = c.userobj.id

    missing_keys = required_keys - set(data_dict.keys())
    if missing_keys:
        log.info(_(' Incorrect form fields posted '))
        raise DataError(data_dict)
def read(self, id):
    '''Render the page of the harvest source ``id``.

    The source is stored in ``c.source`` and a pager over its packages,
    20 per page, in ``c.page``; the page number comes from the request
    parameters and defaults to 1. Aborts with 404 (NotFound) or 401
    (NotAuthorized).
    '''
    try:
        show_source = get_action(' harvest_source_show ')
        c.source = show_source({' model ': model, ' user ': c.user},
                               {' id ': id})

        packages = c.source[' status '][' packages ']
        page_number = request.params.get(' page ', 1)
        c.page = Page(collection=packages, page=page_number,
                      items_per_page=20, url=pager_url)

        return render(' source/read.html ')
    except NotFound:
        abort(404, _(' Harvest source not found '))
    except NotAuthorized:
        abort(401, self.not_auth_message)
2011-03-23 18:02:02 +01:00
2011-03-11 13:41:13 +01:00
def delete(self, id):
    '''Run the delete action on the harvest source ``id``.

    On success a flash message is queued and the user is redirected to
    the harvest page. Aborts with 404 (NotFound) or 401 (NotAuthorized).
    '''
    try:
        delete_source = get_action(' harvest_source_delete ')
        delete_source({' model ': model, ' user ': c.user}, {' id ': id})

        h.flash_success(_(' Harvesting source successfully inactivated '))
        redirect(h.url_for(' harvest '))
    except NotFound:
        abort(404, _(' Harvest source not found '))
    except NotAuthorized:
        abort(401, self.not_auth_message)
2011-04-05 14:39:23 +02:00
2011-03-10 18:24:23 +01:00
def create_harvesting_job(self, id):
    '''Request a new harvest job for the source ``id``.

    Aborts with 404 (NotFound) or 401 (NotAuthorized). Any other error is
    reported to the user as a flash message chosen from the text of the
    exception. Except when aborting, the user is finally redirected to
    the harvest page.
    '''
    try:
        context = {' model ': model, ' user ': c.user,
                   ' session ': model.Session}
        get_action(' harvest_job_create ')(context, {' source_id ': id})
        h.flash_success(_(' Refresh requested, harvesting will take place within 15 minutes. '))
    except NotFound:
        abort(404, _(' Harvest source not found '))
    except NotAuthorized:
        abort(401, self.not_auth_message)
    except Exception as e:
        # The kind of failure is only recognisable from the error text
        error_text = str(e)
        if ' Can not create jobs on inactive sources ' in error_text:
            h.flash_error(_(' Cannot create new harvest jobs on inactive sources. '
                            ' First, please change the source status to \' active \' . '))
        elif ' There already is an unrun job for this source ' in error_text:
            h.flash_notice(_(' A harvest job has already been scheduled for this source '))
        else:
            h.flash_error(' An error occurred: [ %s ] ' % error_text)

    redirect(h.url_for(' harvest '))
2011-09-08 11:27:36 +02:00
def show_object(self, id):
    '''Return the raw document stored for the harvest object ``id``.

    The document is the content of the object or, when that is empty, the
    original document kept in its extras. The response content type is
    set to XML or JSON when the document parses as such and is left
    untouched otherwise.

    Aborts with 404 when the object does not exist or holds no document,
    with 401 when the show action raises NotAuthorized, and with 500
    (carrying the error text) on any other error.
    '''
    content = None
    try:
        context = {' model ': model, ' user ': c.user}
        obj = get_action(' harvest_object_show ')(context, {' id ': id})

        if obj[' content ']:
            content = obj[' content ']
        elif ' original_document ' in obj[' extras ']:
            content = obj[' extras '][' original_document ']

        if content is not None:
            content = re.sub(r' < \ ?xml(.*) \ ?> ', ' ', content)

            # Check content type. It will probably be either XML or JSON
            try:
                etree.fromstring(content)
                response.content_type = ' application/xml '
            except XMLSyntaxError:
                try:
                    # Parse the document that is actually returned:
                    # obj content is empty whenever the document came
                    # from the extras, which used to end in a 500
                    json.loads(content)
                    response.content_type = ' application/json '
                except ValueError:
                    # Neither XML nor JSON: return it as it is
                    pass

            # NOTE(review): this is a character count stored as an int;
            # confirm it matches the length of the encoded body
            response.headers[' Content-Length '] = len(content)
    except NotFound:
        abort(404, _(' Harvest object not found '))
    except NotAuthorized:
        abort(401, self.not_auth_message)
    except Exception as e:
        msg = ' An error occurred: [ %s ] ' % str(e)
        abort(500, msg)

    if content is None:
        # Raised outside of the try block on purpose: the catch-all
        # handler above would otherwise turn this 404 into a 500
        abort(404, _(' No content found '))
    return content
2012-08-14 19:01:29 +02:00
def _make_autoform_items(self, harvesters_info):
    '''Build the field descriptions used to render the source form.

    ``harvesters_info`` is iterated once; the title, name and description
    of every entry are read to fill the source type options and an HTML
    list describing the harvesters. A publisher select is included only
    when ``c.groups`` is not empty. Returns a list of dicts, one per
    form item.
    '''
    literal = p.toolkit.literal

    state_options = [
        {' text ': ' active ', ' value ': ' True '},
        {' text ': ' withdrawn ', ' value ': ' False '},
    ]

    type_options = []
    descriptions_html = literal(' <ul> ')
    for info in harvesters_info:
        type_options.append({' text ': info[' title '],
                             ' value ': info[' name ']})
        # The plain strings are added to the literal one piece at a time,
        # in the same order as the markup around them
        descriptions_html += literal(' <li><span class= " harvester-title " > ')
        descriptions_html += info[' title ']
        descriptions_html += literal(' </span>: ')
        descriptions_html += info[' description ']
        descriptions_html += literal(' </li> ')
    descriptions_html += literal(' </ul> ')

    items = [
        {' name ': ' url ', ' control ': ' input ',
         ' label ': _(' URL '), ' placeholder ': _(' '),
         ' extra_info ': ' This should include the http:// part of the URL '},
        {' name ': ' type ', ' control ': ' select ',
         ' options ': type_options,
         ' label ': _(' Source type '), ' placeholder ': _(' '),
         ' extra_info ': ' Which type of source does the URL above represent? '},
        {' control ': ' html ', ' html ': descriptions_html},
        {' name ': ' title ', ' control ': ' input ',
         ' label ': _(' Title '), ' placeholder ': _(' '),
         ' extra_info ': ' This will be shown as the datasets source. '},
        {' name ': ' description ', ' control ': ' textarea ',
         ' label ': _(' Description '), ' placeholder ': _(' '),
         ' extra_info ': ' You can add your own notes here about what the URL above represents to remind you later. '},
    ]

    if c.groups:
        publisher_options = [
            {' text ': group[' title '], ' value ': group[' id ']}
            for group in c.groups
        ]
        items.append({' name ': ' publisher_id ', ' control ': ' select ',
                      ' options ': publisher_options,
                      ' label ': _(' Publisher '), ' placeholder ': _(' ')})

    items.extend([
        {' name ': ' config ', ' control ': ' textarea ',
         ' label ': _(' Configuration '), ' placeholder ': _(' '),
         ' extra_info ': ' '},
        # NOTE(review): the other items use an extra_info key here; confirm
        # whether extra_text is intended
        {' name ': ' active ', ' control ': ' select ',
         ' options ': state_options,
         ' label ': _(' State '), ' placeholder ': _(' '),
         ' extra_text ': ' '},
    ])
    return items