diff --git a/.gitignore b/.gitignore index 6d036a4..32940b0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ syntax: glob *~ build/ dist/ +credentials.json +token.dat \ No newline at end of file diff --git a/README.rst b/README.rst index a0d8f63..9395c7a 100644 --- a/README.rst +++ b/README.rst @@ -32,6 +32,7 @@ Installation :: googleanalytics.id = UA-1010101-1 + googleanalytics.account = Account name (e.g. data.gov.uk; see the top-level item at https://www.google.com/analytics) googleanalytics.username = googleaccount@gmail.com googleanalytics.password = googlepassword @@ -85,6 +86,8 @@ Installation If ``track_events`` is set, Google Analytics event tracking will be enabled. + Follow the steps described in the Authorization section below. + 5. Restart CKAN (e.g. by restarting Apache) 6. Wait a while for some stats to be recorded in Google @@ -92,9 +95,10 @@ Installation 7. Import Google stats by running the following command from ``src/ckanext-googleanalytics``:: - paster loadanalytics --config=../ckan/development.ini + paster loadanalytics token.dat internal 2012-10-10 --config=../ckan/development.ini - (Of course, pointing config at your specific site config) + (Of course, pointing config at your specific site config and token.dat at the + oauth file generated from the authorization step) 8. Look at some stats within CKAN @@ -109,6 +113,39 @@ Installation 9. Consider running the import command reguarly as a cron job, or remember to run it by hand, or your statistics won't get updated. + +Authorization +-------------- + +Before you can access the data, you need to set up the OAuth details by following the `instructions `_. The outcome will be a file called credentials.json, which should look like credentials.json.template with the relevant fields completed. The steps are reproduced below for convenience: + +1. Visit the `Google APIs Console <https://code.google.com/apis/console>`_ + +2. Sign in and create a project or use an existing project. + +3. 
In the `Services pane `_ , activate the Analytics API for your project. If prompted, read and accept the terms of service. + +4. Go to the `API Access pane `_ + +5. Click Create an OAuth 2.0 client ID.... + +6. Fill out the Branding Information fields and click Next. + +7. In Client ID Settings, set Application type to Installed application. + +8. Click Create client ID + +9. The details you need below are Client ID, Client secret, and Redirect URIs + + +Once you have set up your credentials.json file you can generate an oauth token file by using the +following command, which will store your oauth token in a file called token.dat once you have finished +giving permission in the browser:: + + $ paster getauthtoken --config=../ckan/development.ini + + + Testing ------- diff --git a/ckanext/googleanalytics/commands.py b/ckanext/googleanalytics/commands.py index 5933db9..7d23c61 100644 --- a/ckanext/googleanalytics/commands.py +++ b/ckanext/googleanalytics/commands.py @@ -1,3 +1,5 @@ +import os +import re import logging import datetime import time @@ -13,6 +15,35 @@ log = logging.getLogger('ckanext.googleanalytics') PACKAGE_URL = '/dataset/' # XXX get from routes... DEFAULT_RESOURCE_URL_TAG = '/downloads/' +RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/[a-z0-9-_]+') +DATASET_EDIT_REGEX = re.compile('/dataset/edit/([a-z0-9-_]+)') + +class GetAuthToken(CkanCommand): + """ Gets the Google auth token + + Usage: paster getauthtoken <credentials_file> + + Where <credentials_file> is the file name containing the details + for the service (obtained from https://code.google.com/apis/console). + By default this is set to credentials.json + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 1 + min_args = 0 + + def command(self): + """ + In this case we don't want a valid service, but rather just to + force the user through the auth flow. We allow this to complete to + act as a form of verification instead of just getting the token and + assuming it is correct. 
+ """ + from ga_auth import init_service + init_service('token.dat', + self.args[0] if self.args + else 'credentials.json') + class InitDB(CkanCommand): """Initialise the local stats database tables @@ -37,14 +68,15 @@ class LoadAnalytics(CkanCommand): in a local database Options: - internal [date] use ckan internal tracking tables + internal [date] use ckan internal tracking tables + token_file specifies the OAUTH token file date specifies start date for retrieving analytics data YYYY-MM-DD format """ summary = __doc__.split('\n')[0] usage = __doc__ - max_args = 2 - min_args = 0 + max_args = 3 + min_args = 1 TEST_HOST = None CONFIG = None @@ -71,10 +103,10 @@ class LoadAnalytics(CkanCommand): engine.execute(sql) for url, count in packages_data.iteritems(): - if url.startswith(DEFAULT_RESOURCE_URL_TAG): + # If it matches the resource then we should mark it as a resource. + # For resources we don't currently find the package ID. + if RESOURCE_URL_REGEX.match(url): tracking_type = 'resource' - # remove the leading identifier - url = url[len(DEFAULT_RESOURCE_URL_TAG):] else: tracking_type = 'page' @@ -91,6 +123,14 @@ class LoadAnalytics(CkanCommand): WHERE t.package_id IS NULL AND tracking_type = 'page';''' engine.execute(sql, PACKAGE_URL) + # get ids for dataset edit urls which aren't captured otherwise + sql = '''UPDATE tracking_summary t + SET package_id = COALESCE( + (SELECT id FROM package p WHERE t.url = %s || p.name) + ,'~~not~found~~') + WHERE t.package_id = '~~not~found~~' AND tracking_type = 'page';''' + engine.execute(sql, '%sedit/' % PACKAGE_URL) + # update summary totals for resources sql = '''UPDATE tracking_summary t1 SET running_total = ( @@ -128,9 +168,9 @@ class LoadAnalytics(CkanCommand): engine.execute(sql) def bulk_import(self): - if len(self.args) == 2: + if len(self.args) == 3: # Get summeries from specified date - start_date = datetime.datetime.strptime(self.args[1], '%Y-%m-%d') + start_date = datetime.datetime.strptime(self.args[2], 
'%Y-%m-%d') else: # No date given. See when we last have data for and get data # from 2 days before then in case new data is available. @@ -156,8 +196,8 @@ class LoadAnalytics(CkanCommand): # sleep to rate limit requests time.sleep(0.25) start_date = stop_date - log.info('%s recieved %s' % (len(packages_data), start_date)) - print '%s recieved %s' % (len(packages_data), start_date) + log.info('%s received %s' % (len(packages_data), start_date)) + print '%s received %s' % (len(packages_data), start_date) def get_ga_data_new(self, start_date=None, end_date=None): """Get raw data from Google Analtyics for packages and @@ -171,7 +211,7 @@ class LoadAnalytics(CkanCommand): end_date = end_date.strftime("%Y-%m-%d") packages = {} - query = 'ga:pagePath=~^%s,ga:pagePath=~^%s' % \ + query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \ (PACKAGE_URL, self.resource_url_tag) metrics = 'ga:uniquePageviews' sort = '-ga:uniquePageviews' @@ -179,33 +219,52 @@ class LoadAnalytics(CkanCommand): start_index = 1 max_results = 10000 # data retrival is chunked - while True: - feed = self.ga_query(query_filter=query, - from_date=start_date, + completed = False + while not completed: + results = self.service.data().ga().get(ids='ga:%s' % self.profile_id, + filters=query, + dimensions='ga:pagePath', + start_date=start_date, start_index=start_index, max_results=max_results, metrics=metrics, sort=sort, - to_date=end_date) - for entry in feed.entry: - for dim in entry.dimension: - if dim.name == "ga:pagePath": - package = dim.value - count = entry.get_metric( - 'ga:uniquePageviews').value or 0 - packages[package] = int(count) - if len(feed.entry) < max_results: - break + end_date=end_date).execute() + result_count = len(results.get('rows', [])) + if result_count < max_results: + completed = True + + for result in results.get('rows', []): + package = result[0] + package = '/' + '/'.join(package.split('/')[2:]) + count = result[1] + packages[package] = int(count) + start_index += max_results + # rate 
limiting - time.sleep(0.25) + time.sleep(0.2) return packages def parse_and_save(self): """Grab raw data from Google Analytics and save to the database""" + from ga_auth import (init_service, get_profile_id) + + tokenfile = self.args[0] + if not os.path.exists(tokenfile): + raise Exception('Cannot find the token file %s' % self.args[0]) + + try: + self.service = init_service(self.args[0], None) + except TypeError: + print ('Have you correctly run the getauthtoken task and ' + 'specified the correct file here') + raise Exception('Unable to create a service') + self.profile_id = get_profile_id(self.service) + if len(self.args): - if self.args[0].lower() != 'internal': - raise Exception('Illegal argument %s' % self.args[0]) + if len(self.args) > 1 and self.args[1].lower() != 'internal': + raise Exception('Illegal argument %s' % self.args[1]) self.bulk_import() else: packages_data = self.get_ga_data() @@ -255,19 +314,19 @@ class LoadAnalytics(CkanCommand): http_client=self.TEST_HOST) else: my_client = client.AnalyticsClient(source=SOURCE_APP_NAME) - my_client.ClientLogin(username, password, SOURCE_APP_NAME) - account_query = client.AccountFeedQuery({'max-results': '300'}) - feed = my_client.GetAccountFeed(account_query) - table_id = None - for entry in feed.entry: - if entry.get_property("ga:webPropertyId").value == ga_id: - table_id = entry.table_id.text - break - if not table_id: - msg = "Couldn't find a profile with id '%s'" % ga_id - raise Exception(msg) - self.table_id = table_id - self.client = my_client + #my_client.ClientLogin(username, password, SOURCE_APP_NAME) + #account_query = client.AccountFeedQuery({'max-results': '300'}) + #feed = my_client.GetAccountFeed(account_query) + #table_id = None + #for entry in feed.entry: + # if entry.get_property("ga:webPropertyId").value == ga_id: + # table_id = entry.table_id.text + # break + #if not table_id: + # msg = "Couldn't find a profile with id '%s'" % ga_id + # raise Exception(msg) + #self.table_id = table_id + 
#self.client = my_client def ga_query(self, query_filter=None, from_date=None, to_date=None, start_index=1, max_results=10000, metrics=None, sort=None): @@ -280,18 +339,18 @@ class LoadAnalytics(CkanCommand): metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews' if not sort: sort = '-ga:newVisits' - query = client.DataFeedQuery({'ids': '%s' % self.table_id, - 'start-date': from_date, - 'end-date': to_date, - 'dimensions': 'ga:pagePath', - 'metrics': metrics, - 'sort': sort, - 'start-index': start_index, - 'filters': query_filter, - 'max-results': max_results - }) - feed = self.client.GetDataFeed(query) - return feed + + results = self.service.data().ga().get(ids='ga:' + self.profile_id, + start_date=from_date, + end_date=to_date, + dimensions='ga:pagePath', + metrics=metrics, + sort=sort, + start_index=start_index, + filters=query_filter, + max_results=max_results + ).execute() + return results def get_ga_data(self, query_filter=None, start_date=None, end_date=None): """Get raw data from Google Analtyics for packages and @@ -311,14 +370,12 @@ class LoadAnalytics(CkanCommand): dates = {'recent': recent_date, 'ever': floor_date} for date_name, date in dates.items(): for query in queries: - feed = self.ga_query(query_filter=query, + results = self.ga_query(query_filter=query, from_date=date) - for entry in feed.entry: - for dim in entry.dimension: - if dim.name == "ga:pagePath": - package = dim.value - count = entry.get_metric( - 'ga:uniquePageviews').value or 0 - packages.setdefault(package, {})[date_name] = count + for result in results.get('rows'): + package = result[0] + package = '/' + '/'.join(package.split('/')[2:]) + count = result[1] + packages.setdefault(package, {})[date_name] = count return packages diff --git a/ckanext/googleanalytics/ga_auth.py b/ckanext/googleanalytics/ga_auth.py new file mode 100644 index 0000000..c556d1e --- /dev/null +++ b/ckanext/googleanalytics/ga_auth.py @@ -0,0 +1,69 @@ +import os +import httplib2 +from 
apiclient.discovery import build +from oauth2client.client import flow_from_clientsecrets +from oauth2client.file import Storage +from oauth2client.tools import run + +from pylons import config + + +def _prepare_credentials(token_filename, credentials_filename): + """ + Either returns the user's oauth credentials or uses the credentials + file to generate a token (by forcing the user to log in via the browser) + """ + storage = Storage(token_filename) + credentials = storage.get() + + if credentials is None or credentials.invalid: + flow = flow_from_clientsecrets(credentials_filename, + scope='https://www.googleapis.com/auth/analytics.readonly', + message="Can't find the credentials file") + credentials = run(flow, storage) + + return credentials + + +def init_service(token_file, credentials_file): + """ + Given a file containing the user's oauth token (and another with + credentials in case we need to generate the token), return a + service object representing the analytics API. + """ + http = httplib2.Http() + + credentials = _prepare_credentials(token_file, credentials_file) + http = credentials.authorize(http) # authorize the http object + + return build('analytics', 'v3', http=http) + + +def get_profile_id(service): + """ + Get the profile ID for this user and the service specified by the + 'googleanalytics.id' configuration option. This function iterates + over all of the accounts available to the user who invoked the + service to find one where the account name matches (in case the + user has several). 
+ """ + accounts = service.management().accounts().list().execute() + + if not accounts.get('items'): + return None + + accountName = config.get('googleanalytics.account') + webPropertyId = config.get('googleanalytics.id') + for acc in accounts.get('items'): + if acc.get('name') == accountName: + accountId = acc.get('id') + + webproperties = service.management().webproperties().list(accountId=accountId).execute() + + profiles = service.management().profiles().list( + accountId=accountId, webPropertyId=webPropertyId).execute() + + if profiles.get('items'): + return profiles.get('items')[0].get('id') + + return None diff --git a/setup.py b/setup.py index 6764221..34de505 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,8 @@ setup( include_package_data=True, zip_safe=False, install_requires=[ - 'gdata' + 'gdata', + 'google-api-python-client' ], entry_points=\ """ @@ -31,5 +32,6 @@ setup( [paste.paster_command] loadanalytics = ckanext.googleanalytics.commands:LoadAnalytics initdb = ckanext.googleanalytics.commands:InitDB + getauthtoken = ckanext.googleanalytics.commands:GetAuthToken """, )