Merge pull request #3 from okfn/v3support

V3support
This commit is contained in:
Ross Jones 2012-10-18 07:22:12 -07:00
commit c4a720840b
5 changed files with 249 additions and 95 deletions

2
.gitignore vendored
View File

@ -6,3 +6,5 @@ syntax: glob
*~
build/
dist/
credentials.json
token.dat

View File

@ -3,7 +3,7 @@ CKAN Google Analytics Extension
**Status:** Production
**CKAN Version:** 1.5.*
**CKAN Version:** >= 1.5.*
Overview
@ -32,6 +32,7 @@ Installation
::
googleanalytics.id = UA-1010101-1
googleanalytics.account = Account name (i.e. data.gov.uk, see top level item at https://www.google.com/analytics)
googleanalytics.username = googleaccount@gmail.com
googleanalytics.password = googlepassword
@ -54,7 +55,11 @@ Installation
ckan.plugins = googleanalytics
(If there are other plugins activated, add this to the list. Each
plugin should be separated with a space)
plugin should be separated with a space). If you are using this plugin
with a version of CKAN earlier than 2.0, you should also set the following
to make sure the correct templates are found for the reports::
ckan.legacy_templates = true
Finally, there are some optional configuration settings (shown here
@ -85,6 +90,8 @@ Installation
If ``track_events`` is set, Google Analytics event tracking will be
enabled.
Follow the steps described in the Authorization section below.
5. Restart CKAN (e.g. by restarting Apache)
6. Wait a while for some stats to be recorded in Google
@ -92,9 +99,10 @@ Installation
7. Import Google stats by running the following command from
``src/ckanext-googleanalytics``::
paster loadanalytics --config=../ckan/development.ini
paster loadanalytics token.dat --config=../ckan/development.ini
(Of course, pointing config at your specific site config)
(Of course, pointing config at your specific site config and token.dat at the
oauth file generated from the authorization step)
8. Look at some stats within CKAN
@ -109,6 +117,39 @@ Installation
9. Consider running the import command regularly as a cron job, or
remember to run it by hand, or your statistics won't get updated.
Authorization
--------------
Before you can access the data, you need to set up the OAuth details. You can do this by following the `instructions <https://developers.google.com/analytics/resources/tutorials/hello-analytics-api>`_, the outcome of which will be a file called credentials.json. It should look like credentials.json.template with the relevant fields completed. These steps are listed below for convenience:
1. Visit the `Google APIs Console <https://code.google.com/apis/console>`_
2. Sign-in and create a project or use an existing project.
3. In the `Services pane <https://code.google.com/apis/console#:services>`_ , activate Analytics API for your project. If prompted, read and accept the terms of service.
4. Go to the `API Access pane <https://code.google.com/apis/console/#:access>`_
5. Click Create an OAuth 2.0 client ID....
6. Fill out the Branding Information fields and click Next.
7. In Client ID Settings, set Application type to Installed application.
8. Click Create client ID
9. The details you need below are Client ID, Client secret, and Redirect URIs
Once you have set up your credentials.json file you can generate an oauth token file by using the
following command, which will store your oauth token in a file called token.dat once you have finished
giving permission in the browser::
$ paster getauthtoken --config=../ckan/development.ini
Testing
-------

View File

@ -1,10 +1,11 @@
import os
import re
import logging
import datetime
import time
from pylons import config as pylonsconfig
from ckan.lib.cli import CkanCommand
from gdata.analytics import client
import ckan.model as model
import dbutil
@ -13,6 +14,35 @@ log = logging.getLogger('ckanext.googleanalytics')
PACKAGE_URL = '/dataset/' # XXX get from routes...
DEFAULT_RESOURCE_URL_TAG = '/downloads/'
RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/([a-z0-9-_]+)')
DATASET_EDIT_REGEX = re.compile('/dataset/edit/([a-z0-9-_]+)')
class GetAuthToken(CkanCommand):
    """Gets the Google auth token

    Usage: paster getauthtoken <credentials_file>

    Where <credentials_file> is the file name containing the details
    for the service (obtained from https://code.google.com/apis/console).
    By default this is set to credentials.json
    """
    # The first docstring line is the one-line summary shown by paster;
    # the whole docstring is reused as the usage text.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    # The credentials file is the only (optional) positional argument.
    max_args = 1
    min_args = 0

    def command(self):
        """Run the auth flow, writing the token to ``token.dat``.

        In this case we don't want a valid service, but rather just to
        force the user through the auth flow. We allow this to complete to
        act as a form of verification instead of just getting the token and
        assuming it is correct.

        The credentials file is taken from the first positional argument
        when one is given, otherwise ``credentials.json`` is used.
        """
        # Imported inside the method rather than at module level.
        from ga_auth import init_service
        init_service('token.dat',
                     self.args[0] if self.args else 'credentials.json')
class InitDB(CkanCommand):
"""Initialise the local stats database tables
@ -24,8 +54,6 @@ class InitDB(CkanCommand):
def command(self):
self._load_config()
# funny dance we need to do to make sure we've got a
# configured session
model.Session.remove()
model.Session.configure(bind=model.meta.engine)
dbutil.init_tables()
@ -37,14 +65,15 @@ class LoadAnalytics(CkanCommand):
in a local database
Options:
internal [date] use ckan internal tracking tables
<token_file> internal [date] use ckan internal tracking tables
token_file specifies the OAUTH token file
date specifies start date for retrieving
analytics data YYYY-MM-DD format
"""
summary = __doc__.split('\n')[0]
usage = __doc__
max_args = 2
min_args = 0
max_args = 3
min_args = 1
TEST_HOST = None
CONFIG = None
@ -56,7 +85,7 @@ class LoadAnalytics(CkanCommand):
self.resource_url_tag = self.CONFIG.get(
'googleanalytics.resource_prefix',
DEFAULT_RESOURCE_URL_TAG)
self.setup_ga_connection()
# funny dance we need to do to make sure we've got a
# configured session
model.Session.remove()
@ -71,10 +100,10 @@ class LoadAnalytics(CkanCommand):
engine.execute(sql)
for url, count in packages_data.iteritems():
if url.startswith(DEFAULT_RESOURCE_URL_TAG):
# If it matches the resource then we should mark it as a resource.
# For resources we don't currently find the package ID.
if RESOURCE_URL_REGEX.match(url):
tracking_type = 'resource'
# remove the leading identifier
url = url[len(DEFAULT_RESOURCE_URL_TAG):]
else:
tracking_type = 'page'
@ -91,6 +120,14 @@ class LoadAnalytics(CkanCommand):
WHERE t.package_id IS NULL AND tracking_type = 'page';'''
engine.execute(sql, PACKAGE_URL)
# get ids for dataset edit urls which aren't captured otherwise
sql = '''UPDATE tracking_summary t
SET package_id = COALESCE(
(SELECT id FROM package p WHERE t.url = %s || p.name)
,'~~not~found~~')
WHERE t.package_id = '~~not~found~~' AND tracking_type = 'page';'''
engine.execute(sql, '%sedit/' % PACKAGE_URL)
# update summary totals for resources
sql = '''UPDATE tracking_summary t1
SET running_total = (
@ -128,9 +165,9 @@ class LoadAnalytics(CkanCommand):
engine.execute(sql)
def bulk_import(self):
if len(self.args) == 2:
if len(self.args) == 3:
# Get summaries from specified date
start_date = datetime.datetime.strptime(self.args[1], '%Y-%m-%d')
start_date = datetime.datetime.strptime(self.args[2], '%Y-%m-%d')
else:
# No date given. See when we last have data for and get data
# from 2 days before then in case new data is available.
@ -156,8 +193,8 @@ class LoadAnalytics(CkanCommand):
# sleep to rate limit requests
time.sleep(0.25)
start_date = stop_date
log.info('%s recieved %s' % (len(packages_data), start_date))
print '%s recieved %s' % (len(packages_data), start_date)
log.info('%s received %s' % (len(packages_data), start_date))
print '%s received %s' % (len(packages_data), start_date)
def get_ga_data_new(self, start_date=None, end_date=None):
"""Get raw data from Google Analtyics for packages and
@ -171,7 +208,7 @@ class LoadAnalytics(CkanCommand):
end_date = end_date.strftime("%Y-%m-%d")
packages = {}
query = 'ga:pagePath=~^%s,ga:pagePath=~^%s' % \
query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \
(PACKAGE_URL, self.resource_url_tag)
metrics = 'ga:uniquePageviews'
sort = '-ga:uniquePageviews'
@ -179,36 +216,57 @@ class LoadAnalytics(CkanCommand):
start_index = 1
max_results = 10000
# data retrieval is chunked
while True:
feed = self.ga_query(query_filter=query,
from_date=start_date,
completed = False
while not completed:
results = self.service.data().ga().get(ids='ga:%s' % self.profile_id,
filters=query,
dimensions='ga:pagePath',
start_date=start_date,
start_index=start_index,
max_results=max_results,
metrics=metrics,
sort=sort,
to_date=end_date)
for entry in feed.entry:
for dim in entry.dimension:
if dim.name == "ga:pagePath":
package = dim.value
count = entry.get_metric(
'ga:uniquePageviews').value or 0
packages[package] = int(count)
if len(feed.entry) < max_results:
break
end_date=end_date).execute()
result_count = len(results.get('rows', []))
if result_count < max_results:
completed = True
for result in results.get('rows', []):
package = result[0]
package = '/' + '/'.join(package.split('/')[2:])
count = result[1]
packages[package] = int(count)
start_index += max_results
# rate limiting
time.sleep(0.25)
time.sleep(0.2)
return packages
def parse_and_save(self):
"""Grab raw data from Google Analytics and save to the database"""
if len(self.args):
if self.args[0].lower() != 'internal':
raise Exception('Illegal argument %s' % self.args[0])
from ga_auth import (init_service, get_profile_id)
tokenfile = self.args[0]
if not os.path.exists(tokenfile):
raise Exception('Cannot find the token file %s' % self.args[0])
try:
self.service = init_service(self.args[0], None)
except TypeError:
print ('Have you correctly run the getauthtoken task and '
'specified the correct file here')
raise Exception('Unable to create a service')
self.profile_id = get_profile_id(self.service)
if len(self.args) > 1:
if len(self.args) > 2 and self.args[1].lower() != 'internal':
raise Exception('Illegal argument %s' % self.args[1])
self.bulk_import()
else:
packages_data = self.get_ga_data()
query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \
(PACKAGE_URL, self.resource_url_tag)
packages_data = self.get_ga_data(query_filter=query)
self.save_ga_data(packages_data)
log.info("Saved %s records from google" % len(packages_data))
@ -218,10 +276,11 @@ class LoadAnalytics(CkanCommand):
for identifier, visits in packages_data.items():
recently = visits.get('recent', 0)
ever = visits.get('ever', 0)
if identifier.startswith(self.resource_url_tag):
matches = RESOURCE_URL_REGEX.match(identifier)
if matches:
resource_url = identifier[len(self.resource_url_tag):]
resource = model.Session.query(model.Resource).autoflush(True)\
.filter_by(url=resource_url).first()
.filter_by(id=matches.group(1)).first()
if not resource:
log.warning("Couldn't find resource %s" % resource_url)
continue
@ -240,35 +299,6 @@ class LoadAnalytics(CkanCommand):
log.info("Updated %s with %s visits" % (item.id, visits))
model.Session.commit()
def setup_ga_connection(self):
"""Log into the Google Data API, and find out the ``table_id``
that is associated with the profile, for later querying
"""
SOURCE_APP_NAME = "CKAN Google Analytics Plugin"
username = self.CONFIG.get('googleanalytics.username')
password = self.CONFIG.get('googleanalytics.password')
ga_id = self.CONFIG.get('googleanalytics.id')
if not username or not password or not ga_id:
raise Exception("No googleanalytics profile info in config")
if self.TEST_HOST:
my_client = client.AnalyticsClient(source=SOURCE_APP_NAME,
http_client=self.TEST_HOST)
else:
my_client = client.AnalyticsClient(source=SOURCE_APP_NAME)
my_client.ClientLogin(username, password, SOURCE_APP_NAME)
account_query = client.AccountFeedQuery({'max-results': '300'})
feed = my_client.GetAccountFeed(account_query)
table_id = None
for entry in feed.entry:
if entry.get_property("ga:webPropertyId").value == ga_id:
table_id = entry.table_id.text
break
if not table_id:
msg = "Couldn't find a profile with id '%s'" % ga_id
raise Exception(msg)
self.table_id = table_id
self.client = my_client
def ga_query(self, query_filter=None, from_date=None, to_date=None,
start_index=1, max_results=10000, metrics=None, sort=None):
"""Execute a query against Google Analytics
@ -276,22 +306,26 @@ class LoadAnalytics(CkanCommand):
if not to_date:
now = datetime.datetime.now()
to_date = now.strftime("%Y-%m-%d")
if isinstance(from_date, datetime.date):
from_date = from_date.strftime("%Y-%m-%d")
if not metrics:
metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews'
if not sort:
sort = '-ga:newVisits'
query = client.DataFeedQuery({'ids': '%s' % self.table_id,
'start-date': from_date,
'end-date': to_date,
'dimensions': 'ga:pagePath',
'metrics': metrics,
'sort': sort,
'start-index': start_index,
'filters': query_filter,
'max-results': max_results
})
feed = self.client.GetDataFeed(query)
return feed
sort = '-ga:uniquePageviews'
print '%s -> %s' % (from_date, to_date)
results = self.service.data().ga().get(ids='ga:' + self.profile_id,
start_date=from_date,
end_date=to_date,
dimensions='ga:pagePath',
metrics=metrics,
sort=sort,
start_index=start_index,
filters=query_filter,
max_results=max_results
).execute()
return results
def get_ga_data(self, query_filter=None, start_date=None, end_date=None):
"""Get raw data from Google Analtyics for packages and
@ -306,19 +340,25 @@ class LoadAnalytics(CkanCommand):
recent_date = recent_date.strftime("%Y-%m-%d")
floor_date = datetime.date(2005, 1, 1)
packages = {}
queries = ['ga:pagePath=~^%s' % PACKAGE_URL,
'ga:pagePath=~^%s' % self.resource_url_tag]
queries = ['ga:pagePath=~%s' % PACKAGE_URL]
dates = {'recent': recent_date, 'ever': floor_date}
for date_name, date in dates.items():
for date_name, date in dates.iteritems():
for query in queries:
feed = self.ga_query(query_filter=query,
from_date=date)
for entry in feed.entry:
for dim in entry.dimension:
if dim.name == "ga:pagePath":
package = dim.value
count = entry.get_metric(
'ga:uniquePageviews').value or 0
packages.setdefault(package, {})[date_name] = count
return packages
results = self.ga_query(query_filter=query,
metrics='ga:uniquePageviews',
from_date=date)
if 'rows' in results:
for result in results.get('rows'):
package = result[0]
if not package.startswith(PACKAGE_URL):
package = '/' + '/'.join(package.split('/')[2:])
count = result[1]
# Make sure we add the different representations of the same
# dataset /mysite.com & /www.mysite.com ...
val = 0
if package in packages and date_name in packages[package]:
val += packages[package][date_name]
packages.setdefault(package, {})[date_name] = \
int(count) + val
return packages

View File

@ -0,0 +1,69 @@
import os
import httplib2
from apiclient.discovery import build
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import run
from pylons import config
def _prepare_credentials(token_filename, credentials_filename):
    """
    Return the user's oauth credentials.

    The credentials are first looked up in the storage backed by
    ``token_filename``. If nothing is stored there, or what is stored is
    flagged invalid, a flow is built from the client secrets in
    ``credentials_filename`` and run against that same storage (forcing
    the user to log in via the browser) to obtain fresh credentials.
    """
    token_store = Storage(token_filename)
    stored = token_store.get()

    # Usable credentials already on disk: nothing more to do.
    if stored is not None and not stored.invalid:
        return stored

    auth_flow = flow_from_clientsecrets(
        credentials_filename,
        scope='https://www.googleapis.com/auth/analytics.readonly',
        message="Can't find the credentials file")
    return run(auth_flow, token_store)
def init_service(token_file, credentials_file):
    """
    Build and return a service object for version 3 of the analytics API.

    ``token_file`` holds the user's oauth token; ``credentials_file`` is
    only needed when that token has to be generated. The credentials
    obtained from them are used to authorize the HTTP transport that the
    service is built on.
    """
    transport = httplib2.Http()
    user_credentials = _prepare_credentials(token_file, credentials_file)

    # authorize() hands back the http object to use for API requests
    authorized_transport = user_credentials.authorize(transport)
    return build('analytics', 'v3', http=authorized_transport)
def get_profile_id(service):
    """
    Get the profile ID for this user and the web property specified by
    the 'googleanalytics.id' configuration option.

    This function iterates over all of the accounts available to the user
    who invoked the service, looking for those whose name equals the
    'googleanalytics.account' configuration option (in case the user has
    several). For a matching account it lists the profiles of the
    configured web property and returns the id of the first one.

    Returns None when the user has no accounts, when no account name
    matches, or when no matching account has a profile for the configured
    web property.
    """
    accounts = service.management().accounts().list().execute()
    account_items = accounts.get('items')
    if not account_items:
        return None

    account_name = config.get('googleanalytics.account')
    web_property_id = config.get('googleanalytics.id')

    for account in account_items:
        if account.get('name') != account_name:
            continue

        # Only the profiles listing is needed; the web properties of the
        # account are not queried because their result was never used.
        profiles = service.management().profiles().list(
            accountId=account.get('id'),
            webPropertyId=web_property_id).execute()

        profile_items = profiles.get('items')
        if profile_items:
            return profile_items[0].get('id')

    return None

View File

@ -20,7 +20,8 @@ setup(
include_package_data=True,
zip_safe=False,
install_requires=[
'gdata'
'gdata',
'google-api-python-client'
],
entry_points=\
"""
@ -31,5 +32,6 @@ setup(
[paste.paster_command]
loadanalytics = ckanext.googleanalytics.commands:LoadAnalytics
initdb = ckanext.googleanalytics.commands:InitDB
getauthtoken = ckanext.googleanalytics.commands:GetAuthToken
""",
)