commit
c4a720840b
|
@ -6,3 +6,5 @@ syntax: glob
|
|||
*~
|
||||
build/
|
||||
dist/
|
||||
credentials.json
|
||||
token.dat
|
49
README.rst
49
README.rst
|
@ -3,7 +3,7 @@ CKAN Google Analytics Extension
|
|||
|
||||
**Status:** Production
|
||||
|
||||
**CKAN Version:** 1.5.*
|
||||
**CKAN Version:** >= 1.5.*
|
||||
|
||||
|
||||
Overview
|
||||
|
@ -32,6 +32,7 @@ Installation
|
|||
::
|
||||
|
||||
googleanalytics.id = UA-1010101-1
|
||||
googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics)
|
||||
googleanalytics.username = googleaccount@gmail.com
|
||||
googleanalytics.password = googlepassword
|
||||
|
||||
|
@ -54,7 +55,11 @@ Installation
|
|||
ckan.plugins = googleanalytics
|
||||
|
||||
(If there are other plugins activated, add this to the list. Each
|
||||
plugin should be separated with a space)
|
||||
plugin should be separated with a space). If you are using this plugin
|
||||
with a version of CKAN < 2.0 then you should also set the following to
|
||||
make sure the correct templates are found for the reports::
|
||||
|
||||
ckan.legacy_templates = true
|
||||
|
||||
|
||||
Finally, there are some optional configuration settings (shown here
|
||||
|
@ -85,6 +90,8 @@ Installation
|
|||
If ``track_events`` is set, Google Analytics event tracking will be
|
||||
enabled.
|
||||
|
||||
Follow the steps described in the Authorization section below.
|
||||
|
||||
5. Restart CKAN (e.g. by restarting Apache)
|
||||
|
||||
6. Wait a while for some stats to be recorded in Google
|
||||
|
@ -92,9 +99,10 @@ Installation
|
|||
7. Import Google stats by running the following command from
|
||||
``src/ckanext-googleanalytics``::
|
||||
|
||||
paster loadanalytics --config=../ckan/development.ini
|
||||
paster loadanalytics token.dat --config=../ckan/development.ini
|
||||
|
||||
(Of course, pointing config at your specific site config)
|
||||
(Of course, pointing config at your specific site config and token.dat at the
|
||||
oauth file generated from the authorization step)
|
||||
|
||||
8. Look at some stats within CKAN
|
||||
|
||||
|
@ -109,6 +117,39 @@ Installation
|
|||
9. Consider running the import command regularly as a cron job, or
|
||||
remember to run it by hand, or your statistics won't get updated.
|
||||
|
||||
|
||||
Authorization
|
||||
--------------
|
||||
|
||||
Before you can access the data, you need to set up the OAuth details by following the `instructions <https://developers.google.com/analytics/resources/tutorials/hello-analytics-api>`_. The outcome will be a file called credentials.json, which should look like credentials.json.template with the relevant fields completed. The steps are reproduced below for convenience:
|
||||
|
||||
1. Visit the `Google APIs Console <https://code.google.com/apis/console>`_
|
||||
|
||||
2. Sign-in and create a project or use an existing project.
|
||||
|
||||
3. In the `Services pane <https://code.google.com/apis/console#:services>`_ , activate Analytics API for your project. If prompted, read and accept the terms of service.
|
||||
|
||||
4. Go to the `API Access pane <https://code.google.com/apis/console/#:access>`_
|
||||
|
||||
5. Click Create an OAuth 2.0 client ID....
|
||||
|
||||
6. Fill out the Branding Information fields and click Next.
|
||||
|
||||
7. In Client ID Settings, set Application type to Installed application.
|
||||
|
||||
8. Click Create client ID
|
||||
|
||||
9. The details you need below are Client ID, Client secret, and Redirect URIs
|
||||
|
||||
|
||||
Once you have set up your credentials.json file you can generate an oauth token file by using the
|
||||
following command, which will store your oauth token in a file called token.dat once you have finished
|
||||
giving permission in the browser::
|
||||
|
||||
$ paster getauthtoken --config=../ckan/development.ini
|
||||
|
||||
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import os
|
||||
import re
|
||||
import logging
|
||||
import datetime
|
||||
import time
|
||||
|
||||
from pylons import config as pylonsconfig
|
||||
from ckan.lib.cli import CkanCommand
|
||||
from gdata.analytics import client
|
||||
import ckan.model as model
|
||||
|
||||
import dbutil
|
||||
|
@ -13,6 +14,35 @@ log = logging.getLogger('ckanext.googleanalytics')
|
|||
PACKAGE_URL = '/dataset/' # XXX get from routes...
|
||||
DEFAULT_RESOURCE_URL_TAG = '/downloads/'
|
||||
|
||||
RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/([a-z0-9-_]+)')
|
||||
DATASET_EDIT_REGEX = re.compile('/dataset/edit/([a-z0-9-_]+)')
|
||||
|
||||
|
||||
class GetAuthToken(CkanCommand):
    """ Get's the Google auth token

    Usage: paster getauthtoken <credentials_file>

    Where <credentials_file> is the file name containing the details
    for the service (obtained from https://code.google.com/apis/console).
    By default this is set to credentials.json
    """
    # Command metadata derived from the class docstring above; presumably
    # consumed by the CkanCommand base class for help output and argument
    # count checks -- confirm against ckan.lib.cli.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    min_args = 0
    max_args = 1

    def command(self):
        """
        Walk the user through the OAuth flow and store the token.

        The service object built by ``init_service`` is discarded on
        purpose: building it is only used as a check that the freshly
        obtained token really works, rather than assuming it does.
        The token is written to ``token.dat``; the credentials file is
        the first positional argument, or ``credentials.json`` when no
        argument was given.
        """
        from ga_auth import init_service

        if self.args:
            credentials_file = self.args[0]
        else:
            credentials_file = 'credentials.json'

        init_service('token.dat', credentials_file)
|
||||
|
||||
|
||||
class InitDB(CkanCommand):
|
||||
"""Initialise the local stats database tables
|
||||
|
@ -24,8 +54,6 @@ class InitDB(CkanCommand):
|
|||
|
||||
def command(self):
|
||||
self._load_config()
|
||||
# funny dance we need to do to make sure we've got a
|
||||
# configured session
|
||||
model.Session.remove()
|
||||
model.Session.configure(bind=model.meta.engine)
|
||||
dbutil.init_tables()
|
||||
|
@ -37,14 +65,15 @@ class LoadAnalytics(CkanCommand):
|
|||
in a local database
|
||||
|
||||
Options:
|
||||
internal [date] use ckan internal tracking tables
|
||||
<token_file> internal [date] use ckan internal tracking tables
|
||||
token_file specifies the OAUTH token file
|
||||
date specifies start date for retrieving
|
||||
analytics data YYYY-MM-DD format
|
||||
"""
|
||||
summary = __doc__.split('\n')[0]
|
||||
usage = __doc__
|
||||
max_args = 2
|
||||
min_args = 0
|
||||
max_args = 3
|
||||
min_args = 1
|
||||
TEST_HOST = None
|
||||
CONFIG = None
|
||||
|
||||
|
@ -56,7 +85,7 @@ class LoadAnalytics(CkanCommand):
|
|||
self.resource_url_tag = self.CONFIG.get(
|
||||
'googleanalytics.resource_prefix',
|
||||
DEFAULT_RESOURCE_URL_TAG)
|
||||
self.setup_ga_connection()
|
||||
|
||||
# funny dance we need to do to make sure we've got a
|
||||
# configured session
|
||||
model.Session.remove()
|
||||
|
@ -71,10 +100,10 @@ class LoadAnalytics(CkanCommand):
|
|||
engine.execute(sql)
|
||||
|
||||
for url, count in packages_data.iteritems():
|
||||
if url.startswith(DEFAULT_RESOURCE_URL_TAG):
|
||||
# If it matches the resource then we should mark it as a resource.
|
||||
# For resources we don't currently find the package ID.
|
||||
if RESOURCE_URL_REGEX.match(url):
|
||||
tracking_type = 'resource'
|
||||
# remove the leading identifier
|
||||
url = url[len(DEFAULT_RESOURCE_URL_TAG):]
|
||||
else:
|
||||
tracking_type = 'page'
|
||||
|
||||
|
@ -91,6 +120,14 @@ class LoadAnalytics(CkanCommand):
|
|||
WHERE t.package_id IS NULL AND tracking_type = 'page';'''
|
||||
engine.execute(sql, PACKAGE_URL)
|
||||
|
||||
# get ids for dataset edit urls which aren't captured otherwise
|
||||
sql = '''UPDATE tracking_summary t
|
||||
SET package_id = COALESCE(
|
||||
(SELECT id FROM package p WHERE t.url = %s || p.name)
|
||||
,'~~not~found~~')
|
||||
WHERE t.package_id = '~~not~found~~' AND tracking_type = 'page';'''
|
||||
engine.execute(sql, '%sedit/' % PACKAGE_URL)
|
||||
|
||||
# update summary totals for resources
|
||||
sql = '''UPDATE tracking_summary t1
|
||||
SET running_total = (
|
||||
|
@ -128,9 +165,9 @@ class LoadAnalytics(CkanCommand):
|
|||
engine.execute(sql)
|
||||
|
||||
def bulk_import(self):
|
||||
if len(self.args) == 2:
|
||||
if len(self.args) == 3:
|
||||
# Get summeries from specified date
|
||||
start_date = datetime.datetime.strptime(self.args[1], '%Y-%m-%d')
|
||||
start_date = datetime.datetime.strptime(self.args[2], '%Y-%m-%d')
|
||||
else:
|
||||
# No date given. See when we last have data for and get data
|
||||
# from 2 days before then in case new data is available.
|
||||
|
@ -156,8 +193,8 @@ class LoadAnalytics(CkanCommand):
|
|||
# sleep to rate limit requests
|
||||
time.sleep(0.25)
|
||||
start_date = stop_date
|
||||
log.info('%s recieved %s' % (len(packages_data), start_date))
|
||||
print '%s recieved %s' % (len(packages_data), start_date)
|
||||
log.info('%s received %s' % (len(packages_data), start_date))
|
||||
print '%s received %s' % (len(packages_data), start_date)
|
||||
|
||||
def get_ga_data_new(self, start_date=None, end_date=None):
|
||||
"""Get raw data from Google Analtyics for packages and
|
||||
|
@ -171,7 +208,7 @@ class LoadAnalytics(CkanCommand):
|
|||
end_date = end_date.strftime("%Y-%m-%d")
|
||||
|
||||
packages = {}
|
||||
query = 'ga:pagePath=~^%s,ga:pagePath=~^%s' % \
|
||||
query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \
|
||||
(PACKAGE_URL, self.resource_url_tag)
|
||||
metrics = 'ga:uniquePageviews'
|
||||
sort = '-ga:uniquePageviews'
|
||||
|
@ -179,36 +216,57 @@ class LoadAnalytics(CkanCommand):
|
|||
start_index = 1
|
||||
max_results = 10000
|
||||
# data retrival is chunked
|
||||
while True:
|
||||
feed = self.ga_query(query_filter=query,
|
||||
from_date=start_date,
|
||||
completed = False
|
||||
while not completed:
|
||||
results = self.service.data().ga().get(ids='ga:%s' % self.profile_id,
|
||||
filters=query,
|
||||
dimensions='ga:pagePath',
|
||||
start_date=start_date,
|
||||
start_index=start_index,
|
||||
max_results=max_results,
|
||||
metrics=metrics,
|
||||
sort=sort,
|
||||
to_date=end_date)
|
||||
for entry in feed.entry:
|
||||
for dim in entry.dimension:
|
||||
if dim.name == "ga:pagePath":
|
||||
package = dim.value
|
||||
count = entry.get_metric(
|
||||
'ga:uniquePageviews').value or 0
|
||||
packages[package] = int(count)
|
||||
if len(feed.entry) < max_results:
|
||||
break
|
||||
end_date=end_date).execute()
|
||||
result_count = len(results.get('rows', []))
|
||||
if result_count < max_results:
|
||||
completed = True
|
||||
|
||||
for result in results.get('rows', []):
|
||||
package = result[0]
|
||||
package = '/' + '/'.join(package.split('/')[2:])
|
||||
count = result[1]
|
||||
packages[package] = int(count)
|
||||
|
||||
start_index += max_results
|
||||
|
||||
# rate limiting
|
||||
time.sleep(0.25)
|
||||
time.sleep(0.2)
|
||||
return packages
|
||||
|
||||
def parse_and_save(self):
|
||||
"""Grab raw data from Google Analytics and save to the database"""
|
||||
if len(self.args):
|
||||
if self.args[0].lower() != 'internal':
|
||||
raise Exception('Illegal argument %s' % self.args[0])
|
||||
from ga_auth import (init_service, get_profile_id)
|
||||
|
||||
tokenfile = self.args[0]
|
||||
if not os.path.exists(tokenfile):
|
||||
raise Exception('Cannot find the token file %s' % self.args[0])
|
||||
|
||||
try:
|
||||
self.service = init_service(self.args[0], None)
|
||||
except TypeError:
|
||||
print ('Have you correctly run the getauthtoken task and '
|
||||
'specified the correct file here')
|
||||
raise Exception('Unable to create a service')
|
||||
self.profile_id = get_profile_id(self.service)
|
||||
|
||||
if len(self.args) > 1:
|
||||
if len(self.args) > 2 and self.args[1].lower() != 'internal':
|
||||
raise Exception('Illegal argument %s' % self.args[1])
|
||||
self.bulk_import()
|
||||
else:
|
||||
packages_data = self.get_ga_data()
|
||||
query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \
|
||||
(PACKAGE_URL, self.resource_url_tag)
|
||||
packages_data = self.get_ga_data(query_filter=query)
|
||||
self.save_ga_data(packages_data)
|
||||
log.info("Saved %s records from google" % len(packages_data))
|
||||
|
||||
|
@ -218,10 +276,11 @@ class LoadAnalytics(CkanCommand):
|
|||
for identifier, visits in packages_data.items():
|
||||
recently = visits.get('recent', 0)
|
||||
ever = visits.get('ever', 0)
|
||||
if identifier.startswith(self.resource_url_tag):
|
||||
matches = RESOURCE_URL_REGEX.match(identifier)
|
||||
if matches:
|
||||
resource_url = identifier[len(self.resource_url_tag):]
|
||||
resource = model.Session.query(model.Resource).autoflush(True)\
|
||||
.filter_by(url=resource_url).first()
|
||||
.filter_by(id=matches.group(1)).first()
|
||||
if not resource:
|
||||
log.warning("Couldn't find resource %s" % resource_url)
|
||||
continue
|
||||
|
@ -240,35 +299,6 @@ class LoadAnalytics(CkanCommand):
|
|||
log.info("Updated %s with %s visits" % (item.id, visits))
|
||||
model.Session.commit()
|
||||
|
||||
def setup_ga_connection(self):
|
||||
"""Log into the Google Data API, and find out the ``table_id``
|
||||
that is associated with the profile, for later querying
|
||||
"""
|
||||
SOURCE_APP_NAME = "CKAN Google Analytics Plugin"
|
||||
username = self.CONFIG.get('googleanalytics.username')
|
||||
password = self.CONFIG.get('googleanalytics.password')
|
||||
ga_id = self.CONFIG.get('googleanalytics.id')
|
||||
if not username or not password or not ga_id:
|
||||
raise Exception("No googleanalytics profile info in config")
|
||||
if self.TEST_HOST:
|
||||
my_client = client.AnalyticsClient(source=SOURCE_APP_NAME,
|
||||
http_client=self.TEST_HOST)
|
||||
else:
|
||||
my_client = client.AnalyticsClient(source=SOURCE_APP_NAME)
|
||||
my_client.ClientLogin(username, password, SOURCE_APP_NAME)
|
||||
account_query = client.AccountFeedQuery({'max-results': '300'})
|
||||
feed = my_client.GetAccountFeed(account_query)
|
||||
table_id = None
|
||||
for entry in feed.entry:
|
||||
if entry.get_property("ga:webPropertyId").value == ga_id:
|
||||
table_id = entry.table_id.text
|
||||
break
|
||||
if not table_id:
|
||||
msg = "Couldn't find a profile with id '%s'" % ga_id
|
||||
raise Exception(msg)
|
||||
self.table_id = table_id
|
||||
self.client = my_client
|
||||
|
||||
def ga_query(self, query_filter=None, from_date=None, to_date=None,
|
||||
start_index=1, max_results=10000, metrics=None, sort=None):
|
||||
"""Execute a query against Google Analytics
|
||||
|
@ -276,22 +306,26 @@ class LoadAnalytics(CkanCommand):
|
|||
if not to_date:
|
||||
now = datetime.datetime.now()
|
||||
to_date = now.strftime("%Y-%m-%d")
|
||||
if isinstance(from_date, datetime.date):
|
||||
from_date = from_date.strftime("%Y-%m-%d")
|
||||
if not metrics:
|
||||
metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews'
|
||||
if not sort:
|
||||
sort = '-ga:newVisits'
|
||||
query = client.DataFeedQuery({'ids': '%s' % self.table_id,
|
||||
'start-date': from_date,
|
||||
'end-date': to_date,
|
||||
'dimensions': 'ga:pagePath',
|
||||
'metrics': metrics,
|
||||
'sort': sort,
|
||||
'start-index': start_index,
|
||||
'filters': query_filter,
|
||||
'max-results': max_results
|
||||
})
|
||||
feed = self.client.GetDataFeed(query)
|
||||
return feed
|
||||
sort = '-ga:uniquePageviews'
|
||||
|
||||
print '%s -> %s' % (from_date, to_date)
|
||||
|
||||
results = self.service.data().ga().get(ids='ga:' + self.profile_id,
|
||||
start_date=from_date,
|
||||
end_date=to_date,
|
||||
dimensions='ga:pagePath',
|
||||
metrics=metrics,
|
||||
sort=sort,
|
||||
start_index=start_index,
|
||||
filters=query_filter,
|
||||
max_results=max_results
|
||||
).execute()
|
||||
return results
|
||||
|
||||
def get_ga_data(self, query_filter=None, start_date=None, end_date=None):
|
||||
"""Get raw data from Google Analtyics for packages and
|
||||
|
@ -306,19 +340,25 @@ class LoadAnalytics(CkanCommand):
|
|||
recent_date = recent_date.strftime("%Y-%m-%d")
|
||||
floor_date = datetime.date(2005, 1, 1)
|
||||
packages = {}
|
||||
queries = ['ga:pagePath=~^%s' % PACKAGE_URL,
|
||||
'ga:pagePath=~^%s' % self.resource_url_tag]
|
||||
queries = ['ga:pagePath=~%s' % PACKAGE_URL]
|
||||
dates = {'recent': recent_date, 'ever': floor_date}
|
||||
for date_name, date in dates.items():
|
||||
for date_name, date in dates.iteritems():
|
||||
for query in queries:
|
||||
feed = self.ga_query(query_filter=query,
|
||||
from_date=date)
|
||||
for entry in feed.entry:
|
||||
for dim in entry.dimension:
|
||||
if dim.name == "ga:pagePath":
|
||||
package = dim.value
|
||||
count = entry.get_metric(
|
||||
'ga:uniquePageviews').value or 0
|
||||
packages.setdefault(package, {})[date_name] = count
|
||||
return packages
|
||||
results = self.ga_query(query_filter=query,
|
||||
metrics='ga:uniquePageviews',
|
||||
from_date=date)
|
||||
if 'rows' in results:
|
||||
for result in results.get('rows'):
|
||||
package = result[0]
|
||||
if not package.startswith(PACKAGE_URL):
|
||||
package = '/' + '/'.join(package.split('/')[2:])
|
||||
|
||||
count = result[1]
|
||||
# Make sure we add the different representations of the same
|
||||
# dataset /mysite.com & /www.mysite.com ...
|
||||
val = 0
|
||||
if package in packages and date_name in packages[package]:
|
||||
val += packages[package][date_name]
|
||||
packages.setdefault(package, {})[date_name] = \
|
||||
int(count) + val
|
||||
return packages
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
import os
|
||||
import httplib2
|
||||
from apiclient.discovery import build
|
||||
from oauth2client.client import flow_from_clientsecrets
|
||||
from oauth2client.file import Storage
|
||||
from oauth2client.tools import run
|
||||
|
||||
from pylons import config
|
||||
|
||||
|
||||
def _prepare_credentials(token_filename, credentials_filename):
    """
    Return OAuth credentials, generating them first if necessary.

    The token file is consulted first. When it holds no credentials, or
    the stored ones are flagged invalid, a read-only Analytics flow is
    built from the client-secrets file and run (which sends the user
    through the browser login) with the same token storage.

    :param token_filename: path of the file holding the OAuth token
    :param credentials_filename: path of the client-secrets JSON file,
        only read when a new token has to be generated
    :returns: the stored credentials, or those produced by the flow
    """
    token_store = Storage(token_filename)
    stored = token_store.get()

    # Usable stored credentials short-circuit the interactive flow.
    if stored is not None and not stored.invalid:
        return stored

    auth_flow = flow_from_clientsecrets(
        credentials_filename,
        scope='https://www.googleapis.com/auth/analytics.readonly',
        message="Can't find the credentials file")
    return run(auth_flow, token_store)
|
||||
|
||||
|
||||
def init_service(token_file, credentials_file):
    """
    Build the Analytics (v3) API service object for the token's owner.

    :param token_file: path of the file containing the user's OAuth token
    :param credentials_file: path of the client-secrets file, used only
        if the token has to be (re)generated
    :returns: the object returned by ``build('analytics', 'v3', ...)``,
        using an http transport authorized with the user's credentials
    """
    transport = httplib2.Http()

    oauth_credentials = _prepare_credentials(token_file, credentials_file)
    # authorize() hands back the http object to use for authorized calls
    authorized_transport = oauth_credentials.authorize(transport)

    return build('analytics', 'v3', http=authorized_transport)
|
||||
|
||||
|
||||
def get_profile_id(service):
    """
    Look up the Analytics profile id for the configured account.

    The accounts visible to the user behind ``service`` are listed and
    the one whose name equals the 'googleanalytics.account' configuration
    option is selected (a user may have several accounts). The profiles
    of the web property named by 'googleanalytics.id' are then listed for
    that account.

    :param service: analytics service object, e.g. the value returned by
        ``init_service``
    :returns: the id of the first profile listed, or ``None`` when the
        user has no accounts, no account carries the configured name, or
        the web property has no profiles
    """
    accounts = service.management().accounts().list().execute()
    account_items = accounts.get('items')
    if not account_items:
        return None

    account_name = config.get('googleanalytics.account')
    web_property_id = config.get('googleanalytics.id')

    # No early break: when several accounts share the configured name the
    # last one listed is used.
    account_id = None
    for account in account_items:
        if account.get('name') == account_name:
            account_id = account.get('id')

    # Nothing matched the configured account name: report "not found" the
    # same way as the other failure paths instead of querying profiles
    # with an account id that was never assigned.
    if account_id is None:
        return None

    profiles = service.management().profiles().list(
        accountId=account_id, webPropertyId=web_property_id).execute()

    profile_items = profiles.get('items')
    if profile_items:
        return profile_items[0].get('id')

    return None
|
4
setup.py
4
setup.py
|
@ -20,7 +20,8 @@ setup(
|
|||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
install_requires=[
|
||||
'gdata'
|
||||
'gdata',
|
||||
'google-api-python-client'
|
||||
],
|
||||
entry_points=\
|
||||
"""
|
||||
|
@ -31,5 +32,6 @@ setup(
|
|||
[paste.paster_command]
|
||||
loadanalytics = ckanext.googleanalytics.commands:LoadAnalytics
|
||||
initdb = ckanext.googleanalytics.commands:InitDB
|
||||
getauthtoken = ckanext.googleanalytics.commands:GetAuthToken
|
||||
""",
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue