From b69d89bf18876b53dde6e02d529780617e203a54 Mon Sep 17 00:00:00 2001 From: Seb Bacon Date: Mon, 4 Apr 2011 17:44:39 +0100 Subject: [PATCH] big refactor --- README.txt | 17 +++- ckanext/googleanalytics/commands.py | 132 ++++++++++++++++++++++++++ ckanext/googleanalytics/controller.py | 73 ++------------ ckanext/googleanalytics/dbutil.py | 88 +++++++++++++++++ ckanext/googleanalytics/model.py | 11 +++ ckanext/googleanalytics/plugin.py | 34 ++++++- setup.py | 3 + 7 files changed, 285 insertions(+), 73 deletions(-) create mode 100644 ckanext/googleanalytics/commands.py create mode 100644 ckanext/googleanalytics/dbutil.py create mode 100644 ckanext/googleanalytics/model.py diff --git a/README.txt b/README.txt index 283db06..6f1393c 100644 --- a/README.txt +++ b/README.txt @@ -25,13 +25,28 @@ Installation # the following *must* match profile name in GA dashboard googleanalytics.profile_name = mydomain.com/ +3. Wait a day or so for some stats to be recorded in Google Analytics -3. Look at some stats within CKAN +4. Import the Google stats by running the following command from + ``src/ckanext-googleanalytics``:: + + paster loadanalytics --config=../ckan/development.ini + + (Point ``--config`` at your own site's config file, of course) + +5. Look at some stats within CKAN Once your GA account has gathered some data, you can see some basic information about the most popular packages at: http://localhost:5000/analytics/package/top + By default the only data that is injected into the public-facing + website is on the package page, where the number of downloads is + displayed next to each resource. + +6. Consider running the import command as a daily cron job, or + remember to run it by hand! 
# ckanext/googleanalytics/commands.py
import logging
import datetime
from pylons import config
from ckan.lib.cli import CkanCommand
from gdata.analytics import client
import ckan.model as model
from sqlalchemy.orm import sessionmaker

import dbutil

log = logging.getLogger('ckanext.googleanalytics')
PACKAGE_URL = '/package/'  # XXX get from routes...
DEFAULT_RESOURCE_URL_TAG = '/downloads/'

# Format used for every date sent to the GA data feed.
GA_DATE_FORMAT = "%Y-%m-%d"
# Start date used for the "ever" totals (same value the code used before,
# now expressed as a string like every other date in the query).
GA_FLOOR_DATE = datetime.date(2005, 1, 1).strftime(GA_DATE_FORMAT)
# Size, in days, of the window reported as "recent".
RECENT_DAYS = 14


def _to_int(value):
    """Return ``value`` as an int, or 0 when it is empty or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _page_path(entry):
    """Return the ``ga:pagePath`` dimension of a feed entry, or None."""
    for dim in entry.dimension:
        if dim.name == "ga:pagePath":
            return dim.value
    return None


class LoadAnalytics(CkanCommand):
    """Parse data from Google Analytics API and store it in a local
    database
    """
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 0
    min_args = 0

    def command(self):
        """Load the CKAN config, connect to GA and import the statistics."""
        self._load_config()
        self.resource_url_tag = config.get('googleanalytics.resource_prefix',
                                           DEFAULT_RESOURCE_URL_TAG)
        self.setup_ga_connection()
        # funny dance we need to do to make sure we've got a
        # configured session
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        self.parse_and_save()

    def parse_and_save(self):
        """Fetch the per-path visit counts from GA and store them."""
        packages_data = self.get_ga_data()
        self.save_ga_data(packages_data)
        log.info("Saved %s records from google", len(packages_data))

    def save_ga_data(self, packages_data):
        """Write visit counts to the stats tables and commit.

        ``packages_data`` maps a GA page path to a dict with optional
        ``'recent'`` and ``'ever'`` counts (see ``get_ga_data``).  Paths
        starting with the resource prefix update ``resource_stats``; paths
        starting with ``PACKAGE_URL`` update ``package_stats``.  Paths that
        match no known resource or package are logged and skipped.
        """
        dbutil.init_tables()
        for identifier, visits in packages_data.items():
            recently = visits.get('recent', 0)
            ever = visits.get('ever', 0)
            if identifier.startswith(self.resource_url_tag):
                resource_url = identifier[len(self.resource_url_tag):]
                resource = model.Session.query(model.Resource)\
                    .autoflush(True)\
                    .filter_by(url=resource_url).first()
                if not resource:
                    log.warning("Couldn't find resource %s", resource_url)
                    continue
                dbutil.update_resource_visits(resource.id, recently, ever)
                log.info("Updated %s with %s visits", resource.id, visits)
                continue
            if not identifier.startswith(PACKAGE_URL):
                # Slicing a path without the prefix would yield a bogus name.
                log.warning("%s not a valid package name", identifier)
                continue
            package_name = identifier[len(PACKAGE_URL):]
            if "/" in package_name:
                # Sub-pages of a package (e.g. edit/history views).
                log.warning("%s not a valid package name", package_name)
                continue
            item = model.Package.by_name(package_name)
            if not item:
                log.warning("Couldn't find package %s", package_name)
                continue
            dbutil.update_package_visits(item.id, recently, ever)
            log.info("Updated %s with %s visits", item.id, visits)
        model.Session.commit()

    def setup_ga_connection(self):
        """Log in to GA and remember the client and the profile's table id.

        Reads ``googleanalytics.username``, ``googleanalytics.password`` and
        ``googleanalytics.profile_name`` from the config and sets
        ``self.client`` and ``self.table_id``.

        Raises ``Exception`` when the config values are missing or no
        account entry has a title equal to the profile name.
        """
        SOURCE_APP_NAME = "CKAN Google Analytics Plugin"
        username = config.get('googleanalytics.username')
        password = config.get('googleanalytics.password')
        profile_name = config.get('googleanalytics.profile_name')
        if not username or not password or not profile_name:
            raise Exception("No googleanalytics profile info in config")
        my_client = client.AnalyticsClient(source=SOURCE_APP_NAME)
        my_client.ClientLogin(username,
                              password,
                              SOURCE_APP_NAME)
        account_query = client.AccountFeedQuery({'max-results': '300'})
        feed = my_client.GetAccountFeed(account_query)
        table_id = None
        for entry in feed.entry:
            if entry.title.text == profile_name:
                table_id = entry.table_id.text
                break
        if not table_id:
            msg = "Couldn't find a profile called '%s'" % profile_name
            raise Exception(msg)
        self.table_id = table_id
        self.client = my_client

    def ga_query(self, query_filter=None, from_date=None):
        """Return the GA data feed for page paths matching ``query_filter``.

        ``from_date`` is the start date as a ``YYYY-MM-DD`` string; when it
        is None the floor date is used.  The end date is always today.
        """
        if from_date is None:
            from_date = GA_FLOOR_DATE
        to_date = datetime.datetime.now().strftime(GA_DATE_FORMAT)
        metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews'
        query = client.DataFeedQuery({'ids': '%s' % self.table_id,
                                      'start-date': from_date,
                                      'end-date': to_date,
                                      'dimensions': 'ga:pagePath',
                                      'metrics': metrics,
                                      'sort': '-ga:newVisits',
                                      'filters': query_filter,
                                      'max-results': '10000'
                                      })
        return self.client.GetDataFeed(query)

    def get_ga_data(self, query_filter=None):
        """Return a dictionary like
        {'identifier': {'recent':3, 'ever':6}}

        The identifier is the GA page path; the counts are the
        ``ga:uniquePageviews`` metric as integers, for the last
        ``RECENT_DAYS`` days and since the floor date respectively.

        ``query_filter`` is not used (the package and resource filters are
        built here); it is kept so existing callers keep working.
        """
        now = datetime.datetime.now()
        recent_date = (now - datetime.timedelta(RECENT_DAYS)).strftime(
            GA_DATE_FORMAT)
        packages = {}
        queries = ['ga:pagePath=~^%s' % PACKAGE_URL,
                   'ga:pagePath=~^%s' % self.resource_url_tag]
        dates = {'recent': recent_date, 'ever': GA_FLOOR_DATE}
        for date_name, date in dates.items():
            for query in queries:
                feed = self.ga_query(query_filter=query,
                                     from_date=date)
                for entry in feed.entry:
                    path = _page_path(entry)
                    if path is None:
                        # Without a page path there is nothing to key the
                        # count on; never reuse the previous entry's path.
                        continue
                    metric = entry.get_metric('ga:uniquePageviews')
                    count = _to_int(metric.value) if metric else 0
                    packages.setdefault(path, {})[date_name] = count
        return packages
# ckanext/googleanalytics/controller.py
log = logging.getLogger('ckanext.googleanalytics')


class GAController(BaseController):
    """Controller for the "top packages" analytics page.

    The statistics are read from the local stats tables through ``dbutil``;
    nothing here talks to Google Analytics.
    """

    def view(self):
        """Render ``index.html`` with ``c.top_packages`` filled in."""
        # No ``self.parse_ga_data()`` call here: this class defines no such
        # method, so the call would raise AttributeError on every request.
        c.top_packages = self.get_top_packages()
        return render('index.html')

    # NOTE(review): the patch context shows one more method between these
    # two (its last line is ``return "analyticscontroller"``).  Its ``def``
    # line is outside the visible hunks, so it is not reproduced here --
    # keep it unchanged.

    def get_top_packages(self):
        """Return what ``dbutil.get_top_packages()`` returns."""
        return dbutil.get_top_packages()
# ckanext/googleanalytics/dbutil.py
import ckan.model as model
from ckan.authz import Authorizer
from ckan.model.authz import PSEUDO_USER__VISITOR
from ckan.lib.base import *
from sqlalchemy import text


def _create_table(ddl):
    """Run one CREATE TABLE statement, tolerating an existing table."""
    try:
        connection = model.Session.connection()
        connection.execute(ddl)
    except Exception as e:
        # The only signal available here is the error text; anything that
        # is not "already exists" is a real failure.
        if "already exists" not in str(e):
            raise
    model.Session.commit()


def init_tables():
    """Create the ``package_stats`` and ``resource_stats`` tables.

    Each statement is committed separately; an "already exists" error is
    ignored, any other error is re-raised.
    """
    _create_table("""CREATE TABLE package_stats (
                       package_id varchar(60) primary key,
                       visits_recently integer,
                       visits_ever integer);""")
    _create_table("""CREATE TABLE resource_stats (
                       resource_id varchar(60) primary key,
                       visits_recently integer,
                       visits_ever integer);""")


def _upsert_visits(table, id_column, row_id, recently, ever):
    """Insert or update the visit counts of one row in a stats table.

    ``table`` and ``id_column`` are fixed identifiers supplied by this
    module (never user input); all values are sent as bound parameters.
    """
    connection = model.Session.connection()
    values = {'row_id': row_id,
              'recently': int(recently),
              'ever': int(ever)}
    found = connection.execute(
        text("SELECT count(%s) FROM %s WHERE %s = :row_id"
             % (id_column, table, id_column)),
        {'row_id': row_id}).fetchone()
    if found[0]:
        statement = ("UPDATE %s SET visits_recently = :recently, "
                     "visits_ever = :ever WHERE %s = :row_id"
                     % (table, id_column))
    else:
        statement = ("INSERT INTO %s (%s, visits_recently, visits_ever) "
                     "VALUES (:row_id, :recently, :ever)"
                     % (table, id_column))
    connection.execute(text(statement), values)


def update_resource_visits(resource_id, recently, ever):
    """Store the recent and all-time visit counts of a resource.

    Does not commit; the caller owns the transaction.
    """
    _upsert_visits('resource_stats', 'resource_id',
                   resource_id, recently, ever)


def get_resource_visits_for_url(url):
    """Return ``visits_ever`` for the resource with this URL.

    Returns the empty string when ``url`` is empty, when no row matches,
    or when the stored count is zero or NULL.
    """
    if not url:
        return ""
    connection = model.Session.connection()
    row = connection.execute(
        text("""SELECT visits_ever FROM resource_stats, resource
        WHERE resource_id = resource.id
        AND resource.url = :url"""),
        {'url': url}).fetchone()
    if row and row[0]:
        return row[0]
    return ""


def update_package_visits(package_id, recently, ever):
    """Store the recent and all-time visit counts of a package.

    Both counters are written on update as well as on insert (the UPDATE
    used to name a non-existent ``visits`` column and failed while being
    formatted).  Does not commit; the caller owns the transaction.
    """
    _upsert_visits('package_stats', 'package_id',
                   package_id, recently, ever)


def get_top_packages(limit=20):
    """Return ``(package, visits_recently)`` tuples, most visited first.

    At most ``limit`` rows are read from ``package_stats``; rows whose
    package is not returned by the visitor-authorized query are skipped,
    so the result may hold fewer than ``limit`` items.
    """
    items = []
    authorizer = Authorizer()
    q = authorizer.authorized_query(PSEUDO_USER__VISITOR,
                                    model.Package)
    connection = model.Session.connection()
    res = connection.execute("""SELECT package_id, visits_recently
                  FROM package_stats
                  ORDER BY visits_recently DESC;""").fetchmany(limit)
    for package_id, visits in res:
        # One query per row: first() already returns None when there is
        # no match, so a separate count() is unnecessary.
        package = q.filter(text("package.id = :package_id"))\
            .params(package_id=package_id).first()
        if package is None:
            continue
        items.append((package, visits))
    return items


# ckanext/googleanalytics/model.py
from ckan import model


def setup():
    """Create the ``package_downloads`` table if it does not exist."""
    connection = model.Session.connection()
    # "primary key" is two words in SQL; "primary_key" is not a constraint.
    connection.execute("""CREATE TABLE IF NOT EXISTS package_downloads (
      id integer primary key,
      package_id varchar(60),
      download_visits integer,
      views_visits integer);""")
genshi import HTML +from genshi.core import START, TEXT +from genshi.filters.transform import INSIDE +from pylons import config from ckan.plugins import implements, SingletonPlugin from ckan.plugins import IGenshiStreamFilter, IConfigurable, IRoutes from ckan.plugins import IConfigurer -from ckan import model from gasnippet import gacode +from commands import DEFAULT_RESOURCE_URL_TAG +import dbutil + +log = logging.getLogger('ckanext.googleanalytics') class GoogleAnalyticsException(Exception): @@ -34,16 +38,36 @@ class GoogleAnalyticsPlugin(SingletonPlugin): ga_id = self.config['googleanalytics.id'] code = HTML(gacode % ga_id) stream = stream | Transformer('head').append(code) + resource_url = config.get('googleanalytics.resource_prefix', + DEFAULT_RESOURCE_URL_TAG) # add download tracking link def js_attr(name, event): attrs = event[1][1] - link = '/downloads/%s' % urllib.quote(attrs.get('href')) + link = '%s%s' % (resource_url, + urllib.quote(attrs.get('href'))) js = "javascript: _gaq.push(['_trackPageview', '%s']);" % link return js + + # add some stats + def download_adder(stream): + download_html = ' (%s downloads)' + count = None + for mark, (kind, data, pos) in stream: + if mark and kind == START: + href = data[1].get('href') + count = dbutil.get_resource_visits_for_url(href) + if count and kind == TEXT and mark == INSIDE: + yield mark, (kind, + data + download_html % count, + pos) + else: + yield mark, (kind, data, pos) + + # perform the stream transform stream = stream | Transformer( '//div[@id="package"]//td/a')\ - .attr('onclick', js_attr) + .apply(download_adder).attr('onclick', js_attr) return stream diff --git a/setup.py b/setup.py index 972060c..53dd9f7 100644 --- a/setup.py +++ b/setup.py @@ -27,5 +27,8 @@ setup( [ckan.plugins] # Add plugins here, eg googleanalytics=ckanext.googleanalytics.plugin:GoogleAnalyticsPlugin + + [paste.paster_command] + loadanalytics = ckanext.googleanalytics.commands:LoadAnalytics """, )