big refactor

This commit is contained in:
Seb Bacon 2011-04-04 17:44:39 +01:00
parent 1f0f25a26f
commit b69d89bf18
7 changed files with 285 additions and 73 deletions

View File

@ -25,13 +25,28 @@ Installation
# the following *must* match profile name in GA dashboard
googleanalytics.profile_name = mydomain.com/
3. Wait a day or so for some stats to be recorded in Google
3. Look at some stats within CKAN
4. Import Google stats by running the following command from
``src/ckanext-googleanalytics``::
paster loadanalytics --config=../ckan/development.ini
(Of course, pointing config at your specific site config)
5. Look at some stats within CKAN
Once your GA account has gathered some data, you can see some basic
information about the most popular packages at:
http://localhost:5000/analytics/package/top
By default, the only data injected into the public-facing
website is on the package page, where the number of downloads is
displayed next to each resource.
6. Consider running the import command as a daily cron job, or
remember to run it by hand!
TODO
====

View File

@ -0,0 +1,132 @@
import logging
import datetime
from pylons import config
from ckan.lib.cli import CkanCommand
from gdata.analytics import client
import ckan.model as model
from sqlalchemy.orm import sessionmaker
import dbutil
log = logging.getLogger('ckanext.googleanalytics')
PACKAGE_URL = '/package/' # XXX get from routes...
DEFAULT_RESOURCE_URL_TAG = '/downloads/'
class LoadAnalytics(CkanCommand):
    """Parse data from Google Analytics API and store it in a local
    database
    """
    # The class docstring doubles as the command's help text (see
    # ``summary`` / ``usage`` below), so its wording is user-facing.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 0
    min_args = 0

    def command(self):
        """Run the import: load config, connect to GA, fetch and store stats."""
        self._load_config()
        self.resource_url_tag = config.get('googleanalytics.resource_prefix',
                                           DEFAULT_RESOURCE_URL_TAG)
        self.setup_ga_connection()
        # funny dance we need to do to make sure we've got a
        # configured session
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        self.parse_and_save()

    def parse_and_save(self):
        """Fetch the visit counts from GA and write them to the database."""
        packages_data = self.get_ga_data()
        self.save_ga_data(packages_data)
        log.info("Saved %s records from google", len(packages_data))

    def save_ga_data(self, packages_data):
        """Store visit counts for packages and resources.

        ``packages_data`` maps a GA page path to a dict with optional
        ``'recent'`` and ``'ever'`` counts (missing counts default to 0).
        Paths starting with ``self.resource_url_tag`` are looked up as
        resources by URL; every other path is looked up as a package by
        name.  Entries that cannot be matched are logged and skipped.
        """
        dbutil.init_tables()
        for identifier, visits in packages_data.items():
            recently = visits.get('recent', 0)
            ever = visits.get('ever', 0)

            if identifier.startswith(self.resource_url_tag):
                resource_url = identifier[len(self.resource_url_tag):]
                resource = model.Session.query(model.Resource)\
                    .autoflush(True)\
                    .filter_by(url=resource_url).first()
                if not resource:
                    log.warning("Couldn't find resource %s", resource_url)
                    continue
                dbutil.update_resource_visits(resource.id, recently, ever)
                log.info("Updated %s with %s visits", resource.id, visits)
                continue

            package_name = identifier[len(PACKAGE_URL):]
            if "/" in package_name:
                # a sub-page of a package (or something else entirely)
                log.warning("%s not a valid package name", package_name)
                continue
            item = model.Package.by_name(package_name)
            if not item:
                log.warning("Couldn't find package %s", package_name)
                continue
            dbutil.update_package_visits(item.id, recently, ever)
            log.info("Updated %s with %s visits", item.id, visits)
        model.Session.commit()

    def setup_ga_connection(self):
        """Log in to Google Analytics and locate the configured profile.

        Reads ``googleanalytics.username``, ``googleanalytics.password``
        and ``googleanalytics.profile_name`` from the config.  On success
        sets ``self.client`` and ``self.table_id``.

        Raises ``Exception`` if any of the settings is missing or if no
        account entry has a title equal to the profile name.
        """
        SOURCE_APP_NAME = "CKAN Google Analytics Plugin"
        username = config.get('googleanalytics.username')
        password = config.get('googleanalytics.password')
        profile_name = config.get('googleanalytics.profile_name')
        if not username or not password or not profile_name:
            raise Exception("No googleanalytics profile info in config")

        my_client = client.AnalyticsClient(source=SOURCE_APP_NAME)
        my_client.ClientLogin(username,
                              password,
                              SOURCE_APP_NAME)
        account_query = client.AccountFeedQuery({'max-results': '300'})
        feed = my_client.GetAccountFeed(account_query)

        table_id = None
        for entry in feed.entry:
            if entry.title.text == profile_name:
                table_id = entry.table_id.text
                break
        if not table_id:
            msg = "Couldn't find a profile called '%s'" % profile_name
            raise Exception(msg)

        self.table_id = table_id
        self.client = my_client

    def ga_query(self, query_filter=None, from_date=None):
        """Run one GA data-feed query and return the resulting feed.

        The query covers ``from_date`` up to today, is broken down by
        ``ga:pagePath``, restricted by ``query_filter`` and sorted by
        descending ``ga:newVisits``.  ``setup_ga_connection`` must have
        been called first, since ``self.client`` and ``self.table_id``
        are used.
        """
        to_date = datetime.datetime.now().strftime("%Y-%m-%d")
        metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews'
        query = client.DataFeedQuery({'ids': '%s' % self.table_id,
                                      'start-date': from_date,
                                      'end-date': to_date,
                                      'dimensions': 'ga:pagePath',
                                      'metrics': metrics,
                                      'sort': '-ga:newVisits',
                                      'filters': query_filter,
                                      'max-results': '10000'
                                      })
        return self.client.GetDataFeed(query)

    def get_ga_data(self, query_filter=None):
        """Return a dictionary like
        {'identifier': {'recent':3, 'ever':6}}

        ``identifier`` is the GA page path.  ``'recent'`` covers the last
        14 days and ``'ever'`` covers everything since 2005-01-01; both
        hold the ``ga:uniquePageviews`` metric.  Package pages and
        resource download paths are queried separately.

        ``query_filter`` is accepted for interface compatibility but is
        not currently used.
        """
        date_format = "%Y-%m-%d"
        now = datetime.datetime.now()
        recent_date = (now - datetime.timedelta(14)).strftime(date_format)
        # Format the floor date the same way as the recent date so that
        # both start dates reach the query as 'YYYY-MM-DD' strings.
        floor_date = datetime.date(2005, 1, 1).strftime(date_format)

        packages = {}
        queries = ['ga:pagePath=~^%s' % PACKAGE_URL,
                   'ga:pagePath=~^%s' % self.resource_url_tag]
        dates = {'recent': recent_date, 'ever': floor_date}
        for date_name, date in dates.items():
            for query in queries:
                feed = self.ga_query(query_filter=query,
                                     from_date=date)
                for entry in feed.entry:
                    for dim in entry.dimension:
                        if dim.name == "ga:pagePath":
                            package = dim.value
                            count = entry.get_metric(
                                'ga:uniquePageviews').value or 0
                            packages.setdefault(
                                package, {})[date_name] = count
        return packages

View File

@ -1,20 +1,15 @@
from datetime import datetime
from datetime import timedelta
from pylons import config, request
from beaker import cache
import logging
from ckan.lib.base import *
from ckan.authz import Authorizer
from gdata.analytics import client
from ckan import model
from ckan.model.authz import PSEUDO_USER__VISITOR
from ckanext.googleanalytics import GoogleAnalyticsException
import dbutil
PACKAGE_URL = '/package/' # XXX get from routes...
log = logging.getLogger('ckanext.googleanalytics')
class GAController(BaseController):
def view(self):
    """Populate ``c.top_packages`` and render ``index.html``."""
    # get package objects corresponding to popular GA content
    self.parse_ga_data()
    top_packages = self.get_top_packages()
    c.top_packages = top_packages
    return render('index.html')
@ -24,61 +19,5 @@ class GAController(BaseController):
return "analyticscontroller"
def get_top_packages(self):
    """Return the top packages as reported by ``dbutil.get_top_packages``.

    The ranking is read from the locally stored statistics, so no
    request to Google Analytics is made here.
    """
    return dbutil.get_top_packages()
@cache.cache(expire=3600)
def _get_ga_data(self):
    """Return ``(page_path, visits)`` tuples for package pages from GA.

    Covers the last 14 days, at most 50 rows, sorted by descending
    ``ga:newVisits``.  Returns an empty list when the GA username,
    password or profile name is not configured; raises
    ``GoogleAnalyticsException`` when the profile cannot be found.
    """
    app_name = "CKAN Google Analytics Plugin"
    username = config.get('googleanalytics.username')
    password = config.get('googleanalytics.password')
    profile_name = config.get('googleanalytics.profile_name')
    if not (username and password and profile_name):
        return []

    ga_client = client.AnalyticsClient(source=app_name)
    ga_client.ClientLogin(username, password, app_name)
    accounts = ga_client.GetAccountFeed(
        client.AccountFeedQuery({'max-results': '300'}))

    # pick the table id of the first account whose title is the profile
    for account in accounts.entry:
        if account.title.text == profile_name:
            table_id = account.table_id.text
            break
    else:
        table_id = None
    if not table_id:
        raise GoogleAnalyticsException(
            "Couldn't find a profile called '%s'" % profile_name)

    today = datetime.now()
    two_weeks_ago = today - timedelta(14)
    params = {
        'ids': '%s' % table_id,
        'start-date': two_weeks_ago.strftime("%Y-%m-%d"),
        'end-date': today.strftime("%Y-%m-%d"),
        'dimensions': 'ga:pagePath',
        'metrics': 'ga:visits,ga:visitors,ga:newVisits',
        'sort': '-ga:newVisits',
        'filters': 'ga:pagePath=~^%s' % PACKAGE_URL,
        'max-results': '50',
    }
    data_feed = ga_client.GetDataFeed(client.DataFeedQuery(params))

    results = []
    for row in data_feed.entry:
        for dimension in row.dimension:
            if dimension.name == "ga:pagePath":
                visits = row.get_metric('ga:visits').value
                results.append((dimension.value, visits))
    return results

View File

@ -0,0 +1,88 @@
import ckan.model as model
from ckan.authz import Authorizer
from ckan.model.authz import PSEUDO_USER__VISITOR
from ckan.lib.base import *
def init_tables():
    """Create the ``package_stats`` and ``resource_stats`` tables.

    Each table is created in its own transaction.  An error whose
    message contains "already exists" is treated as "nothing to do";
    any other error is re-raised.
    """
    create_statements = (
        """CREATE TABLE package_stats (
             package_id varchar(60) primary key,
             visits_recently integer,
             visits_ever integer);""",
        """CREATE TABLE resource_stats (
             resource_id varchar(60) primary key,
             visits_recently integer,
             visits_ever integer);""",
    )
    for statement in create_statements:
        try:
            connection = model.Session.connection()
            connection.execute(statement)
        except Exception as e:
            # str(e) rather than e.args[0]: the latter fails on
            # exceptions with empty or non-string args.
            if "already exists" not in str(e):
                raise
        # Commit after every attempt so a failed CREATE does not leave
        # the session's transaction open for the next statement.
        model.Session.commit()
def update_resource_visits(resource_id, recently, ever):
    """Insert or update the visit counts stored for one resource.

    :param resource_id: id of the resource (``resource_stats`` key)
    :param recently: recent visit count; anything ``int()`` accepts
    :param ever: all-time visit count; anything ``int()`` accepts
    :raises ValueError, TypeError: if a count is not integer-like

    The caller is responsible for committing the session.
    """
    # Local import keeps this function self-contained; SQLAlchemy is
    # already a dependency through ckan.model.
    from sqlalchemy import text

    # The counts originate from an external feed, so they are coerced
    # to int and passed as bound parameters rather than interpolated.
    recently = int(recently)
    ever = int(ever)

    connection = model.Session.connection()
    count = connection.execute(
        text("""SELECT count(resource_id) FROM resource_stats
                WHERE resource_id = :resource_id"""),
        {'resource_id': resource_id}).fetchone()
    values = {'resource_id': resource_id,
              'recently': recently,
              'ever': ever}
    if count[0]:
        connection.execute(
            text("""UPDATE resource_stats
                    SET visits_recently = :recently,
                        visits_ever = :ever
                    WHERE resource_id = :resource_id"""),
            values)
    else:
        connection.execute(
            text("""INSERT INTO resource_stats
                    (resource_id, visits_recently, visits_ever) VALUES
                    (:resource_id, :recently, :ever)"""),
            values)
def get_resource_visits_for_url(url):
    """Return the all-time visit count for the resource with this URL.

    Returns an empty string when no matching row exists or when the
    stored count is zero or NULL.
    """
    # Local import keeps this function self-contained; SQLAlchemy is
    # already a dependency through ckan.model.
    from sqlalchemy import text

    connection = model.Session.connection()
    # The URL is taken from page content, so it must be passed as a
    # bound parameter and never interpolated into the SQL string.
    row = connection.execute(
        text("""SELECT visits_ever FROM resource_stats, resource
                WHERE resource_id = resource.id
                AND resource.url = :url"""),
        {'url': url}).fetchone()
    if row and row[0]:
        return row[0]
    return ""
def update_package_visits(package_id, recently, ever):
    """Insert or update the visit counts stored for one package.

    :param package_id: id of the package (``package_stats`` key)
    :param recently: recent visit count; anything ``int()`` accepts
    :param ever: all-time visit count; anything ``int()`` accepts
    :raises ValueError, TypeError: if a count is not integer-like

    The caller is responsible for committing the session.
    """
    # Local import keeps this function self-contained; SQLAlchemy is
    # already a dependency through ckan.model.
    from sqlalchemy import text

    # The counts originate from an external feed, so they are coerced
    # to int and passed as bound parameters rather than interpolated.
    recently = int(recently)
    ever = int(ever)

    connection = model.Session.connection()
    count = connection.execute(
        text("""SELECT count(package_id) FROM package_stats
                WHERE package_id = :package_id"""),
        {'package_id': package_id}).fetchone()
    values = {'package_id': package_id,
              'recently': recently,
              'ever': ever}
    if count[0]:
        # Both counters are updated, matching the columns created by
        # init_tables (visits_recently, visits_ever).
        connection.execute(
            text("""UPDATE package_stats
                    SET visits_recently = :recently,
                        visits_ever = :ever
                    WHERE package_id = :package_id"""),
            values)
    else:
        connection.execute(
            text("""INSERT INTO package_stats
                    (package_id, visits_recently, visits_ever) VALUES
                    (:package_id, :recently, :ever)"""),
            values)
def get_top_packages(limit=20):
    """Return ``(package, recent_visits)`` tuples, most visited first.

    At most ``limit`` rows are read from ``package_stats``.  Packages
    that the anonymous visitor pseudo-user may not see (or that no
    longer exist) are dropped, so fewer than ``limit`` items may be
    returned.
    """
    items = []
    authorizer = Authorizer()
    q = authorizer.authorized_query(PSEUDO_USER__VISITOR,
                                    model.Package)
    connection = model.Session.connection()
    res = connection.execute("""SELECT package_id, visits_recently
           FROM package_stats
           ORDER BY visits_recently DESC;""").fetchmany(limit)
    for package_id, visits in res:
        # One query per package: first() returns None when the package
        # is not visible, so a separate count() is unnecessary.
        package = q.filter(model.Package.id == package_id).first()
        if package is None:
            continue
        items.append((package, visits))
    return items

View File

@ -0,0 +1,11 @@
from ckan import model
def setup():
    """Create the ``package_downloads`` table if it does not exist."""
    connection = model.Session.connection()
    # "primary key" is two words in SQL; "primary_key" is either a
    # syntax error or silently yields a column without a key,
    # depending on the database.
    connection.execute("""CREATE TABLE IF NOT EXISTS package_downloads (
           id integer primary key,
           package_id varchar(60),
           download_visits integer,
           views_visits integer);""")

View File

@ -1,15 +1,19 @@
import logging
import urllib
log = logging.getLogger(__name__)
import os
from genshi.filters import Transformer
from genshi import HTML
from genshi.core import START, TEXT
from genshi.filters.transform import INSIDE
from pylons import config
from ckan.plugins import implements, SingletonPlugin
from ckan.plugins import IGenshiStreamFilter, IConfigurable, IRoutes
from ckan.plugins import IConfigurer
from ckan import model
from gasnippet import gacode
from commands import DEFAULT_RESOURCE_URL_TAG
import dbutil
log = logging.getLogger('ckanext.googleanalytics')
class GoogleAnalyticsException(Exception):
@ -34,16 +38,36 @@ class GoogleAnalyticsPlugin(SingletonPlugin):
ga_id = self.config['googleanalytics.id']
code = HTML(gacode % ga_id)
stream = stream | Transformer('head').append(code)
resource_url = config.get('googleanalytics.resource_prefix',
DEFAULT_RESOURCE_URL_TAG)
# add download tracking link
def js_attr(name, event):
    """Build the ``onclick`` JavaScript that records a download in GA.

    ``event`` is the stream event for the link; its attributes are
    read from ``event[1][1]``.  The tracked page path is the
    configured resource prefix followed by the URL-quoted ``href``.
    """
    attrs = event[1][1]
    link = '%s%s' % (resource_url,
                     urllib.quote(attrs.get('href')))
    js = "javascript: _gaq.push(['_trackPageview', '%s']);" % link
    return js
# add some stats
def download_adder(stream):
    """Append a download count to the text of each matched link.

    On every marked START event the link's ``href`` is looked up via
    ``dbutil.get_resource_visits_for_url``; while that count is truthy,
    text events inside the match get the count markup appended.  All
    other events pass through unchanged.
    """
    # NOTE(review): the markup is appended to a TEXT event's data;
    # confirm Genshi does not escape it on serialization.
    download_html = ' <span class="downloads-count">(%s downloads)</span>'
    count = None
    for mark, (kind, data, pos) in stream:
        if mark and kind == START:
            href = data[1].get('href')
            count = dbutil.get_resource_visits_for_url(href)
        if count and kind == TEXT and mark == INSIDE:
            yield mark, (kind,
                         data + download_html % count,
                         pos)
        else:
            yield mark, (kind, data, pos)
# perform the stream transform
stream = stream | Transformer(
'//div[@id="package"]//td/a')\
.attr('onclick', js_attr)
.apply(download_adder).attr('onclick', js_attr)
return stream

View File

@ -27,5 +27,8 @@ setup(
[ckan.plugins]
# Add plugins here, eg
googleanalytics=ckanext.googleanalytics.plugin:GoogleAnalyticsPlugin
[paste.paster_command]
loadanalytics = ckanext.googleanalytics.commands:LoadAnalytics
""",
)