From db13b0b9cde7791073e8db5da6c54ef2a41addad Mon Sep 17 00:00:00 2001 From: Toby Date: Thu, 5 Apr 2012 18:12:25 +0100 Subject: [PATCH 1/2] changes for new internal tracking --- ckanext/googleanalytics/commands.py | 169 +++++++++++++++++++++++++--- ckanext/googleanalytics/plugin.py | 10 +- 2 files changed, 161 insertions(+), 18 deletions(-) diff --git a/ckanext/googleanalytics/commands.py b/ckanext/googleanalytics/commands.py index 2ee0551..ca0b88f 100644 --- a/ckanext/googleanalytics/commands.py +++ b/ckanext/googleanalytics/commands.py @@ -1,5 +1,7 @@ import logging import datetime +import time + from pylons import config as pylonsconfig from ckan.lib.cli import CkanCommand from gdata.analytics import client @@ -31,12 +33,17 @@ class InitDB(CkanCommand): class LoadAnalytics(CkanCommand): - """Parse data from Google Analytics API and store it in a local - database + """Parse data from Google Analytics API and store it + in a local database + + Options: + internal [date] use ckan internal tracking tables + date specifies start date for retrieving + analytics data YYYY-MM-DD format """ summary = __doc__.split('\n')[0] usage = __doc__ - max_args = 0 + max_args = 2 min_args = 0 TEST_HOST = None CONFIG = None @@ -56,11 +63,139 @@ class LoadAnalytics(CkanCommand): model.Session.configure(bind=model.meta.engine) self.parse_and_save() + def internal_save(self, packages_data, summary_date): + engine = model.meta.engine + # clear out existing data before adding new + sql = '''DELETE FROM tracking_summary + WHERE date='%s'; ''' % summary_date + engine.execute(sql) + + for url, count in packages_data.iteritems(): + if url.startswith(DEFAULT_RESOURCE_URL_TAG): + tracking_type = 'resource' + # remove the leading identifier + url = url[len(DEFAULT_RESOURCE_URL_TAG):] + else: + tracking_type = 'page' + + sql = '''INSERT INTO tracking_summary + (url, count, date, tracking_type) + VALUES (%s, %s, %s, %s);''' + engine.execute(sql, url, count, summary_date, tracking_type) + + # get ids for dataset urls + sql = '''UPDATE tracking_summary t + SET package_id = COALESCE( + (SELECT id FROM package p WHERE t.url = %s || p.name) + ,'~~not~found~~') + WHERE t.package_id IS NULL AND tracking_type = 'page';''' + engine.execute(sql, PACKAGE_URL) + + # update summary totals for resources + sql = '''UPDATE tracking_summary t1 + SET running_total = ( + SELECT sum(count) + FROM tracking_summary t2 + WHERE t1.url = t2.url AND t2.date <= t1.date) + WHERE t1.running_total = 0 AND tracking_type = 'resource';''' + engine.execute(sql) + + # update summary totals for pages + sql = '''UPDATE tracking_summary t1 + SET running_total = ( + SELECT sum(count) + FROM tracking_summary t2 + WHERE t1.package_id = t2.package_id + AND t2.date <= t1.date) + WHERE t1.running_total = 0 AND tracking_type = 'page' + AND t1.package_id IS NOT NULL + AND t1.package_id != '~~not~found~~';''' + engine.execute(sql) + + def bulk_import(self): + if len(self.args) == 2: + # Get summeries from specified date + start_date = datetime.datetime.strptime(self.args[1], '%Y-%m-%d') + else: + # No date given. See when we last have data for and get data + # from 2 days before then in case new data is available. + # If no date here then use 2010-01-01 as the start date + engine = model.meta.engine + sql = '''SELECT date from tracking_summary + ORDER BY date DESC LIMIT 1;''' + result = engine.execute(sql).fetchall() + if result: + start_date = result[0]['date'] + start_date += datetime.timedelta(-2) + # convert date to datetime + combine = datetime.datetime.combine + start_date = combine(start_date, datetime.time(0)) + else: + start_date = datetime.datetime(2011, 1, 1) + end_date = datetime.datetime.now() + while start_date < end_date: + stop_date = start_date + datetime.timedelta(1) + packages_data = self.get_ga_data_new(start_date=start_date, + end_date=stop_date) + self.internal_save(packages_data, start_date) + # sleep to rate limit requests + time.sleep(0.25) + start_date = stop_date + log.info('%s recieved %s' % (len(packages_data), start_date)) + print '%s recieved %s' % (len(packages_data), start_date) + + def get_ga_data_new(self, start_date=None, end_date=None): + """Get raw data from Google Analtyics for packages and + resources. + + Returns a dictionary like:: + + {'identifier': 3} + """ + start_date = start_date.strftime("%Y-%m-%d") + end_date = end_date.strftime("%Y-%m-%d") + + packages = {} + query = 'ga:pagePath=~^%s,ga:pagePath=~^%s' % \ + (PACKAGE_URL, self.resource_url_tag) + metrics = 'ga:uniquePageviews' + sort = '-ga:uniquePageviews' + + start_index = 1 + max_results = 10000 + # data retrival is chunked + while True: + feed = self.ga_query(query_filter=query, + from_date=start_date, + start_index=start_index, + max_results=max_results, + metrics=metrics, + sort=sort, + to_date=end_date) + for entry in feed.entry: + for dim in entry.dimension: + if dim.name == "ga:pagePath": + package = dim.value + count = entry.get_metric( + 'ga:uniquePageviews').value or 0 + packages[package] = int(count) + if len(feed.entry) < max_results: + break + start_index += max_results + # rate limiting + time.sleep(0.25) + return packages + def parse_and_save(self): """Grab raw data from Google Analytics and save to the database""" - packages_data = self.get_ga_data() - self.save_ga_data(packages_data) - log.info("Saved %s records from google" % len(packages_data)) + if len(self.args): + if self.args[0].lower() != 'internal': + raise Exception('Illegal argument %s' % self.args[0]) + self.bulk_import() + else: + packages_data = self.get_ga_data() + self.save_ga_data(packages_data) + log.info("Saved %s records from google" % len(packages_data)) def save_ga_data(self, packages_data): """Save tuples of packages_data to the database @@ -119,25 +254,31 @@ class LoadAnalytics(CkanCommand): self.table_id = table_id self.client = my_client - def ga_query(self, query_filter=None, from_date=None): - """Executie a query against Google Analytics + def ga_query(self, query_filter=None, from_date=None, to_date=None, + start_index=1, max_results=10000, metrics=None, sort=None): + """Execute a query against Google Analytics """ - now = datetime.datetime.now() - to_date = now.strftime("%Y-%m-%d") - metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews' + if not to_date: + now = datetime.datetime.now() + to_date = now.strftime("%Y-%m-%d") + if not metrics: + metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews' + if not sort: + sort = '-ga:newVisits' query = client.DataFeedQuery({'ids': '%s' % self.table_id, 'start-date': from_date, 'end-date': to_date, 'dimensions': 'ga:pagePath', 'metrics': metrics, - 'sort': '-ga:newVisits', + 'sort': sort, + 'start-index': start_index, 'filters': query_filter, - 'max-results': '10000' + 'max-results': max_results }) feed = self.client.GetDataFeed(query) return feed - def get_ga_data(self, query_filter=None): + def get_ga_data(self, query_filter=None, start_date=None, end_date=None): """Get raw data from Google Analtyics for packages and resources, and for both the last two weeks and ever. diff --git a/ckanext/googleanalytics/plugin.py b/ckanext/googleanalytics/plugin.py index 10827b4..98dcb42 100644 --- a/ckanext/googleanalytics/plugin.py +++ b/ckanext/googleanalytics/plugin.py @@ -43,15 +43,17 @@ class GoogleAnalyticsPlugin(SingletonPlugin): DEFAULT_RESOURCE_URL_TAG) routes = request.environ.get('pylons.routes_dict') - if (routes.get('controller') == 'package' and - routes.get('action') in ['read', 'resource_read']): + action = routes.get('action') + controller = routes.get('controller') + if (controller == 'package' and \ + action in ['search', 'read', 'resource_read']) or \ + (controller == 'group' and action == 'read'): log.info("Tracking of resource downloads") show_downloads = ( asbool(config.get('googleanalytics.show_downloads', True)) and - routes.get('action') == 'read' + action == 'read' and controller == 'package' ) - # add download tracking link def js_attr(name, event): attrs = event[1][1] From 143dafab450a4b48df8917d9dac9d48a252e89fc Mon Sep 17 00:00:00 2001 From: Toby Date: Fri, 20 Apr 2012 14:46:00 +0100 Subject: [PATCH 2/2] summary_tracking renames and fixes --- ckanext/googleanalytics/commands.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/ckanext/googleanalytics/commands.py b/ckanext/googleanalytics/commands.py index ca0b88f..5933db9 100644 --- a/ckanext/googleanalytics/commands.py +++ b/ckanext/googleanalytics/commands.py @@ -67,7 +67,7 @@ class LoadAnalytics(CkanCommand): engine = model.meta.engine # clear out existing data before adding new sql = '''DELETE FROM tracking_summary - WHERE date='%s'; ''' % summary_date + WHERE tracking_date='%s'; ''' % summary_date engine.execute(sql) for url, count in packages_data.iteritems(): @@ -79,7 +79,7 @@ class LoadAnalytics(CkanCommand): tracking_type = 'page' sql = '''INSERT INTO tracking_summary - (url, count, date, tracking_type) + (url, count, tracking_date, tracking_type) VALUES (%s, %s, %s, %s);''' engine.execute(sql, url, count, summary_date, tracking_type) @@ -96,7 +96,15 @@ class LoadAnalytics(CkanCommand): SET running_total = ( SELECT sum(count) FROM tracking_summary t2 - WHERE t1.url = t2.url AND t2.date <= t1.date) + WHERE t1.url = t2.url + AND t2.tracking_date <= t1.tracking_date + ) + t1.count + ,recent_views = ( + SELECT sum(count) + FROM tracking_summary t2 + WHERE t1.url = t2.url + AND t2.tracking_date <= t1.tracking_date AND t2.tracking_date >= t1.tracking_date - 14 + ) + t1.count WHERE t1.running_total = 0 AND tracking_type = 'resource';''' engine.execute(sql) @@ -106,7 +114,14 @@ class LoadAnalytics(CkanCommand): SELECT sum(count) FROM tracking_summary t2 WHERE t1.package_id = t2.package_id - AND t2.date <= t1.date) + AND t2.tracking_date <= t1.tracking_date + ) + t1.count + ,recent_views = ( + SELECT sum(count) + FROM tracking_summary t2 + WHERE t1.package_id = t2.package_id + AND t2.tracking_date <= t1.tracking_date AND t2.tracking_date >= t1.tracking_date - 14 + ) + t1.count WHERE t1.running_total = 0 AND tracking_type = 'page' AND t1.package_id IS NOT NULL AND t1.package_id != '~~not~found~~';''' @@ -121,11 +136,11 @@ class LoadAnalytics(CkanCommand): # from 2 days before then in case new data is available. # If no date here then use 2010-01-01 as the start date engine = model.meta.engine - sql = '''SELECT date from tracking_summary - ORDER BY date DESC LIMIT 1;''' + sql = '''SELECT tracking_date from tracking_summary + ORDER BY tracking_date DESC LIMIT 1;''' result = engine.execute(sql).fetchall() if result: - start_date = result[0]['date'] + start_date = result[0]['tracking_date'] start_date += datetime.timedelta(-2) # convert date to datetime combine = datetime.datetime.combine