Fixes to make sure that data collected for reports is working

Make sure we handle the changes to the API to ensure we collect the
dataset and resource information
This commit is contained in:
Ross Jones 2012-10-18 15:11:01 +01:00
parent 2283571562
commit 459ff608a6
2 changed files with 35 additions and 37 deletions

View File

@ -3,7 +3,7 @@ CKAN Google Analytics Extension
**Status:** Production
**CKAN Version:** 1.5.*
**CKAN Version:** >= 1.5.*
Overview
@ -95,7 +95,7 @@ Installation
7. Import Google stats by running the following command from
``src/ckanext-googleanalytics``::
paster loadanalytics token.dat 2012-10-10 --config=../ckan/development.ini
paster loadanalytics token.dat --config=../ckan/development.ini
(Of course, pointing config at your specific site config and token.dat at the
oauth file generated from the authorization step)

View File

@ -15,7 +15,7 @@ log = logging.getLogger('ckanext.googleanalytics')
PACKAGE_URL = '/dataset/' # XXX get from routes...
DEFAULT_RESOURCE_URL_TAG = '/downloads/'
RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/[a-z0-9-_]+')
RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/([a-z0-9-_]+)')
DATASET_EDIT_REGEX = re.compile('/dataset/edit/([a-z0-9-_]+)')
class GetAuthToken(CkanCommand):
@ -262,12 +262,14 @@ class LoadAnalytics(CkanCommand):
raise Exception('Unable to create a service')
self.profile_id = get_profile_id(self.service)
if len(self.args):
if len(self.args) > 1 and self.args[1].lower() != 'internal':
if len(self.args) > 1:
if len(self.args) > 2 and self.args[1].lower() != 'internal':
raise Exception('Illegal argument %s' % self.args[1])
self.bulk_import()
else:
packages_data = self.get_ga_data()
query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \
(PACKAGE_URL, self.resource_url_tag)
packages_data = self.get_ga_data(query_filter=query)
self.save_ga_data(packages_data)
log.info("Saved %s records from google" % len(packages_data))
@ -277,10 +279,11 @@ class LoadAnalytics(CkanCommand):
for identifier, visits in packages_data.items():
recently = visits.get('recent', 0)
ever = visits.get('ever', 0)
if identifier.startswith(self.resource_url_tag):
matches = RESOURCE_URL_REGEX.match(identifier)
if matches:
resource_url = identifier[len(self.resource_url_tag):]
resource = model.Session.query(model.Resource).autoflush(True)\
.filter_by(url=resource_url).first()
.filter_by(id=matches.group(1)).first()
if not resource:
log.warning("Couldn't find resource %s" % resource_url)
continue
@ -307,26 +310,8 @@ class LoadAnalytics(CkanCommand):
username = self.CONFIG.get('googleanalytics.username')
password = self.CONFIG.get('googleanalytics.password')
ga_id = self.CONFIG.get('googleanalytics.id')
if not username or not password or not ga_id:
if not ga_id:
raise Exception("No googleanalytics profile info in config")
if self.TEST_HOST:
my_client = client.AnalyticsClient(source=SOURCE_APP_NAME,
http_client=self.TEST_HOST)
else:
my_client = client.AnalyticsClient(source=SOURCE_APP_NAME)
#my_client.ClientLogin(username, password, SOURCE_APP_NAME)
#account_query = client.AccountFeedQuery({'max-results': '300'})
#feed = my_client.GetAccountFeed(account_query)
#table_id = None
#for entry in feed.entry:
# if entry.get_property("ga:webPropertyId").value == ga_id:
# table_id = entry.table_id.text
# break
#if not table_id:
# msg = "Couldn't find a profile with id '%s'" % ga_id
# raise Exception(msg)
#self.table_id = table_id
#self.client = my_client
def ga_query(self, query_filter=None, from_date=None, to_date=None,
start_index=1, max_results=10000, metrics=None, sort=None):
@ -335,10 +320,14 @@ class LoadAnalytics(CkanCommand):
if not to_date:
now = datetime.datetime.now()
to_date = now.strftime("%Y-%m-%d")
if isinstance(from_date,datetime.date):
from_date = from_date.strftime("%Y-%m-%d")
if not metrics:
metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews'
if not sort:
sort = '-ga:newVisits'
sort = '-ga:uniquePageviews'
print '%s -> %s' % (from_date, to_date)
results = self.service.data().ga().get(ids='ga:' + self.profile_id,
start_date=from_date,
@ -365,17 +354,26 @@ class LoadAnalytics(CkanCommand):
recent_date = recent_date.strftime("%Y-%m-%d")
floor_date = datetime.date(2005, 1, 1)
packages = {}
queries = ['ga:pagePath=~^%s' % PACKAGE_URL,
'ga:pagePath=~^%s' % self.resource_url_tag]
queries = ['ga:pagePath=~%s' % PACKAGE_URL] #,
#'ga:pagePath=~%s' % self.resource_url_tag]
dates = {'recent': recent_date, 'ever': floor_date}
for date_name, date in dates.items():
for date_name, date in dates.iteritems():
for query in queries:
results = self.ga_query(query_filter=query,
from_date=date)
for result in results.get('rows'):
package = result[0]
package = '/' + '/'.join(package.split('/')[2:])
count = result[1]
packages.setdefault(package, {})[date_name] = count
metrics='ga:uniquePageviews',
from_date=date)
if 'rows' in results:
for result in results.get('rows'):
package = result[0]
if not package.startswith(PACKAGE_URL):
package = '/' + '/'.join(package.split('/')[2:])
count = result[1]
# Make sure we add the different representations of the same
# dataset /mysite.com & /www.mysite.com ...
val = 0
if package in packages and date_name in packages[package]:
val += packages[package][date_name]
packages.setdefault(package, {})[date_name] = int(count) + val
return packages