diff --git a/README.rst b/README.rst index 9395c7a..3585375 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,7 @@ CKAN Google Analytics Extension **Status:** Production -**CKAN Version:** 1.5.* +**CKAN Version:** >= 1.5.* Overview @@ -95,7 +95,7 @@ Installation 7. Import Google stats by running the following command from ``src/ckanext-googleanalytics``:: - paster loadanalytics token.dat 2012-10-10 --config=../ckan/development.ini + paster loadanalytics token.dat --config=../ckan/development.ini (Of course, pointing config at your specific site config and token.dat at the oauth file generated from the authorization step) diff --git a/ckanext/googleanalytics/commands.py b/ckanext/googleanalytics/commands.py index 7d23c61..4bb75b7 100644 --- a/ckanext/googleanalytics/commands.py +++ b/ckanext/googleanalytics/commands.py @@ -15,7 +15,7 @@ log = logging.getLogger('ckanext.googleanalytics') PACKAGE_URL = '/dataset/' # XXX get from routes... DEFAULT_RESOURCE_URL_TAG = '/downloads/' -RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/[a-z0-9-_]+') +RESOURCE_URL_REGEX = re.compile('/dataset/[a-z0-9-_]+/resource/([a-z0-9-_]+)') DATASET_EDIT_REGEX = re.compile('/dataset/edit/([a-z0-9-_]+)') class GetAuthToken(CkanCommand): @@ -262,12 +262,14 @@ class LoadAnalytics(CkanCommand): raise Exception('Unable to create a service') self.profile_id = get_profile_id(self.service) - if len(self.args): - if len(self.args) > 1 and self.args[1].lower() != 'internal': + if len(self.args) > 1: + if len(self.args) > 2 and self.args[1].lower() != 'internal': raise Exception('Illegal argument %s' % self.args[1]) self.bulk_import() else: - packages_data = self.get_ga_data() + query = 'ga:pagePath=~%s,ga:pagePath=~%s' % \ + (PACKAGE_URL, self.resource_url_tag) + packages_data = self.get_ga_data(query_filter=query) self.save_ga_data(packages_data) log.info("Saved %s records from google" % len(packages_data)) @@ -277,10 +279,11 @@ class LoadAnalytics(CkanCommand): for identifier, visits in packages_data.items(): recently = visits.get('recent', 0) ever = visits.get('ever', 0) - if identifier.startswith(self.resource_url_tag): + matches = RESOURCE_URL_REGEX.match(identifier) + if matches: resource_url = identifier[len(self.resource_url_tag):] resource = model.Session.query(model.Resource).autoflush(True)\ - .filter_by(url=resource_url).first() + .filter_by(id=matches.group(1)).first() if not resource: log.warning("Couldn't find resource %s" % resource_url) continue @@ -307,26 +310,8 @@ class LoadAnalytics(CkanCommand): username = self.CONFIG.get('googleanalytics.username') password = self.CONFIG.get('googleanalytics.password') ga_id = self.CONFIG.get('googleanalytics.id') - if not username or not password or not ga_id: + if not ga_id: raise Exception("No googleanalytics profile info in config") - if self.TEST_HOST: - my_client = client.AnalyticsClient(source=SOURCE_APP_NAME, - http_client=self.TEST_HOST) - else: - my_client = client.AnalyticsClient(source=SOURCE_APP_NAME) - #my_client.ClientLogin(username, password, SOURCE_APP_NAME) - #account_query = client.AccountFeedQuery({'max-results': '300'}) - #feed = my_client.GetAccountFeed(account_query) - #table_id = None - #for entry in feed.entry: - # if entry.get_property("ga:webPropertyId").value == ga_id: - # table_id = entry.table_id.text - # break - #if not table_id: - # msg = "Couldn't find a profile with id '%s'" % ga_id - # raise Exception(msg) - #self.table_id = table_id - #self.client = my_client def ga_query(self, query_filter=None, from_date=None, to_date=None, start_index=1, max_results=10000, metrics=None, sort=None): @@ -335,10 +320,14 @@ class LoadAnalytics(CkanCommand): if not to_date: now = datetime.datetime.now() to_date = now.strftime("%Y-%m-%d") + if isinstance(from_date,datetime.date): + from_date = from_date.strftime("%Y-%m-%d") if not metrics: metrics = 'ga:visits,ga:visitors,ga:newVisits,ga:uniquePageviews' if not sort: - sort = '-ga:newVisits' + sort = '-ga:uniquePageviews' + + print '%s -> %s' % (from_date, to_date) results = self.service.data().ga().get(ids='ga:' + self.profile_id, start_date=from_date, @@ -365,17 +354,26 @@ class LoadAnalytics(CkanCommand): recent_date = recent_date.strftime("%Y-%m-%d") floor_date = datetime.date(2005, 1, 1) packages = {} - queries = ['ga:pagePath=~^%s' % PACKAGE_URL, - 'ga:pagePath=~^%s' % self.resource_url_tag] + queries = ['ga:pagePath=~%s' % PACKAGE_URL] #, + #'ga:pagePath=~%s' % self.resource_url_tag] dates = {'recent': recent_date, 'ever': floor_date} - for date_name, date in dates.items(): + for date_name, date in dates.iteritems(): for query in queries: results = self.ga_query(query_filter=query, - from_date=date) - for result in results.get('rows'): - package = result[0] - package = '/' + '/'.join(package.split('/')[2:]) - count = result[1] - packages.setdefault(package, {})[date_name] = count + metrics='ga:uniquePageviews', + from_date=date) + if 'rows' in results: + for result in results.get('rows'): + package = result[0] + if not package.startswith(PACKAGE_URL): + package = '/' + '/'.join(package.split('/')[2:]) + + count = result[1] + # Make sure we add the different representations of the same + # dataset /mysite.com & /www.mysite.com ... + val = 0 + if package in packages and date_name in packages[package]: + val += packages[package][date_name] + packages.setdefault(package, {})[date_name] = int(count) + val return packages