From af695826e8d22666ce66008cb2ad82b2faa04f68 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Mon, 18 Nov 2013 16:32:23 -0500 Subject: [PATCH 1/5] add functionality to set CSW keywords from CKAN tag counts --- bin/ckan_pycsw.py | 33 ++++++++++++++++++++++++++++++--- ckanext/spatial/commands/csw.py | 10 ++++++++-- doc/csw.rst | 12 ++++++++++++ pip-requirements.txt | 1 + 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/bin/ckan_pycsw.py b/bin/ckan_pycsw.py index 642dbfa..6b72008 100644 --- a/bin/ckan_pycsw.py +++ b/bin/ckan_pycsw.py @@ -33,6 +33,26 @@ def setup_db(pycsw_config): extra_columns=ckan_columns) +def set_keywords(pycsw_config_file, pycsw_config, ckan_url, limit=20): + """set pycsw service metadata keywords from top 10 CKAN tags""" + + log.info('Fetching tags from %s', ckan_url) + url = ckan_url + 'api/tag_counts' + response = requests.get(url) + tags = response.json() + + log.info('Deriving top %d tags', limit) + # uniquify and sort by top limit + tags_unique = [list(x) for x in set(tuple(x) for x in tags)] + tags_sorted = sorted(tags_unique, key=lambda x: x[1], reverse=1)[0:limit] + keywords = ','.join('%s' % tn[0] for tn in tags_sorted) + + log.info('Setting tags in pycsw configuration file %s', pycsw_config_file) + pycsw_config.set('metadata:main', 'identification_keywords', keywords) + with open(pycsw_config_file, 'wb') as configfile: + pycsw_config.write(configfile) + + def load(pycsw_config, ckan_url): database = pycsw_config.get('repository', 'database') @@ -40,7 +60,6 @@ def load(pycsw_config, ckan_url): context = pycsw.config.StaticContext() repo = repository.Repository(database, context, table=table_name) - ckan_url = ckan_url.lstrip('/') + '/' log.info('Started gathering CKAN datasets identifiers: {0}'.format(str(datetime.datetime.now()))) @@ -85,6 +104,7 @@ def load(pycsw_config, ckan_url): deleted = set(existing_records) - set(gathered_records) changed = set() + sys.exit(1) for key in set(gathered_records) & set(existing_records): if gathered_records[key]['metadata_modified'] > existing_records[key]: changed.add(key) @@ -179,6 +199,9 @@ Manages the CKAN-pycsw integration python ckan-pycsw.py setup [-p] Setups the necessary pycsw table on the db. + python ckan-pycsw.py set_keywords [-p] -u + Sets pycsw server metadata keywords from CKAN site tag list. + python ckan-pycsw.py load [-p] -u Loads CKAN datasets as records into the pycsw db. @@ -237,10 +260,14 @@ if __name__ == '__main__': if arg.command == 'setup': setup_db(pycsw_config) - elif arg.command == 'load': + elif arg.command in ['load', 'set_keywords']: if not arg.ckan_url: raise AssertionError('You need to provide a CKAN URL with -u or --ckan_url') - load(pycsw_config, arg.ckan_url) + ckan_url = arg.ckan_url.rstrip('/') + '/' + if arg.command == 'load': + load(pycsw_config, ckan_url) + else: + set_keywords(arg.pycsw_config, pycsw_config, ckan_url) elif arg.command == 'clear': clear(pycsw_config) else: diff --git a/ckanext/spatial/commands/csw.py b/ckanext/spatial/commands/csw.py index 3375d8b..7d53b9d 100644 --- a/ckanext/spatial/commands/csw.py +++ b/ckanext/spatial/commands/csw.py @@ -11,6 +11,9 @@ class Pycsw(script.command.Command): ckan-pycsw setup [-p] Setups the necessary pycsw table on the db. + ckan-pycsw set_keywords [-p] [-u] + Sets pycsw server metadata keywords from CKAN site tag list. + ckan-pycsw load [-p] [-u] Loads CKAN datasets as records into the pycsw db. @@ -51,9 +54,12 @@ option: cmd = self.args[0] if cmd == 'setup': ckan_pycsw.setup_db(config) - elif cmd == 'load': + elif cmd in ['load', 'set_keywords']: ckan_url = self.options.ckan_url - ckan_pycsw.load(config, ckan_url) + if cmd == 'load': + ckan_pycsw.load(config, ckan_url) + else: + ckan_pycsw.set_keywords(self.options.pycsw_config, config, ckan_url) elif cmd == 'clear': ckan_pycsw.clear(config) else: diff --git a/doc/csw.rst b/doc/csw.rst index ac1459d..857fd73 100644 --- a/doc/csw.rst +++ b/doc/csw.rst @@ -154,6 +154,17 @@ Setup datasets will be synchronized and deleted datasets from CKAN will be removed from pycsw as well. +Setting Service Metadata Keywords ++++++++++++++++++++++++++++++++++ + +The CSW standard allows for administrators to set CSW service metadata. These +values can be set in the pycsw configuration ``metadata:main`` section. If you +would like the CSW service metadata keywords to be reflective of the CKAN +tags, run the following: + + paster ckan-pycsw set_keywords -p /etc/ckan/default/pycsw.cfg + + Running it on production site +++++++++++++++++++++++++++++ @@ -165,6 +176,7 @@ keep CKAN and pycsw in sync, and serve pycsw with Apache + mod_wsgi like CKAN. # m h dom mon dow command 0 * * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-spatial ckan-pycsw load -p /etc/ckan/default/pycsw.cfg + 0 0 * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-spatial ckan-pycsw set_keywords -p /etc/ckan/default/pycsw.cfg This particular example will run the load command every hour. You can of course modify this periodicity, for instance reducing it for huge instances. diff --git a/pip-requirements.txt b/pip-requirements.txt index 913ea46..1042bd4 100644 --- a/pip-requirements.txt +++ b/pip-requirements.txt @@ -5,3 +5,4 @@ lxml>=2.3 argparse pyparsing==1.5.6 requests +jinja2 From 82e61c95ad12a4aa13da01b2794ff0e50b61a39e Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Mon, 18 Nov 2013 16:34:07 -0500 Subject: [PATCH 2/5] remove debugging --- bin/ckan_pycsw.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/ckan_pycsw.py b/bin/ckan_pycsw.py index 6b72008..cbe7d6b 100644 --- a/bin/ckan_pycsw.py +++ b/bin/ckan_pycsw.py @@ -104,7 +104,6 @@ def load(pycsw_config, ckan_url): deleted = set(existing_records) - set(gathered_records) changed = set() - sys.exit(1) for key in set(gathered_records) & set(existing_records): if gathered_records[key]['metadata_modified'] > existing_records[key]: changed.add(key) From d825f5daea1d52ff10cb2f2041a81255fe2c1362 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Mon, 18 Nov 2013 16:35:48 -0500 Subject: [PATCH 3/5] remove unused lib --- pip-requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/pip-requirements.txt b/pip-requirements.txt index 1042bd4..913ea46 100644 --- a/pip-requirements.txt +++ b/pip-requirements.txt @@ -5,4 +5,3 @@ lxml>=2.3 argparse pyparsing==1.5.6 requests -jinja2 From 2e1c82aa19c87ea2352cf31c69eaee21e8615bf7 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Fri, 22 Nov 2013 08:01:00 -0500 Subject: [PATCH 4/5] update docstring --- bin/ckan_pycsw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/ckan_pycsw.py b/bin/ckan_pycsw.py index cbe7d6b..471c71c 100644 --- a/bin/ckan_pycsw.py +++ b/bin/ckan_pycsw.py @@ -34,7 +34,7 @@ def setup_db(pycsw_config): def set_keywords(pycsw_config_file, pycsw_config, ckan_url, limit=20): - """set pycsw service metadata keywords from top 10 CKAN tags""" + """set pycsw service metadata keywords from top limit CKAN tags""" log.info('Fetching tags from %s', ckan_url) url = ckan_url + 'api/tag_counts' From 0f8fa75b089064ef38b1c8bb33b1c501c12e3a48 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Fri, 22 Nov 2013 08:52:32 -0500 Subject: [PATCH 5/5] small doc fixes --- doc/csw.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/csw.rst b/doc/csw.rst index 857fd73..405b56c 100644 --- a/doc/csw.rst +++ b/doc/csw.rst @@ -160,10 +160,12 @@ Setting Service Metadata Keywords The CSW standard allows for administrators to set CSW service metadata. These values can be set in the pycsw configuration ``metadata:main`` section. If you would like the CSW service metadata keywords to be reflective of the CKAN -tags, run the following: +tags, run the following convenience command: paster ckan-pycsw set_keywords -p /etc/ckan/default/pycsw.cfg +Note that you must have privileges to write to the pycsw configuration file. + Running it on production site +++++++++++++++++++++++++++++ @@ -176,7 +178,6 @@ keep CKAN and pycsw in sync, and serve pycsw with Apache + mod_wsgi like CKAN. # m h dom mon dow command 0 * * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-spatial ckan-pycsw load -p /etc/ckan/default/pycsw.cfg - 0 0 * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-spatial ckan-pycsw set_keywords -p /etc/ckan/default/pycsw.cfg This particular example will run the load command every hour. You can of course modify this periodicity, for instance reducing it for huge instances.