diff --git a/bin/ckan_pycsw.py b/bin/ckan_pycsw.py index e8508bf..47e171b 100644 --- a/bin/ckan_pycsw.py +++ b/bin/ckan_pycsw.py @@ -2,6 +2,9 @@ import sys import logging import datetime import io +import os +import argparse +from six.moves.configparser import SafeConfigParser import requests from lxml import etree @@ -10,58 +13,66 @@ from pycsw.core import metadata, repository, util import pycsw.core.config import pycsw.core.admin -logging.basicConfig(format='%(message)s', level=logging.INFO) +logging.basicConfig(format="%(message)s", level=logging.INFO) log = logging.getLogger(__name__) + def setup_db(pycsw_config): """Setup database tables and indexes""" from sqlalchemy import Column, Text - database = pycsw_config.get('repository', 'database') - table_name = pycsw_config.get('repository', 'table', 'records') + database = pycsw_config.get("repository", "database") + table_name = pycsw_config.get("repository", "table", "records") ckan_columns = [ - Column('ckan_id', Text, index=True), - Column('ckan_modified', Text), + Column("ckan_id", Text, index=True), + Column("ckan_modified", Text), ] - pycsw.core.admin.setup_db(database, - table_name, '', + pycsw.core.admin.setup_db( + database, + table_name, + "", create_plpythonu_functions=False, - extra_columns=ckan_columns) + extra_columns=ckan_columns, + ) def set_keywords(pycsw_config_file, pycsw_config, ckan_url, limit=20): """set pycsw service metadata keywords from top limit CKAN tags""" - log.info('Fetching tags from %s', ckan_url) - url = ckan_url + 'api/tag_counts' + log.info("Fetching tags from %s", ckan_url) + url = ckan_url + "api/tag_counts" response = requests.get(url) tags = response.json() - log.info('Deriving top %d tags', limit) + log.info("Deriving top %d tags", limit) # uniquify and sort by top limit tags_unique = [list(x) for x in set(tuple(x) for x in tags)] tags_sorted = sorted(tags_unique, key=lambda x: x[1], reverse=1)[0:limit] - keywords = ','.join('%s' % tn[0] for tn in tags_sorted) + keywords 
= ",".join("%s" % tn[0] for tn in tags_sorted) - log.info('Setting tags in pycsw configuration file %s', pycsw_config_file) - pycsw_config.set('metadata:main', 'identification_keywords', keywords) - with open(pycsw_config_file, 'wb') as configfile: + log.info("Setting tags in pycsw configuration file %s", pycsw_config_file) + pycsw_config.set("metadata:main", "identification_keywords", keywords) + with open(pycsw_config_file, "wb") as configfile: pycsw_config.write(configfile) def load(pycsw_config, ckan_url): - database = pycsw_config.get('repository', 'database') - table_name = pycsw_config.get('repository', 'table', 'records') + database = pycsw_config.get("repository", "database") + table_name = pycsw_config.get("repository", "table", "records") context = pycsw.core.config.StaticContext() repo = repository.Repository(database, context, table=table_name) - log.info('Started gathering CKAN datasets identifiers: {0}'.format(str(datetime.datetime.now()))) + log.info( + "Started gathering CKAN datasets identifiers: {0}".format( + str(datetime.datetime.now()) + ) + ) query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,extras_metadata_source", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}' @@ -75,23 +86,25 @@ def load(pycsw_config, ckan_url): response = requests.get(url) listing = response.json() if not isinstance(listing, dict): - raise RuntimeError('Wrong API response: %s' % listing) - results = listing.get('results') + raise RuntimeError("Wrong API response: %s" % listing) + results = listing.get("results") if not results: break for result in results: - gathered_records[result['id']] = { - 'metadata_modified': result['metadata_modified'], - 'harvest_object_id': result['extras']['harvest_object_id'], - 'source': result['extras'].get('metadata_source') + gathered_records[result["id"]] = { + "metadata_modified": result["metadata_modified"], + "harvest_object_id": result["extras"]["harvest_object_id"], + "source": 
result["extras"].get("metadata_source"), } start = start + 1000 - log.debug('Gathered %s' % start) + log.debug("Gathered %s" % start) - log.info('Gather finished ({0} datasets): {1}'.format( - len(gathered_records.keys()), - str(datetime.datetime.now()))) + log.info( + "Gather finished ({0} datasets): {1}".format( + len(gathered_records.keys()), str(datetime.datetime.now()) + ) + ) existing_records = {} @@ -105,17 +118,16 @@ def load(pycsw_config, ckan_url): changed = set() for key in set(gathered_records) & set(existing_records): - if gathered_records[key]['metadata_modified'] > existing_records[key]: + if gathered_records[key]["metadata_modified"] > existing_records[key]: changed.add(key) for ckan_id in deleted: try: repo.session.begin() - repo.session.query(repo.dataset.ckan_id).filter_by( - ckan_id=ckan_id).delete() - log.info('Deleted %s' % ckan_id) + repo.session.query(repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete() + log.info("Deleted %s" % ckan_id) repo.session.commit() - except Exception as err: + except Exception: repo.session.rollback() raise @@ -123,76 +135,81 @@ def load(pycsw_config, ckan_url): ckan_info = gathered_records[ckan_id] record = get_record(context, repo, ckan_url, ckan_id, ckan_info) if not record: - log.info('Skipped record %s' % ckan_id) + log.info("Skipped record %s" % ckan_id) continue try: - repo.insert(record, 'local', util.get_today_and_now()) - log.info('Inserted %s' % ckan_id) + repo.insert(record, "local", util.get_today_and_now()) + log.info("Inserted %s" % ckan_id) except Exception as err: - log.error('ERROR: not inserted %s Error:%s' % (ckan_id, err)) + log.error("ERROR: not inserted %s Error:%s" % (ckan_id, err)) for ckan_id in changed: ckan_info = gathered_records[ckan_id] record = get_record(context, repo, ckan_url, ckan_id, ckan_info) if not record: continue - update_dict = dict([(getattr(repo.dataset, key), - getattr(record, key)) \ - for key in record.__dict__.keys() if key != '_sa_instance_state']) + 
update_dict = dict( + [ + (getattr(repo.dataset, key), getattr(record, key)) + for key in record.__dict__.keys() + if key != "_sa_instance_state" + ] + ) try: repo.session.begin() - repo.session.query(repo.dataset).filter_by( - ckan_id=ckan_id).update(update_dict) + repo.session.query(repo.dataset).filter_by(ckan_id=ckan_id).update( + update_dict + ) repo.session.commit() - log.info('Changed %s' % ckan_id) + log.info("Changed %s" % ckan_id) except Exception as err: repo.session.rollback() - raise RuntimeError('ERROR: %s' % str(err)) + raise RuntimeError("ERROR: %s" % str(err)) def clear(pycsw_config): from sqlalchemy import create_engine, MetaData, Table - database = pycsw_config.get('repository', 'database') - table_name = pycsw_config.get('repository', 'table', 'records') + database = pycsw_config.get("repository", "database") + table_name = pycsw_config.get("repository", "table", "records") - log.debug('Creating engine') + log.debug("Creating engine") engine = create_engine(database) records = Table(table_name, MetaData(engine)) records.delete().execute() - log.info('Table cleared') + log.info("Table cleared") def get_record(context, repo, ckan_url, ckan_id, ckan_info): - query = ckan_url + 'harvest/object/%s' - url = query % ckan_info['harvest_object_id'] + query = ckan_url + "harvest/object/%s" + url = query % ckan_info["harvest_object_id"] response = requests.get(url) - if ckan_info['source'] == 'arcgis': + if ckan_info["source"] == "arcgis": return try: xml = etree.parse(io.BytesIO(response.content)) except Exception as err: - log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err)) + log.error("Could not pass xml doc from %s, Error: %s" % (ckan_id, err)) return try: record = metadata.parse_record(context, xml, repo)[0] except Exception as err: - log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err)) + log.error("Could not extract metadata from %s, Error: %s" % (ckan_id, err)) return if not record.identifier: 
record.identifier = ckan_id record.ckan_id = ckan_id - record.ckan_modified = ckan_info['metadata_modified'] + record.ckan_modified = ckan_info["metadata_modified"] return record -usage=''' +usage = """ Manages the CKAN-pycsw integration python ckan-pycsw.py setup [-p] @@ -211,18 +228,19 @@ All commands require the pycsw configuration file. By default it will try to find a file called 'default.cfg' in the same directory, but you'll probably need to provide the actual location via the -p option: - paster ckan-pycsw setup -p /etc/ckan/default/pycsw.cfg + python ckan_pycsw.py setup -p /etc/ckan/default/pycsw.cfg The load command requires a CKAN URL from where the datasets will be pulled: - paster ckan-pycsw load -p /etc/ckan/default/pycsw.cfg -u http://localhost + python ckan_pycsw.py load -p /etc/ckan/default/pycsw.cfg -u http://localhost + +""" -''' def _load_config(file_path): abs_path = os.path.abspath(file_path) if not os.path.exists(abs_path): - raise AssertionError('pycsw config file {0} does not exist.'.format(abs_path)) + raise AssertionError("pycsw config file {0} does not exist.".format(abs_path)) config = SafeConfigParser() config.read(abs_path) @@ -230,25 +248,24 @@ def _load_config(file_path): return config +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="\n".split(usage)[0], usage=usage) + parser.add_argument("command", help="Command to perform") -import os -import argparse -from ConfigParser import SafeConfigParser + parser.add_argument( + "-p", + "--pycsw_config", + action="store", + default="default.cfg", + help="pycsw config file to use.", + ) -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='\n'.split(usage)[0], - usage=usage) - parser.add_argument('command', - help='Command to perform') - - parser.add_argument('-p', '--pycsw_config', - action='store', default='default.cfg', - help='pycsw config file to use.') - - parser.add_argument('-u', '--ckan_url', - action='store', - help='CKAN instance 
to import the datasets from.') + parser.add_argument( + "-u", + "--ckan_url", + action="store", + help="CKAN instance to import the datasets from.", + ) if len(sys.argv) <= 1: parser.print_usage() @@ -257,18 +274,18 @@ if __name__ == '__main__': arg = parser.parse_args() pycsw_config = _load_config(arg.pycsw_config) - if arg.command == 'setup': + if arg.command == "setup": setup_db(pycsw_config) - elif arg.command in ['load', 'set_keywords']: + elif arg.command in ["load", "set_keywords"]: if not arg.ckan_url: - raise AssertionError('You need to provide a CKAN URL with -u or --ckan_url') - ckan_url = arg.ckan_url.rstrip('/') + '/' - if arg.command == 'load': + raise AssertionError("You need to provide a CKAN URL with -u or --ckan_url") + ckan_url = arg.ckan_url.rstrip("/") + "/" + if arg.command == "load": load(pycsw_config, ckan_url) else: set_keywords(arg.pycsw_config, pycsw_config, ckan_url) - elif arg.command == 'clear': + elif arg.command == "clear": clear(pycsw_config) else: - print('Unknown command {0}'.format(arg.command)) + print("Unknown command {0}".format(arg.command)) sys.exit(1) diff --git a/ckanext/spatial/cli.py b/ckanext/spatial/cli.py index 7be5a0b..a1a3008 100644 --- a/ckanext/spatial/cli.py +++ b/ckanext/spatial/cli.py @@ -14,7 +14,7 @@ def get_commands(): ] -@click.group(u"spatial-validation", short_help=u"Validation commands") +@click.group(u"spatial-validation", short_help=u"Spatial formats validation commands") def spatial_validation(): pass @@ -22,18 +22,28 @@ def spatial_validation(): @spatial_validation.command() @click.argument('pkg', required=False) def report(pkg): + """ + Performs validation on the harvested metadata, either for all + packages or the one specified. + """ + return util.report(pkg) @spatial_validation.command('report-csv') @click.argument('filepath') def report_csv(filepath): + """ + Performs validation on all the harvested metadata in the db and + writes a report in CSV format to the given filepath. 
+ """ return util.report_csv(filepath) @spatial_validation.command('file') @click.argument('filepath') def validate_file(filepath): + """Performs validation on the given metadata file.""" return util.validate_file(filepath) @@ -45,9 +55,19 @@ def spatial(): @spatial.command() @click.argument('srid', required=False) def initdb(srid): + """ + Creates the necessary tables. You must have PostGIS installed + and configured in the database. + You can provide the SRID of the geometry column. Default is 4326. + """ return util.initdb(srid) @spatial.command('extents') def update_extents(): + """ + Creates or updates the extent geometry column for datasets with + an extent defined in the 'spatial' extra. + """ + return util.update_extents() diff --git a/ckanext/spatial/tests/test_csw_client.py b/ckanext/spatial/tests/test_csw_client.py deleted file mode 100644 index 032f34a..0000000 --- a/ckanext/spatial/tests/test_csw_client.py +++ /dev/null @@ -1,71 +0,0 @@ -import time -from six.moves.urllib.request import urlopen -from six.moves.urllib.error import URLError -import os - -import pytest - -from ckan.plugins.toolkit import config - -from ckan.model import engine_is_sqlite - - -# copied from ckan/tests/__init__ to save importing it and therefore -# setting up Pylons. 
-class CkanServerCase(object): - @staticmethod - def _system(cmd): - import subprocess - - (status, output) = subprocess.getstatusoutput(cmd) - if status: - raise Exception("Couldn't execute cmd: %s: %s" % (cmd, output)) - - @classmethod - def _paster(cls, cmd, config_path_rel): - config_path = os.path.join(config["here"], config_path_rel) - cls._system("paster --plugin ckan %s --config=%s" % (cmd, config_path)) - - @staticmethod - def _start_ckan_server(config_file=None): - if not config_file: - config_file = config["__file__"] - config_path = config_file - import subprocess - - process = subprocess.Popen(["paster", "serve", config_path]) - return process - - @staticmethod - def _wait_for_url(url="http://127.0.0.1:5000/", timeout=15): - for i in range(int(timeout) * 100): - try: - urlopen(url) - except URLError: - time.sleep(0.01) - else: - break - - @staticmethod - def _stop_ckan_server(process): - pid = process.pid - pid = int(pid) - if os.system("kill -9 %d" % pid): - raise Exception( - "Can't kill foreign CKAN instance (pid: %d)." % pid - ) - - -class CkanProcess(CkanServerCase): - @classmethod - def setup_class(cls): - if engine_is_sqlite(): - return pytest.skip("Non-memory database needed for this test") - - cls.pid = cls._start_ckan_server() - ## Don't need to init database, since it is same database as this process uses - cls._wait_for_url() - - @classmethod - def teardown_class(cls): - cls._stop_ckan_server(cls.pid) diff --git a/doc/csw.rst b/doc/csw.rst index db32fe4..3c4cf23 100644 --- a/doc/csw.rst +++ b/doc/csw.rst @@ -55,7 +55,7 @@ All necessary tasks are done with the ``ckan-pycsw`` command. To get more details of its usage, run the following:: cd /usr/lib/ckan/default/src/ckanext-spatial - paster ckan-pycsw --help + python bin/ckan_pycsw.py --help Setup @@ -114,11 +114,11 @@ Setup The rest of the options are described `here `_. -4. Setup the pycsw table. This is done with the ``ckan-pycsw`` paster command +4. Setup the pycsw table. 
This is done with the ``ckan-pycsw`` script (Remember to have the virtualenv activated when running it):: cd /usr/lib/ckan/default/src/ckanext-spatial - paster ckan-pycsw setup -p /etc/ckan/default/pycsw.cfg + python bin/ckan_pycsw.py setup -p /etc/ckan/default/pycsw.cfg At this point you should be ready to run pycsw with the wsgi script that it includes:: @@ -135,7 +135,7 @@ Setup command for this:: cd /usr/lib/ckan/default/src/ckanext-spatial - paster ckan-pycsw load -p /etc/ckan/default/pycsw.cfg + python bin/ckan_pycsw.py load -p /etc/ckan/default/pycsw.cfg When the loading is finished, check that results are returned when visiting this link: @@ -155,7 +155,7 @@ values can be set in the pycsw configuration ``metadata:main`` section. If you would like the CSW service metadata keywords to be reflective of the CKAN tags, run the following convenience command:: - paster ckan-pycsw set_keywords -p /etc/ckan/default/pycsw.cfg + python bin/ckan_pycsw.py set_keywords -p /etc/ckan/default/pycsw.cfg Note that you must have privileges to write to the pycsw configuration file. @@ -170,7 +170,7 @@ keep CKAN and pycsw in sync, and serve pycsw with Apache + mod_wsgi like CKAN. and copy the following lines:: # m h dom mon dow command - 0 * * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-spatial ckan-pycsw load -p /etc/ckan/default/pycsw.cfg + 0 * * * * /usr/lib/ckan/default/bin/python /usr/lib/ckan/default/src/ckanext-spatial/bin/ckan_pycsw.py load -p /etc/ckan/default/pycsw.cfg This particular example will run the load command every hour. You can of course modify this periodicity, for instance reducing it for huge instances. diff --git a/doc/install.rst b/doc/install.rst index 5c4bcf3..eab0a97 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -140,6 +140,10 @@ plugins on the configuration ini file (eg when restarting Apache).
If for some reason you need to explicitly create the table beforehand, you can do it with the following command (with the virtualenv activated):: + (pyenv) $ ckan --config=mysite.ini spatial initdb [srid] + +On CKAN 2.8 and below use:: + (pyenv) $ paster --plugin=ckanext-spatial spatial initdb [srid] --config=mysite.ini You can define the SRID of the geometry column. Default is 4326. If you are not diff --git a/doc/spatial-search.rst b/doc/spatial-search.rst index cd6986e..4aa14a2 100644 --- a/doc/spatial-search.rst +++ b/doc/spatial-search.rst @@ -61,6 +61,10 @@ synchronize the information stored in the extra with the geometry table. If you already have datasets when you enable Spatial Search then you'll need to reindex them: + ckan --config=/etc/ckan/default/development.ini search-index rebuild + +.. note:: For CKAN 2.8 and below use: + paster --plugin=ckan search-index rebuild --config=/etc/ckan/default/development.ini