Update CLI docstrings and documentation, remove unused file

This commit is contained in:
amercader 2021-05-28 13:27:12 +02:00
parent bb2bf38c72
commit cdf4b70bb7
6 changed files with 135 additions and 161 deletions

View File

@ -2,6 +2,9 @@ import sys
import logging
import datetime
import io
import os
import argparse
from six.moves.configparser import SafeConfigParser
import requests
from lxml import etree
@ -10,58 +13,66 @@ from pycsw.core import metadata, repository, util
import pycsw.core.config
import pycsw.core.admin
logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.basicConfig(format="%(message)s", level=logging.INFO)
log = logging.getLogger(__name__)
def setup_db(pycsw_config):
    """Create the pycsw repository table, including the CKAN tracking columns.

    Reads the ``database`` option (required) and the ``table`` option
    (optional, falls back to ``records``) from the ``repository`` section of
    the pycsw configuration and passes them to ``pycsw.core.admin.setup_db``
    together with two extra text columns: ``ckan_id`` (indexed) and
    ``ckan_modified``.

    :param pycsw_config: parsed pycsw configuration; a ``ConfigParser``-style
        object providing ``get`` and ``has_option``.
    """
    from sqlalchemy import Column, Text

    database = pycsw_config.get("repository", "database")

    # ConfigParser.get() takes no positional default: a third positional
    # argument raises TypeError on Python 3 and is interpreted as ``raw`` on
    # Python 2. Test for the option explicitly so "records" really is the
    # fallback on both.
    if pycsw_config.has_option("repository", "table"):
        table_name = pycsw_config.get("repository", "table")
    else:
        table_name = "records"

    ckan_columns = [
        Column("ckan_id", Text, index=True),
        Column("ckan_modified", Text),
    ]

    pycsw.core.admin.setup_db(
        database,
        table_name,
        "",
        create_plpythonu_functions=False,
        extra_columns=ckan_columns,
    )
def set_keywords(pycsw_config_file, pycsw_config, ckan_url, limit=20):
    """Set the pycsw service metadata keywords from the top CKAN tags.

    Fetches ``<ckan_url>api/tag_counts``, de-duplicates the returned entries,
    keeps the ``limit`` entries with the highest second element and stores the
    first element of each, comma separated, in the
    ``identification_keywords`` option of the ``metadata:main`` section. The
    updated configuration is then written back to ``pycsw_config_file``.

    :param pycsw_config_file: path of the pycsw configuration file to rewrite.
    :param pycsw_config: parsed pycsw configuration object.
    :param ckan_url: base URL of the CKAN instance, ending in ``/``.
    :param limit: maximum number of tags to keep.
    """
    log.info("Fetching tags from %s", ckan_url)
    url = ckan_url + "api/tag_counts"
    response = requests.get(url)
    tags = response.json()

    log.info("Deriving top %d tags", limit)
    # Entries are presumably (name, count) pairs -- verify against the CKAN
    # API. Tuples make them hashable so duplicates can be dropped.
    unique_tags = {tuple(tag) for tag in tags}
    top_tags = sorted(unique_tags, key=lambda tag: tag[1], reverse=True)[:limit]
    keywords = ",".join("%s" % tag[0] for tag in top_tags)

    log.info("Setting tags in pycsw configuration file %s", pycsw_config_file)
    pycsw_config.set("metadata:main", "identification_keywords", keywords)
    # ConfigParser.write() emits text, so the file must be opened in text
    # mode; a binary handle raises TypeError on Python 3.
    with open(pycsw_config_file, "w") as configfile:
        pycsw_config.write(configfile)
def load(pycsw_config, ckan_url):
database = pycsw_config.get('repository', 'database')
table_name = pycsw_config.get('repository', 'table', 'records')
database = pycsw_config.get("repository", "database")
table_name = pycsw_config.get("repository", "table", "records")
context = pycsw.core.config.StaticContext()
repo = repository.Repository(database, context, table=table_name)
log.info('Started gathering CKAN datasets identifiers: {0}'.format(str(datetime.datetime.now())))
log.info(
"Started gathering CKAN datasets identifiers: {0}".format(
str(datetime.datetime.now())
)
)
query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,extras_metadata_source", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'
@ -75,23 +86,25 @@ def load(pycsw_config, ckan_url):
response = requests.get(url)
listing = response.json()
if not isinstance(listing, dict):
raise RuntimeError('Wrong API response: %s' % listing)
results = listing.get('results')
raise RuntimeError("Wrong API response: %s" % listing)
results = listing.get("results")
if not results:
break
for result in results:
gathered_records[result['id']] = {
'metadata_modified': result['metadata_modified'],
'harvest_object_id': result['extras']['harvest_object_id'],
'source': result['extras'].get('metadata_source')
gathered_records[result["id"]] = {
"metadata_modified": result["metadata_modified"],
"harvest_object_id": result["extras"]["harvest_object_id"],
"source": result["extras"].get("metadata_source"),
}
start = start + 1000
log.debug('Gathered %s' % start)
log.debug("Gathered %s" % start)
log.info('Gather finished ({0} datasets): {1}'.format(
len(gathered_records.keys()),
str(datetime.datetime.now())))
log.info(
"Gather finished ({0} datasets): {1}".format(
len(gathered_records.keys()), str(datetime.datetime.now())
)
)
existing_records = {}
@ -105,17 +118,16 @@ def load(pycsw_config, ckan_url):
changed = set()
for key in set(gathered_records) & set(existing_records):
if gathered_records[key]['metadata_modified'] > existing_records[key]:
if gathered_records[key]["metadata_modified"] > existing_records[key]:
changed.add(key)
for ckan_id in deleted:
try:
repo.session.begin()
repo.session.query(repo.dataset.ckan_id).filter_by(
ckan_id=ckan_id).delete()
log.info('Deleted %s' % ckan_id)
repo.session.query(repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
log.info("Deleted %s" % ckan_id)
repo.session.commit()
except Exception as err:
except Exception:
repo.session.rollback()
raise
@ -123,76 +135,81 @@ def load(pycsw_config, ckan_url):
ckan_info = gathered_records[ckan_id]
record = get_record(context, repo, ckan_url, ckan_id, ckan_info)
if not record:
log.info('Skipped record %s' % ckan_id)
log.info("Skipped record %s" % ckan_id)
continue
try:
repo.insert(record, 'local', util.get_today_and_now())
log.info('Inserted %s' % ckan_id)
repo.insert(record, "local", util.get_today_and_now())
log.info("Inserted %s" % ckan_id)
except Exception as err:
log.error('ERROR: not inserted %s Error:%s' % (ckan_id, err))
log.error("ERROR: not inserted %s Error:%s" % (ckan_id, err))
for ckan_id in changed:
ckan_info = gathered_records[ckan_id]
record = get_record(context, repo, ckan_url, ckan_id, ckan_info)
if not record:
continue
update_dict = dict([(getattr(repo.dataset, key),
getattr(record, key)) \
for key in record.__dict__.keys() if key != '_sa_instance_state'])
update_dict = dict(
[
(getattr(repo.dataset, key), getattr(record, key))
for key in record.__dict__.keys()
if key != "_sa_instance_state"
]
)
try:
repo.session.begin()
repo.session.query(repo.dataset).filter_by(
ckan_id=ckan_id).update(update_dict)
repo.session.query(repo.dataset).filter_by(ckan_id=ckan_id).update(
update_dict
)
repo.session.commit()
log.info('Changed %s' % ckan_id)
log.info("Changed %s" % ckan_id)
except Exception as err:
repo.session.rollback()
raise RuntimeError('ERROR: %s' % str(err))
raise RuntimeError("ERROR: %s" % str(err))
def clear(pycsw_config):
    """Delete every row from the pycsw records table.

    Reads the ``database`` option (required) and the ``table`` option
    (optional, falls back to ``records``) from the ``repository`` section of
    the pycsw configuration, then issues an unconditional ``DELETE`` against
    that table.

    :param pycsw_config: parsed pycsw configuration; a ``ConfigParser``-style
        object providing ``get`` and ``has_option``.
    """
    from sqlalchemy import create_engine, MetaData, Table

    database = pycsw_config.get("repository", "database")

    # ConfigParser.get() takes no positional default (TypeError on Python 3,
    # read as ``raw`` on Python 2), so look the option up explicitly.
    if pycsw_config.has_option("repository", "table"):
        table_name = pycsw_config.get("repository", "table")
    else:
        table_name = "records"

    log.debug("Creating engine")
    engine = create_engine(database)
    # NOTE(review): bound MetaData and implicit ``.execute()`` look like the
    # legacy SQLAlchemy style -- confirm the installed version still
    # supports them.
    records = Table(table_name, MetaData(engine))
    records.delete().execute()
    log.info("Table cleared")
def get_record(context, repo, ckan_url, ckan_id, ckan_info):
    """Build a pycsw record for one CKAN dataset from its harvested XML.

    Downloads ``<ckan_url>harvest/object/<harvest_object_id>``, parses the
    body as XML and lets ``metadata.parse_record`` turn it into a record. The
    record is then stamped with ``ckan_id`` and ``ckan_modified``, and its
    ``identifier`` defaults to ``ckan_id`` when empty.

    :param context: pycsw context handed to ``metadata.parse_record``.
    :param repo: pycsw repository handed to ``metadata.parse_record``.
    :param ckan_url: base URL of the CKAN instance, ending in ``/``.
    :param ckan_id: id of the CKAN dataset.
    :param ckan_info: dict with the ``source``, ``harvest_object_id`` and
        ``metadata_modified`` keys gathered for the dataset.
    :returns: the populated record, or ``None`` when the dataset comes from
        an ``arcgis`` source, the XML cannot be parsed, or no record can be
        extracted from it.
    """
    # Records from "arcgis" sources are never loaded, so decide that before
    # paying for an HTTP round trip whose response would be thrown away.
    if ckan_info["source"] == "arcgis":
        return None

    url = ckan_url + "harvest/object/%s" % ckan_info["harvest_object_id"]
    response = requests.get(url)

    try:
        xml = etree.parse(io.BytesIO(response.content))
    except Exception as err:
        log.error("Could not pass xml doc from %s, Error: %s", ckan_id, err)
        return None

    try:
        record = metadata.parse_record(context, xml, repo)[0]
    except Exception as err:
        log.error("Could not extract metadata from %s, Error: %s", ckan_id, err)
        return None

    if not record.identifier:
        record.identifier = ckan_id
    record.ckan_id = ckan_id
    record.ckan_modified = ckan_info["metadata_modified"]

    return record
usage='''
usage = """
Manages the CKAN-pycsw integration
python ckan-pycsw.py setup [-p]
@ -211,18 +228,19 @@ All commands require the pycsw configuration file. By default it will try
to find a file called 'default.cfg' in the same directory, but you'll
probably need to provide the actual location via the -p option:
paster ckan-pycsw setup -p /etc/ckan/default/pycsw.cfg
python ckan_pycsw.py setup -p /etc/ckan/default/pycsw.cfg
The load command requires a CKAN URL from where the datasets will be pulled:
paster ckan-pycsw load -p /etc/ckan/default/pycsw.cfg -u http://localhost
python ckan_pycsw.py load -p /etc/ckan/default/pycsw.cfg -u http://localhost
"""
'''
def _load_config(file_path):
abs_path = os.path.abspath(file_path)
if not os.path.exists(abs_path):
raise AssertionError('pycsw config file {0} does not exist.'.format(abs_path))
raise AssertionError("pycsw config file {0} does not exist.".format(abs_path))
config = SafeConfigParser()
config.read(abs_path)
@ -230,25 +248,24 @@ def _load_config(file_path):
return config
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="\n".split(usage)[0], usage=usage)
parser.add_argument("command", help="Command to perform")
import os
import argparse
from ConfigParser import SafeConfigParser
parser.add_argument(
"-p",
"--pycsw_config",
action="store",
default="default.cfg",
help="pycsw config file to use.",
)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='\n'.split(usage)[0],
usage=usage)
parser.add_argument('command',
help='Command to perform')
parser.add_argument('-p', '--pycsw_config',
action='store', default='default.cfg',
help='pycsw config file to use.')
parser.add_argument('-u', '--ckan_url',
action='store',
help='CKAN instance to import the datasets from.')
parser.add_argument(
"-u",
"--ckan_url",
action="store",
help="CKAN instance to import the datasets from.",
)
if len(sys.argv) <= 1:
parser.print_usage()
@ -257,18 +274,18 @@ if __name__ == '__main__':
arg = parser.parse_args()
pycsw_config = _load_config(arg.pycsw_config)
if arg.command == 'setup':
if arg.command == "setup":
setup_db(pycsw_config)
elif arg.command in ['load', 'set_keywords']:
elif arg.command in ["load", "set_keywords"]:
if not arg.ckan_url:
raise AssertionError('You need to provide a CKAN URL with -u or --ckan_url')
ckan_url = arg.ckan_url.rstrip('/') + '/'
if arg.command == 'load':
raise AssertionError("You need to provide a CKAN URL with -u or --ckan_url")
ckan_url = arg.ckan_url.rstrip("/") + "/"
if arg.command == "load":
load(pycsw_config, ckan_url)
else:
set_keywords(arg.pycsw_config, pycsw_config, ckan_url)
elif arg.command == 'clear':
elif arg.command == "clear":
clear(pycsw_config)
else:
print('Unknown command {0}'.format(arg.command))
print("Unknown command {0}".format(arg.command))
sys.exit(1)

View File

@ -14,7 +14,7 @@ def get_commands():
]
@click.group(u"spatial-validation", short_help=u"Validation commands")
@click.group(u"spatial-validation", short_help=u"Spatial formats validation commands")
def spatial_validation():
pass
@ -22,18 +22,28 @@ def spatial_validation():
@spatial_validation.command()
@click.argument('pkg', required=False)
def report(pkg):
    """
    Performs validation on the harvested metadata, either for all
    packages or the one specified.
    """
    # The ``pkg`` argument is optional (required=False). All the work is
    # delegated to util.report and its result is returned unchanged.
    return util.report(pkg)
@spatial_validation.command('report-csv')
@click.argument('filepath')
def report_csv(filepath):
    """
    Performs validation on all the harvested metadata in the db and
    writes a report in CSV format to the given filepath.
    """
    # Registered as "report-csv" on the spatial_validation group; the
    # function only forwards ``filepath`` to util.report_csv.
    return util.report_csv(filepath)
@spatial_validation.command('file')
@click.argument('filepath')
def validate_file(filepath):
    """Performs validation on the given metadata file."""
    # Registered as "file" on the spatial_validation group; the function
    # only forwards ``filepath`` to util.validate_file.
    return util.validate_file(filepath)
@ -45,9 +55,19 @@ def spatial():
@spatial.command()
@click.argument('srid', required=False)
def initdb(srid):
    """
    Creates the necessary tables. You must have PostGIS installed
    and configured in the database.
    You can provide the SRID of the geometry column. Default is 4326.
    """
    # ``srid`` is optional (required=False) and is passed through as given;
    # the 4326 default mentioned above is presumably applied inside
    # util.initdb -- verify there.
    return util.initdb(srid)
@spatial.command('extents')
def update_extents():
    """
    Creates or updates the extent geometry column for datasets with
    an extent defined in the 'spatial' extra.
    """
    # Registered as "extents" on the spatial group; takes no arguments and
    # delegates entirely to util.update_extents.
    return util.update_extents()

View File

@ -1,71 +0,0 @@
import time
from six.moves.urllib.request import urlopen
from six.moves.urllib.error import URLError
import os
import pytest
from ckan.plugins.toolkit import config
from ckan.model import engine_is_sqlite
# copied from ckan/tests/__init__ to save importing it and therefore
# setting up Pylons.
class CkanServerCase(object):
    """Helpers for running a CKAN server in a child process during tests."""

    @staticmethod
    def _system(cmd):
        """Run ``cmd`` through the shell.

        :raises Exception: when the command exits with a non-zero status; the
            message carries the command and its captured output.
        """
        import subprocess

        (status, output) = subprocess.getstatusoutput(cmd)
        if status:
            raise Exception("Couldn't execute cmd: %s: %s" % (cmd, output))

    @classmethod
    def _paster(cls, cmd, config_path_rel):
        """Run a ``paster --plugin ckan`` command.

        ``config_path_rel`` is resolved relative to ``config["here"]``.
        """
        config_path = os.path.join(config["here"], config_path_rel)
        cls._system("paster --plugin ckan %s --config=%s" % (cmd, config_path))

    @staticmethod
    def _start_ckan_server(config_file=None):
        """Start ``paster serve`` in a child process and return its handle.

        :param config_file: configuration file to serve; defaults to
            ``config["__file__"]``.
        :returns: the ``subprocess.Popen`` object of the server.
        """
        import subprocess

        if not config_file:
            config_file = config["__file__"]
        return subprocess.Popen(["paster", "serve", config_file])

    @staticmethod
    def _wait_for_url(url="http://127.0.0.1:5000/", timeout=15):
        """Poll ``url`` until it answers or ``timeout`` seconds have passed.

        :returns: ``True`` once the URL could be opened, ``False`` when the
            deadline passed first. Nothing is raised on timeout, so callers
            that ignore the result behave as before.
        """
        # Use a wall-clock deadline rather than an attempt counter, so that
        # slow connection failures cannot stretch the wait far beyond
        # ``timeout``.
        deadline = time.time() + float(timeout)
        while True:
            try:
                # Close the response straight away; only reachability matters.
                urlopen(url).close()
            except URLError:
                if time.time() >= deadline:
                    return False
                time.sleep(0.01)
            else:
                return True

    @staticmethod
    def _stop_ckan_server(process):
        """Kill the server process with SIGKILL and reap it.

        :param process: handle returned by ``_start_ckan_server``.
        :raises Exception: when the signal cannot be delivered.
        """
        import signal

        pid = int(process.pid)
        try:
            # Signal the process directly instead of spawning a shell for
            # ``kill -9``.
            os.kill(pid, signal.SIGKILL)
        except OSError:
            raise Exception(
                "Can't kill foreign CKAN instance (pid: %d)." % pid
            )
        # Collect the exit status so the killed child does not linger as a
        # zombie; tolerate handles that only expose ``pid``.
        wait = getattr(process, "wait", None)
        if wait is not None:
            wait()
class CkanProcess(CkanServerCase):
    """Test base class that keeps a CKAN server running for the whole class."""

    @classmethod
    def setup_class(cls):
        """Start the server and wait for it, unless the engine is SQLite."""
        sqlite_in_use = engine_is_sqlite()
        if sqlite_in_use:
            return pytest.skip("Non-memory database needed for this test")

        # No database initialisation here: the server uses the same database
        # as this process.
        server = cls._start_ckan_server()
        cls.pid = server
        cls._wait_for_url()

    @classmethod
    def teardown_class(cls):
        """Stop the server started by ``setup_class``."""
        handle = cls.pid
        cls._stop_ckan_server(handle)

View File

@ -55,7 +55,7 @@ All necessary tasks are done with the ``ckan-pycsw`` command. To get more
details of its usage, run the following::
cd /usr/lib/ckan/default/src/ckanext-spatial
paster ckan-pycsw --help
python bin/ckan_pycsw.py --help
Setup
@ -114,11 +114,11 @@ Setup
The rest of the options are described `here <http://docs.pycsw.org/en/latest/configuration.html>`_.
4. Setup the pycsw table. This is done with the ``ckan-pycsw`` paster command
4. Set up the pycsw table. This is done with the ``ckan_pycsw.py`` script
(Remember to have the virtualenv activated when running it)::
cd /usr/lib/ckan/default/src/ckanext-spatial
paster ckan-pycsw setup -p /etc/ckan/default/pycsw.cfg
python bin/ckan_pycsw.py setup -p /etc/ckan/default/pycsw.cfg
At this point you should be ready to run pycsw with the wsgi script that it
includes::
@ -135,7 +135,7 @@ Setup
command for this::
cd /usr/lib/ckan/default/src/ckanext-spatial
paster ckan-pycsw load -p /etc/ckan/default/pycsw.cfg
python bin/ckan_pycsw.py load -p /etc/ckan/default/pycsw.cfg -u http://localhost
When the loading is finished, check that results are returned when visiting
this link:
@ -155,7 +155,7 @@ values can be set in the pycsw configuration ``metadata:main`` section. If you
would like the CSW service metadata keywords to be reflective of the CKAN
tags, run the following convenience command::
paster ckan-pycsw set_keywords -p /etc/ckan/default/pycsw.cfg
python bin/ckan_pycsw.py set_keywords -p /etc/ckan/default/pycsw.cfg -u http://localhost
Note that you must have privileges to write to the pycsw configuration file.
@ -170,7 +170,7 @@ keep CKAN and pycsw in sync, and serve pycsw with Apache + mod_wsgi like CKAN.
and copy the following lines::
# m h dom mon dow command
0 * * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-spatial ckan-pycsw load -p /etc/ckan/default/pycsw.cfg
0 * * * * /usr/lib/ckan/default/bin/python /usr/lib/ckan/default/src/ckanext-spatial/bin/ckan_pycsw.py load -p /etc/ckan/default/pycsw.cfg -u http://localhost
This particular example will run the load command every hour. You can of
course modify this periodicity, for instance reducing it for huge instances.

View File

@ -140,6 +140,10 @@ plugins on the configuration ini file (eg when restarting Apache).
If for some reason you need to explicitly create the table beforehand, you can
do it with the following command (with the virtualenv activated)::
(pyenv) $ ckan --config=mysite.ini spatial initdb [srid]
On CKAN 2.8 and below use::
(pyenv) $ paster --plugin=ckanext-spatial spatial initdb [srid] --config=mysite.ini
You can define the SRID of the geometry column. Default is 4326. If you are not

View File

@ -61,6 +61,10 @@ synchronize the information stored in the extra with the geometry table.
If you already have datasets when you enable Spatial Search then you'll need to
reindex them:
ckan --config=/etc/ckan/default/development.ini search-index rebuild
.. note:: For CKAN 2.8 and below use:
paster --plugin=ckan search-index rebuild --config=/etc/ckan/default/development.ini