From e8570b9e508709dd969d3240a941b56e2dfe76fb Mon Sep 17 00:00:00 2001 From: Raphael Stolt Date: Fri, 28 Oct 2016 11:29:27 +0200 Subject: [PATCH 01/13] Add clearsource history command --- CHANGELOG.rst | 3 +- README.rst | 7 +++ ckanext/harvest/commands/harvester.py | 37 ++++++++++++++- ckanext/harvest/logic/action/update.py | 62 ++++++++++++++++++++++++++ ckanext/harvest/logic/auth/update.py | 11 +++++ 5 files changed, 117 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a0bb373..d4b87cd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,4 +3,5 @@ Changelog v0.0.6 `2016-??-??` ------------------- -Includes i18n directory in package. \ No newline at end of file +- Includes i18n directory in package. +- Adds a new `clearsource_history` command/operation. diff --git a/README.rst b/README.rst index d61aefe..2c08da0 100644 --- a/README.rst +++ b/README.rst @@ -189,6 +189,13 @@ The following operations can be run from the command line as described underneat - clears all datasets, jobs and objects related to a harvest source, but keeps the source itself + harvester clearsource_history [{source-id}] + - If no source id is given the history for all harvest sources will be cleared. + Clears all jobs and objects related to a harvest source, but keeps the source + itself. The datasets imported from the harvest source will **NOT** be deleted!!! + If a source id is given, it only clears the history of the harvest source with + the given source id. 
+ harvester sources [all] - lists harvest sources If 'all' is defined, it also shows the Inactive sources diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 738ccae..e125439 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -29,6 +29,12 @@ class Harvester(CkanCommand): - clears all datasets, jobs and objects related to a harvest source, but keeps the source itself + harvester clearsource_history [{source-id}] + - If no source id is given the history for all harvest sources will be cleared. + Clears all jobs and objects related to a harvest source, but keeps the source itself. + The datasets imported from the harvest source will NOT be deleted!!! + If a source id is given, it only clears the history of the harvest source with the given source id. + harvester sources [all] - lists harvest sources If 'all' is defined, it also shows the Inactive sources @@ -153,6 +159,8 @@ class Harvester(CkanCommand): self.remove_harvest_source() elif cmd == 'clearsource': self.clear_harvest_source() + elif cmd == 'clearsource_history': + self.clear_harvest_source_history() elif cmd == 'sources': self.list_harvest_sources() elif cmd == 'job': @@ -182,8 +190,7 @@ class Harvester(CkanCommand): for method, header, body in consumer.consume(queue=get_fetch_queue_name()): fetch_callback(consumer, method, header, body) elif cmd == 'purge_queues': - from ckanext.harvest.queue import purge_queues - purge_queues() + self.purge_queues() elif cmd == 'initdb': self.initdb() elif cmd == 'import': @@ -288,6 +295,29 @@ class Harvester(CkanCommand): print str(e.error_dict) raise e + def clear_harvest_source_history(self): + source_id = None + if len(self.args) >= 2: + source_id = unicode(self.args[1]) + + context = { + 'model': model, + 'user': self.admin_user['name'], + 'session': model.Session + } + if source_id is not None: + get_action('harvest_source_job_history_clear')(context,{'id':source_id}) + print 
'Cleared job history of harvest source: %s' % source_id + else: + ''' + Purge queues, because we clean all harvest jobs and + objects in the database. + ''' + self.purge_queues() + cleared_sources_dicts = get_action('harvest_sources_job_history_clear')(context,{}) + print 'Cleared job history for all harvest sources: %s source(s)' % len(cleared_sources_dicts) + + def show_harvest_source(self): if len(self.args) >= 2: @@ -465,6 +495,9 @@ class Harvester(CkanCommand): context = {'model': model, 'user': self.admin_user['name']} get_action('harvest_sources_reindex')(context,{}) + def purge_queues(self): + from ckanext.harvest.queue import purge_queues + purge_queues() def print_harvest_sources(self, sources): if sources: diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index e4f2041..74e58f5 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -225,6 +225,68 @@ def harvest_source_clear(context, data_dict): return {'id': harvest_source_id} +def harvest_sources_job_history_clear(context, data_dict): + ''' + Clears the history for all active harvest sources. All jobs and objects related to a harvest source will + be cleared, but keeps the source itself. + This is useful to clean history of long running harvest sources to start again fresh. + The datasets imported from the harvest source will NOT be deleted!!! 
+ + ''' + check_access('harvest_sources_clear', context, data_dict) + + job_history_clear_results = [] + # We assume that the maximum of 1000 (hard limit) rows should be enough + result = logic.get_action('package_search')(context, {'fq': '+type:"harvest"', 'rows': 1000}) + harvest_packages = result['results'] + if harvest_packages: + for data_dict in harvest_packages: + clear_result = get_action('harvest_source_job_history_clear')(context, {'id': data_dict['id']}) + job_history_clear_results.append(clear_result) + + return job_history_clear_results + + +def harvest_source_job_history_clear(context, data_dict): + ''' + Clears all jobs and objects related to a harvest source, but keeps the source itself. + This is useful to clean history of long running harvest sources to start again fresh. + The datasets imported from the harvest source will NOT be deleted!!! + + :param id: the id of the harvest source to clear + :type id: string + + ''' + check_access('harvest_source_clear', context, data_dict) + + harvest_source_id = data_dict.get('id', None) + + source = HarvestSource.get(harvest_source_id) + if not source: + log.error('Harvest source %s does not exist', harvest_source_id) + raise NotFound('Harvest source %s does not exist' % harvest_source_id) + + harvest_source_id = source.id + + model = context['model'] + + sql = '''begin; + delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); + delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); + delete from harvest_object where harvest_source_id = '{harvest_source_id}'; + delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}'); + delete from harvest_job where source_id = '{harvest_source_id}'; + commit; + '''.format(harvest_source_id=harvest_source_id) + + 
model.Session.execute(sql) + + # Refresh the index for this source to update the status object + get_action('harvest_source_reindex')(context, {'id': harvest_source_id}) + + return {'id': harvest_source_id} + + def harvest_source_index_clear(context, data_dict): ''' Clears all datasets, jobs and objects related to a harvest source, but diff --git a/ckanext/harvest/logic/auth/update.py b/ckanext/harvest/logic/auth/update.py index 2bd70b9..3a4a75e 100644 --- a/ckanext/harvest/logic/auth/update.py +++ b/ckanext/harvest/logic/auth/update.py @@ -27,6 +27,17 @@ def harvest_source_update(context, data_dict): return {'success': False, 'msg': pt._('User {0} not authorized to update harvest source {1}').format(user, source_id)} +def harvest_sources_clear(context, data_dict): + ''' + Authorization check for clearing history for all harvest sources + + Only sysadmins can do it + ''' + if not user_is_sysadmin(context): + return {'success': False, 'msg': pt._('Only sysadmins can clear history for all harvest jobs')} + else: + return {'success': True} + def harvest_source_clear(context, data_dict): ''' Authorization check for clearing a harvest source From af0e1712b9b9e935786f3fff4dc0d9739ac4abfa Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Fri, 11 Nov 2016 18:11:28 +0100 Subject: [PATCH 02/13] Changed filter query for reading harvest sources Changed filter query for reading harvest sources in according to the code in /ckanext/harvest/plugin.py. 
--- ckanext/harvest/logic/action/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 74e58f5..ed61ebf 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -237,7 +237,7 @@ def harvest_sources_job_history_clear(context, data_dict): job_history_clear_results = [] # We assume that the maximum of 1000 (hard limit) rows should be enough - result = logic.get_action('package_search')(context, {'fq': '+type:"harvest"', 'rows': 1000}) + result = logic.get_action('package_search')(context, {'fq': '+dataset_type:harvest', 'rows': 1000}) harvest_packages = result['results'] if harvest_packages: for data_dict in harvest_packages: From 1acab98026ffb2071956c4c1641bf2486e409623 Mon Sep 17 00:00:00 2001 From: rnoerenberg Date: Tue, 15 Nov 2016 15:37:26 +0100 Subject: [PATCH 03/13] Added tests for clearsource history command --- ckanext/harvest/tests/test_action.py | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 79e91ca..f42087f 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -371,6 +371,66 @@ class TestActions(ActionBase): assert_equal(harvest_model.HarvestJob.get(job.id), None) assert_equal(harvest_model.HarvestObject.get(object_.id), None) assert_equal(model.Package.get(dataset['id']), None) + + def test_harvest_source_job_history_clear(self): + # prepare + source = factories.HarvestSourceObj(**SOURCE_DICT) + job = factories.HarvestJobObj(source=source) + dataset = ckan_factories.Dataset() + object_ = factories.HarvestObjectObj(job=job, source=source, + package_id=dataset['id']) + + # execute + context = {'model': model, 'session': model.Session, + 'ignore_auth': True, 'user': ''} + result = toolkit.get_action('harvest_source_job_history_clear')( + context, {'id': 
source.id}) + + # verify + assert_equal(result, {'id': source.id}) + source = harvest_model.HarvestSource.get(source.id) + assert source + assert_equal(harvest_model.HarvestJob.get(job.id), None) + assert_equal(harvest_model.HarvestObject.get(object_.id), None) + dataset_from_db = model.Package.get(dataset['id']) + assert dataset_from_db, 'is None' + assert_equal(dataset_from_db['id'], dataset['id']) + + def test_harvest_sources_job_history_clear(self): + # prepare + source_1 = factories.HarvestSourceObj(**SOURCE_DICT) + job_1 = factories.HarvestJobObj(source=source_1) + dataset_1 = ckan_factories.Dataset() + object_1_ = factories.HarvestObjectObj(job=job_1, source=source_1, + package_id=dataset_1['id']) + source_2 = factories.HarvestSourceObj(**SOURCE_DICT) + job_2 = factories.HarvestJobObj(source=source_2) + dataset_2 = ckan_factories.Dataset() + object_2_ = factories.HarvestObjectObj(job=job_2, source=source_2, + package_id=dataset_2['id']) + + # execute + context = {'model': model, 'session': model.Session, + 'ignore_auth': True, 'user': ''} + result = toolkit.get_action('harvest_sources_job_history_clear')( + context, {}) + + # verify + assert_equal(result, [{'id': source_1.id}, {'id': source_2.id}]) + source_1 = harvest_model.HarvestSource.get(source_1.id) + assert source_1 + assert_equal(harvest_model.HarvestJob.get(job_1.id), None) + assert_equal(harvest_model.HarvestObject.get(object_1_.id), None) + dataset_from_db_1 = model.Package.get(dataset_1['id']) + assert dataset_from_db_1, 'is None' + assert_equal(dataset_from_db_1['id'], dataset_1['id']) + source_2 = harvest_model.HarvestSource.get(source_1.id) + assert source_2 + assert_equal(harvest_model.HarvestJob.get(job_2.id), None) + assert_equal(harvest_model.HarvestObject.get(object_2_.id), None) + dataset_from_db_1 = model.Package.get(dataset_2['id']) + assert dataset_from_db_2, 'is None' + assert_equal(dataset_from_db_2['id'], dataset_2['id']) def test_harvest_source_create_twice_with_unique_url(self): 
# don't use factory because it looks for the existing source From cf1cfcca4890423cd873926ef06c6e97341abdfa Mon Sep 17 00:00:00 2001 From: rnoerenberg Date: Tue, 15 Nov 2016 15:50:03 +0100 Subject: [PATCH 04/13] Fixed using property of object --- ckanext/harvest/tests/test_action.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index f42087f..bc351f1 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -394,7 +394,7 @@ class TestActions(ActionBase): assert_equal(harvest_model.HarvestObject.get(object_.id), None) dataset_from_db = model.Package.get(dataset['id']) assert dataset_from_db, 'is None' - assert_equal(dataset_from_db['id'], dataset['id']) + assert_equal(dataset_from_db.id, dataset['id']) def test_harvest_sources_job_history_clear(self): # prepare @@ -430,7 +430,7 @@ class TestActions(ActionBase): assert_equal(harvest_model.HarvestObject.get(object_2_.id), None) dataset_from_db_1 = model.Package.get(dataset_2['id']) assert dataset_from_db_2, 'is None' - assert_equal(dataset_from_db_2['id'], dataset_2['id']) + assert_equal(dataset_from_db_2.id, dataset_2['id']) def test_harvest_source_create_twice_with_unique_url(self): # don't use factory because it looks for the existing source From 8d5ff4b4ef5dda39c85e6ee945940ef372dbb086 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Tue, 15 Nov 2016 21:09:42 +0100 Subject: [PATCH 05/13] Fixed harvest_sources_job_history_clear test Fixed harvest_sources_job_history_clear test by creating different harvest sources. 
--- ckanext/harvest/tests/test_action.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index bc351f1..8a7cc58 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -398,12 +398,15 @@ class TestActions(ActionBase): def test_harvest_sources_job_history_clear(self): # prepare - source_1 = factories.HarvestSourceObj(**SOURCE_DICT) + data_dict = SOURCE_DICT + source_1 = factories.HarvestSourceObj(data_dict) job_1 = factories.HarvestJobObj(source=source_1) dataset_1 = ckan_factories.Dataset() object_1_ = factories.HarvestObjectObj(job=job_1, source=source_1, package_id=dataset_1['id']) - source_2 = factories.HarvestSourceObj(**SOURCE_DICT) + data_dict['name'] = 'another-source1' + data_dict['url'] = 'http://another-url' + source_2 = factories.HarvestSourceObj(data_dict) job_2 = factories.HarvestJobObj(source=source_2) dataset_2 = ckan_factories.Dataset() object_2_ = factories.HarvestObjectObj(job=job_2, source=source_2, @@ -423,7 +426,7 @@ class TestActions(ActionBase): assert_equal(harvest_model.HarvestObject.get(object_1_.id), None) dataset_from_db_1 = model.Package.get(dataset_1['id']) assert dataset_from_db_1, 'is None' - assert_equal(dataset_from_db_1['id'], dataset_1['id']) + assert_equal(dataset_from_db_1.id, dataset_1['id']) source_2 = harvest_model.HarvestSource.get(source_1.id) assert source_2 assert_equal(harvest_model.HarvestJob.get(job_2.id), None) @@ -598,4 +601,4 @@ class TestHarvestDBLog(unittest.TestCase): per_page = 1 data = toolkit.get_action('harvest_log_list')(context, {'level': 'info', 'per_page': per_page}) self.assertEqual(len(data), per_page) - self.assertEqual(data[0]['level'], 'INFO') \ No newline at end of file + self.assertEqual(data[0]['level'], 'INFO') From 096e746c817e7c87cc85f660008dc69293b07c80 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Tue, 15 Nov 2016 21:23:20 +0100 Subject: 
[PATCH 06/13] Fixed HarvestSourceObj argument --- ckanext/harvest/tests/test_action.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 8a7cc58..6b69a34 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -399,14 +399,14 @@ class TestActions(ActionBase): def test_harvest_sources_job_history_clear(self): # prepare data_dict = SOURCE_DICT - source_1 = factories.HarvestSourceObj(data_dict) + source_1 = factories.HarvestSourceObj(**data_dict) job_1 = factories.HarvestJobObj(source=source_1) dataset_1 = ckan_factories.Dataset() object_1_ = factories.HarvestObjectObj(job=job_1, source=source_1, package_id=dataset_1['id']) data_dict['name'] = 'another-source1' data_dict['url'] = 'http://another-url' - source_2 = factories.HarvestSourceObj(data_dict) + source_2 = factories.HarvestSourceObj(**data_dict) job_2 = factories.HarvestJobObj(source=source_2) dataset_2 = ckan_factories.Dataset() object_2_ = factories.HarvestObjectObj(job=job_2, source=source_2, From d01a86680e1f054c058575ad82d36b3f757dea34 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Tue, 15 Nov 2016 21:56:57 +0100 Subject: [PATCH 07/13] Fix creating different harvest sources Fix creating different harvest sources. Different harvest sources can't be created with factory. 
--- ckanext/harvest/tests/test_action.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 6b69a34..52e83be 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -398,15 +398,25 @@ class TestActions(ActionBase): def test_harvest_sources_job_history_clear(self): # prepare + # don't use factory because it looks for the existing source data_dict = SOURCE_DICT - source_1 = factories.HarvestSourceObj(**data_dict) + site_user = toolkit.get_action('get_site_user')( + {'model': model, 'ignore_auth': True}, {})['name'] + + source_1_dict = toolkit.get_action('harvest_source_create')( + {'user': site_user}, data_dict) + source_1 = harvest_model.HarvestSource.get(source_1_dict['id']) + + data_dict['name'] = 'another-source1' + data_dict['url'] = 'http://another-url' + source_2_dict = toolkit.get_action('harvest_source_create')( + {'user': site_user}, data_dict) + source_2 = harvest_model.HarvestSource.get(source_2_dict['id']) + job_1 = factories.HarvestJobObj(source=source_1) dataset_1 = ckan_factories.Dataset() object_1_ = factories.HarvestObjectObj(job=job_1, source=source_1, package_id=dataset_1['id']) - data_dict['name'] = 'another-source1' - data_dict['url'] = 'http://another-url' - source_2 = factories.HarvestSourceObj(**data_dict) job_2 = factories.HarvestJobObj(source=source_2) dataset_2 = ckan_factories.Dataset() object_2_ = factories.HarvestObjectObj(job=job_2, source=source_2, From f68bf323f01b542a7de215b075c15c34a4011ef5 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Tue, 15 Nov 2016 22:28:37 +0100 Subject: [PATCH 08/13] Using test class wide unique harvest source url Using test class wide unique harvest source url, because in a test created objects are still present in following tests. 
--- ckanext/harvest/tests/test_action.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 52e83be..9e23fe5 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -400,6 +400,8 @@ class TestActions(ActionBase): # prepare # don't use factory because it looks for the existing source data_dict = SOURCE_DICT + data_dict['name'] = 'job-history-clear-source' + data_dict['url'] = 'http://job-history-clear-url' site_user = toolkit.get_action('get_site_user')( {'model': model, 'ignore_auth': True}, {})['name'] @@ -407,8 +409,8 @@ class TestActions(ActionBase): {'user': site_user}, data_dict) source_1 = harvest_model.HarvestSource.get(source_1_dict['id']) - data_dict['name'] = 'another-source1' - data_dict['url'] = 'http://another-url' + data_dict['name'] = 'another-job-history-clear-source' + data_dict['url'] = 'http://another-job-history-clear-url' source_2_dict = toolkit.get_action('harvest_source_create')( {'user': site_user}, data_dict) source_2 = harvest_model.HarvestSource.get(source_2_dict['id']) From 95d0c1ca419c483cf06821203b1d319dd03c7319 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Tue, 15 Nov 2016 23:36:11 +0100 Subject: [PATCH 09/13] Ignoring not existent harvest sources in harvest_sources_job_history_clear Ignoring not existent harvest sources harvest_sources_job_history_clear because of a possibly corrupt search index. 
--- ckanext/harvest/logic/action/update.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index ed61ebf..7728fec 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -241,8 +241,13 @@ def harvest_sources_job_history_clear(context, data_dict): harvest_packages = result['results'] if harvest_packages: for data_dict in harvest_packages: - clear_result = get_action('harvest_source_job_history_clear')(context, {'id': data_dict['id']}) - job_history_clear_results.append(clear_result) + try: + clear_result = get_action('harvest_source_job_history_clear')(context, {'id': data_dict['id']}) + job_history_clear_results.append(clear_result) + except NotFound: + # Ignoring non-existent harvest sources because of a possibly corrupt search index + # Logging was already done in called function + pass return job_history_clear_results From d511663038fa6a3e3c0cebbe5c4c069c0be70511 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Wed, 16 Nov 2016 00:25:19 +0100 Subject: [PATCH 10/13] Sort lists for assert --- ckanext/harvest/tests/test_action.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 9e23fe5..5503c6c 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -431,7 +431,9 @@ class TestActions(ActionBase): context, {}) # verify - assert_equal(result, [{'id': source_1.id}, {'id': source_2.id}]) + assert_equal( + sorted(result), + sorted([{'id': source_1.id}, {'id': source_2.id}])) source_1 = harvest_model.HarvestSource.get(source_1.id) assert source_1 assert_equal(harvest_model.HarvestJob.get(job_1.id), None) From 7f76f60ec30384a4ce38a2cca97f7cab0568a351 Mon Sep 17 00:00:00 2001 From: seitenbau-govdata Date: Wed, 16 Nov 2016 00:34:07 +0100 Subject: [PATCH 11/13] Fixed variable name 
--- ckanext/harvest/tests/test_action.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 5503c6c..7a73e07 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -445,7 +445,7 @@ class TestActions(ActionBase): assert source_2 assert_equal(harvest_model.HarvestJob.get(job_2.id), None) assert_equal(harvest_model.HarvestObject.get(object_2_.id), None) - dataset_from_db_1 = model.Package.get(dataset_2['id']) + dataset_from_db_2 = model.Package.get(dataset_2['id']) assert dataset_from_db_2, 'is None' assert_equal(dataset_from_db_2.id, dataset_2['id']) From e7c03855caf23fdfaa4dc817a29fba35bd0efdb0 Mon Sep 17 00:00:00 2001 From: David Read Date: Wed, 16 Nov 2016 11:40:36 +0000 Subject: [PATCH 12/13] Avoid the "# don't use factory because it looks for the existing source" by copying the SOURCE_DICT each time, rather than letting tests edit the master copy. --- ckanext/harvest/tests/test_action.py | 63 ++++++++++------------------ 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 7a73e07..9cc407d 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -354,7 +354,7 @@ class TestHarvestSourceActionPatch(HarvestSourceFixtureMixin, class TestActions(ActionBase): def test_harvest_source_clear(self): - source = factories.HarvestSourceObj(**SOURCE_DICT) + source = factories.HarvestSourceObj(**SOURCE_DICT.copy()) job = factories.HarvestJobObj(source=source) dataset = ckan_factories.Dataset() object_ = factories.HarvestObjectObj(job=job, source=source, @@ -371,10 +371,10 @@ class TestActions(ActionBase): assert_equal(harvest_model.HarvestJob.get(job.id), None) assert_equal(harvest_model.HarvestObject.get(object_.id), None) assert_equal(model.Package.get(dataset['id']), None) - + def 
test_harvest_source_job_history_clear(self): # prepare - source = factories.HarvestSourceObj(**SOURCE_DICT) + source = factories.HarvestSourceObj(**SOURCE_DICT.copy()) job = factories.HarvestJobObj(source=source) dataset = ckan_factories.Dataset() object_ = factories.HarvestObjectObj(job=job, source=source, @@ -395,26 +395,15 @@ class TestActions(ActionBase): dataset_from_db = model.Package.get(dataset['id']) assert dataset_from_db, 'is None' assert_equal(dataset_from_db.id, dataset['id']) - + def test_harvest_sources_job_history_clear(self): # prepare - # don't use factory because it looks for the existing source - data_dict = SOURCE_DICT - data_dict['name'] = 'job-history-clear-source' - data_dict['url'] = 'http://job-history-clear-url' - site_user = toolkit.get_action('get_site_user')( - {'model': model, 'ignore_auth': True}, {})['name'] + data_dict = SOURCE_DICT.copy() + source_1 = factories.HarvestSourceObj(**data_dict) + data_dict['name'] = 'another-source' + data_dict['url'] = 'http://another-url' + source_2 = factories.HarvestSourceObj(**data_dict) - source_1_dict = toolkit.get_action('harvest_source_create')( - {'user': site_user}, data_dict) - source_1 = harvest_model.HarvestSource.get(source_1_dict['id']) - - data_dict['name'] = 'another-job-history-clear-source' - data_dict['url'] = 'http://another-job-history-clear-url' - source_2_dict = toolkit.get_action('harvest_source_create')( - {'user': site_user}, data_dict) - source_2 = harvest_model.HarvestSource.get(source_2_dict['id']) - job_1 = factories.HarvestJobObj(source=source_1) dataset_1 = ckan_factories.Dataset() object_1_ = factories.HarvestObjectObj(job=job_1, source=source_1, @@ -450,49 +439,39 @@ class TestActions(ActionBase): assert_equal(dataset_from_db_2.id, dataset_2['id']) def test_harvest_source_create_twice_with_unique_url(self): - # don't use factory because it looks for the existing source - data_dict = SOURCE_DICT + data_dict = SOURCE_DICT.copy() + 
factories.HarvestSourceObj(**data_dict) site_user = toolkit.get_action('get_site_user')( {'model': model, 'ignore_auth': True}, {})['name'] - - toolkit.get_action('harvest_source_create')( - {'user': site_user}, data_dict) - - data_dict['name'] = 'another-source1' + data_dict['name'] = 'another-source' data_dict['url'] = 'http://another-url' toolkit.get_action('harvest_source_create')( {'user': site_user}, data_dict) def test_harvest_source_create_twice_with_same_url(self): - # don't use factory because it looks for the existing source - data_dict = SOURCE_DICT + data_dict = SOURCE_DICT.copy() + factories.HarvestSourceObj(**data_dict) + site_user = toolkit.get_action('get_site_user')( {'model': model, 'ignore_auth': True}, {})['name'] - - toolkit.get_action('harvest_source_create')( - {'user': site_user}, data_dict) - - data_dict['name'] = 'another-source2' + data_dict['name'] = 'another-source' assert_raises(toolkit.ValidationError, toolkit.get_action('harvest_source_create'), {'user': site_user}, data_dict) def test_harvest_source_create_twice_with_unique_url_and_config(self): - # don't use factory because it looks for the existing source - data_dict = SOURCE_DICT + data_dict = SOURCE_DICT.copy() + factories.HarvestSourceObj(**data_dict) + site_user = toolkit.get_action('get_site_user')( {'model': model, 'ignore_auth': True}, {})['name'] - - toolkit.get_action('harvest_source_create')( - {'user': site_user}, data_dict) - - data_dict['name'] = 'another-source3' + data_dict['name'] = 'another-source' data_dict['config'] = '{"something": "new"}' toolkit.get_action('harvest_source_create')( {'user': site_user}, data_dict) def test_harvest_job_create_as_sysadmin(self): - source = factories.HarvestSource(**SOURCE_DICT) + source = factories.HarvestSource(**SOURCE_DICT.copy()) site_user = toolkit.get_action('get_site_user')( {'model': model, 'ignore_auth': True}, {})['name'] From ff1b861f1b39637347489a563e147c95fba5a604 Mon Sep 17 00:00:00 2001 From: rnoerenberg Date: 
Wed, 16 Nov 2016 16:02:12 +0100 Subject: [PATCH 13/13] Update documentation Added note with the limit of 1000 harvest sources --- README.rst | 3 ++- ckanext/harvest/commands/harvester.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 2c08da0..fe47bec 100644 --- a/README.rst +++ b/README.rst @@ -190,7 +190,8 @@ The following operations can be run from the command line as described underneat but keeps the source itself harvester clearsource_history [{source-id}] - - If no source id is given the history for all harvest sources will be cleared. + - If no source id is given the history for all harvest sources (maximum is 1000) + will be cleared. Clears all jobs and objects related to a harvest source, but keeps the source itself. The datasets imported from the harvest source will **NOT** be deleted!!! If a source id is given, it only clears the history of the harvest source with diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index e125439..5c515b9 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -30,7 +30,7 @@ class Harvester(CkanCommand): but keeps the source itself harvester clearsource_history [{source-id}] - - If no source id is given the history for all harvest sources will be cleared. + - If no source id is given the history for all harvest sources (maximum is 1000) will be cleared. Clears all jobs and objects related to a harvest source, but keeps the source itself. The datasets imported from the harvest source will NOT be deleted!!! If a source id is given, it only clears the history of the harvest source with the given source id.