added a new working table to import the suggestions

This commit is contained in:
Michele Artini 2024-06-11 13:22:22 +02:00
parent 140e14e48d
commit 8e353f7fa3
1 changed files with 28 additions and 24 deletions

View File

@ -1,6 +1,10 @@
CREATE OR REPLACE PROCEDURE import_dedup_events() LANGUAGE plpgsql AS $$
BEGIN
-- MAKE A WORKING COPY OF THE TABLE PREPARED BY THE DEDUP JOB
DROP TABLE IF EXISTS tmp_dedup_events_work;
CREATE TABLE tmp_dedup_events_work AS SELECT * FROM tmp_dedup_events;
DELETE FROM oa_conflicts WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested';
DELETE FROM oa_duplicates WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested';
DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and status = 'suggested';
@ -10,51 +14,51 @@ DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupW
UPDATE organizations SET id = 'pending_org_::'||MD5(id) WHERE status = 'suggested' AND id NOT LIKE 'pending_org_::%';
-- FIX IMPORT DATA
DELETE FROM tmp_dedup_events WHERE oa_original_id = '' OR oa_original_id IS NULL;
UPDATE tmp_dedup_events SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL;
UPDATE tmp_dedup_events SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL;
UPDATE tmp_dedup_events SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL;
DELETE FROM tmp_dedup_events WHERE oa_name = '' OR oa_name IS NULL;
DELETE FROM tmp_dedup_events_work WHERE oa_original_id = '' OR oa_original_id IS NULL;
UPDATE tmp_dedup_events_work SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL;
UPDATE tmp_dedup_events_work SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL;
UPDATE tmp_dedup_events_work SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL;
DELETE FROM tmp_dedup_events_work WHERE oa_name = '' OR oa_name IS NULL;
-- delete invalid relations (a raw org can not be suggested as duplicate and as new org)
DELETE FROM tmp_dedup_events WHERE oa_original_id IN (
DELETE FROM tmp_dedup_events_work WHERE oa_original_id IN (
SELECT oa_original_id
FROM tmp_dedup_events
FROM tmp_dedup_events_work
GROUP BY oa_original_id HAVING count(oa_original_id) > 1)
AND (local_id = '' OR local_id is NULL OR local_id = oa_original_id)
AND (group_id = '' OR group_id is NULL);
-- delete invalid relations (a raw org can not be suggested to multiple orgs)
DELETE FROM tmp_dedup_events WHERE oa_original_id IN (
DELETE FROM tmp_dedup_events_work WHERE oa_original_id IN (
SELECT oa_original_id
FROM tmp_dedup_events
FROM tmp_dedup_events_work
GROUP BY oa_original_id HAVING count(oa_original_id) > 1)
AND local_id NOT LIKE 'openorgs____::%';
-- delete invalid groups (only one row)
DELETE FROM tmp_dedup_events WHERE group_id IN (
DELETE FROM tmp_dedup_events_work WHERE group_id IN (
SELECT group_id
FROM tmp_dedup_events GROUP BY group_id
FROM tmp_dedup_events_work GROUP BY group_id
HAVING count(*) = 1
);
-- IMPORT MISSING TERMS
INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING;
INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING;
-- NEW ORGANIZATIONS (suggested)
INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by)
SELECT 'pending_org_::'||MD5(local_id), oa_name, oa_country, 'suggested', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf'
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id
ON CONFLICT DO NOTHING;
INSERT INTO acronyms(id, acronym)
SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events
SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events_work
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_acronym IS NOT NULL AND oa_acronym != ''
ON CONFLICT DO NOTHING;
INSERT INTO urls(id, url)
SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events
SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events_work
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_url IS NOT NULL AND oa_url != ''
ON CONFLICT DO NOTHING;
@ -62,7 +66,7 @@ INSERT INTO other_ids(id, otherid, type)
SELECT 'pending_org_::'||MD5(local_id), arr[1] AS otherid, arr[2] AS type
FROM (
SELECT local_id, string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id
) as c
ON CONFLICT DO NOTHING;
@ -70,7 +74,7 @@ ON CONFLICT DO NOTHING;
-- NEW ORGANIZATIONS (raw)
INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by)
SELECT oa_original_id, oa_name, oa_country, 'raw', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf'
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
ON CONFLICT(id) DO UPDATE SET
(name, country, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, modification_date, modified_by) =
@ -78,13 +82,13 @@ ON CONFLICT(id) DO UPDATE SET
INSERT INTO acronyms(id, acronym)
SELECT oa_original_id, oa_acronym
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_acronym IS NOT NULL AND oa_acronym != ''
ON CONFLICT DO NOTHING;
INSERT INTO urls(id, url)
SELECT oa_original_id, oa_url
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_url IS NOT NULL AND oa_url != ''
ON CONFLICT DO NOTHING;
@ -93,7 +97,7 @@ SELECT oa_original_id, arr[1] AS otherid, arr[2] AS type
FROM (
SELECT oa_original_id,
string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
) as c
ON CONFLICT DO NOTHING;
@ -101,23 +105,23 @@ ON CONFLICT DO NOTHING;
-- DUPLICATES (relations to openorgs)
INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by)
SELECT local_id, oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf'
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
ON CONFLICT DO NOTHING;
-- DUPLICATES (relations to suggested)
INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by)
SELECT 'pending_org_::'||MD5(local_id), oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf'
FROM tmp_dedup_events
FROM tmp_dedup_events_work
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
ON CONFLICT DO NOTHING;
-- CONFLICTS (I generate all the couples)
CREATE TEMPORARY TABLE tmp_conflict_groups AS
SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events
SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events_work
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != ''
UNION
SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events
SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events_work
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != '';
INSERT INTO oa_conflicts (id1, id2, idgroup, created_by, modified_by) SELECT DISTINCT