added a new working table to import the suggestions
This commit is contained in:
parent
140e14e48d
commit
8e353f7fa3
|
@ -1,6 +1,10 @@
|
|||
CREATE OR REPLACE PROCEDURE import_dedup_events() LANGUAGE plpgsql AS $$
|
||||
BEGIN
|
||||
|
||||
-- MAKE A WORKING COPY OF THE TABLE PREPARED BY THE DEDUP JOB
|
||||
DROP TABLE IF EXISTS tmp_dedup_events_work;
|
||||
CREATE TABLE tmp_dedup_events_work AS SELECT * FROM tmp_dedup_events;
|
||||
|
||||
DELETE FROM oa_conflicts WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested';
|
||||
DELETE FROM oa_duplicates WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested';
|
||||
DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and status = 'suggested';
|
||||
|
@ -10,51 +14,51 @@ DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupW
|
|||
UPDATE organizations SET id = 'pending_org_::'||MD5(id) WHERE status = 'suggested' AND id NOT LIKE 'pending_org_::%';
|
||||
|
||||
-- FIX IMPORT DATA
|
||||
DELETE FROM tmp_dedup_events WHERE oa_original_id = '' OR oa_original_id IS NULL;
|
||||
UPDATE tmp_dedup_events SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL;
|
||||
UPDATE tmp_dedup_events SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL;
|
||||
UPDATE tmp_dedup_events SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL;
|
||||
DELETE FROM tmp_dedup_events WHERE oa_name = '' OR oa_name IS NULL;
|
||||
DELETE FROM tmp_dedup_events_work WHERE oa_original_id = '' OR oa_original_id IS NULL;
|
||||
UPDATE tmp_dedup_events_work SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL;
|
||||
UPDATE tmp_dedup_events_work SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL;
|
||||
UPDATE tmp_dedup_events_work SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL;
|
||||
DELETE FROM tmp_dedup_events_work WHERE oa_name = '' OR oa_name IS NULL;
|
||||
|
||||
-- delete invalid relations (a raw org can not be suggested as duplicate and as new org)
|
||||
DELETE FROM tmp_dedup_events WHERE oa_original_id IN (
|
||||
DELETE FROM tmp_dedup_events_work WHERE oa_original_id IN (
|
||||
SELECT oa_original_id
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
GROUP BY oa_original_id HAVING count(oa_original_id) > 1)
|
||||
AND (local_id = '' OR local_id is NULL OR local_id = oa_original_id)
|
||||
AND (group_id = '' OR group_id is NULL);
|
||||
|
||||
-- delete invalid relations (a raw org can not be suggested to multiple orgs)
|
||||
DELETE FROM tmp_dedup_events WHERE oa_original_id IN (
|
||||
DELETE FROM tmp_dedup_events_work WHERE oa_original_id IN (
|
||||
SELECT oa_original_id
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
GROUP BY oa_original_id HAVING count(oa_original_id) > 1)
|
||||
AND local_id NOT LIKE 'openorgs____::%';
|
||||
|
||||
-- delete invalid groups (only one row)
|
||||
DELETE FROM tmp_dedup_events WHERE group_id IN (
|
||||
DELETE FROM tmp_dedup_events_work WHERE group_id IN (
|
||||
SELECT group_id
|
||||
FROM tmp_dedup_events GROUP BY group_id
|
||||
FROM tmp_dedup_events_work GROUP BY group_id
|
||||
HAVING count(*) = 1
|
||||
);
|
||||
|
||||
-- IMPORT MISSING TERMS
|
||||
INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING;
|
||||
INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING;
|
||||
|
||||
-- NEW ORGANIZATIONS (suggested)
|
||||
INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by)
|
||||
SELECT 'pending_org_::'||MD5(local_id), oa_name, oa_country, 'suggested', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf'
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
INSERT INTO acronyms(id, acronym)
|
||||
SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events
|
||||
SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events_work
|
||||
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_acronym IS NOT NULL AND oa_acronym != ''
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
INSERT INTO urls(id, url)
|
||||
SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events
|
||||
SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events_work
|
||||
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_url IS NOT NULL AND oa_url != ''
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
|
@ -62,7 +66,7 @@ INSERT INTO other_ids(id, otherid, type)
|
|||
SELECT 'pending_org_::'||MD5(local_id), arr[1] AS otherid, arr[2] AS type
|
||||
FROM (
|
||||
SELECT local_id, string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id
|
||||
) as c
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
@ -70,7 +74,7 @@ ON CONFLICT DO NOTHING;
|
|||
-- NEW ORGANIZATIONS (raw)
|
||||
INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by)
|
||||
SELECT oa_original_id, oa_name, oa_country, 'raw', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf'
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
||||
ON CONFLICT(id) DO UPDATE SET
|
||||
(name, country, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, modification_date, modified_by) =
|
||||
|
@ -78,13 +82,13 @@ ON CONFLICT(id) DO UPDATE SET
|
|||
|
||||
INSERT INTO acronyms(id, acronym)
|
||||
SELECT oa_original_id, oa_acronym
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_acronym IS NOT NULL AND oa_acronym != ''
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
INSERT INTO urls(id, url)
|
||||
SELECT oa_original_id, oa_url
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_url IS NOT NULL AND oa_url != ''
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
|
@ -93,7 +97,7 @@ SELECT oa_original_id, arr[1] AS otherid, arr[2] AS type
|
|||
FROM (
|
||||
SELECT oa_original_id,
|
||||
string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
||||
) as c
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
@ -101,23 +105,23 @@ ON CONFLICT DO NOTHING;
|
|||
-- DUPLICATES (relations to openorgs)
|
||||
INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by)
|
||||
SELECT local_id, oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf'
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
-- DUPLICATES (relations to suggested)
|
||||
INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by)
|
||||
SELECT 'pending_org_::'||MD5(local_id), oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf'
|
||||
FROM tmp_dedup_events
|
||||
FROM tmp_dedup_events_work
|
||||
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
-- CONFLICTS (I generate all the couples)
|
||||
CREATE TEMPORARY TABLE tmp_conflict_groups AS
|
||||
SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events
|
||||
SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events_work
|
||||
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != ''
|
||||
UNION
|
||||
SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events
|
||||
SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events_work
|
||||
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != '';
|
||||
|
||||
INSERT INTO oa_conflicts (id1, id2, idgroup, created_by, modified_by) SELECT DISTINCT
|
||||
|
|
Loading…
Reference in New Issue