From 8e353f7fa3de37becb436b21ae54f342560409e4 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 11 Jun 2024 13:22:22 +0200 Subject: [PATCH] added a new working table to import the suggestions --- .../main/resources/sql/importDedupEvents.sql | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/apps/dnet-orgs-database-application/src/main/resources/sql/importDedupEvents.sql b/apps/dnet-orgs-database-application/src/main/resources/sql/importDedupEvents.sql index 8e1b1d29..21383f2e 100644 --- a/apps/dnet-orgs-database-application/src/main/resources/sql/importDedupEvents.sql +++ b/apps/dnet-orgs-database-application/src/main/resources/sql/importDedupEvents.sql @@ -1,6 +1,10 @@ CREATE OR REPLACE PROCEDURE import_dedup_events() LANGUAGE plpgsql AS $$ BEGIN +-- MAKE A WORKING COPY OF THE TABLE PREPARED BY THE DEDUP JOB +DROP TABLE IF EXISTS tmp_dedup_events_work; +CREATE TABLE tmp_dedup_events_work AS SELECT * FROM tmp_dedup_events; + DELETE FROM oa_conflicts WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested'; DELETE FROM oa_duplicates WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested'; DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and status = 'suggested'; @@ -10,51 +14,51 @@ DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupW UPDATE organizations SET id = 'pending_org_::'||MD5(id) WHERE status = 'suggested' AND id NOT LIKE 'pending_org_::%'; -- FIX IMPORT DATA -DELETE FROM tmp_dedup_events WHERE oa_original_id = '' OR oa_original_id IS NULL; -UPDATE tmp_dedup_events SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL; -UPDATE tmp_dedup_events SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL; -UPDATE tmp_dedup_events SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL; -DELETE FROM tmp_dedup_events WHERE oa_name = '' OR oa_name IS NULL; +DELETE FROM tmp_dedup_events_work WHERE oa_original_id = '' OR oa_original_id IS NULL; +UPDATE tmp_dedup_events_work SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL; +UPDATE tmp_dedup_events_work SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL; +UPDATE tmp_dedup_events_work SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL; +DELETE FROM tmp_dedup_events_work WHERE oa_name = '' OR oa_name IS NULL; -- delete invalid relations (a raw org can not be suggested as duplicate and as new org) -DELETE FROM tmp_dedup_events WHERE oa_original_id IN ( +DELETE FROM tmp_dedup_events_work WHERE oa_original_id IN ( SELECT oa_original_id - FROM tmp_dedup_events + FROM tmp_dedup_events_work GROUP BY oa_original_id HAVING count(oa_original_id) > 1) AND (local_id = '' OR local_id is NULL OR local_id = oa_original_id) AND (group_id = '' OR group_id is NULL); -- delete invalid relations (a raw org can not be suggested to multiple orgs) -DELETE FROM tmp_dedup_events WHERE oa_original_id IN ( +DELETE FROM tmp_dedup_events_work WHERE oa_original_id IN ( SELECT oa_original_id - FROM tmp_dedup_events + FROM tmp_dedup_events_work GROUP BY oa_original_id HAVING count(oa_original_id) > 1) AND local_id NOT LIKE 'openorgs____::%'; -- delete invalid groups (only one row) -DELETE FROM tmp_dedup_events WHERE group_id IN ( +DELETE FROM tmp_dedup_events_work WHERE group_id IN ( SELECT group_id - FROM tmp_dedup_events GROUP BY group_id + FROM tmp_dedup_events_work GROUP BY group_id HAVING count(*) = 1 ); -- IMPORT MISSING TERMS -INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING; +INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING; -- NEW ORGANIZATIONS (suggested) INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by) SELECT 'pending_org_::'||MD5(local_id), oa_name, oa_country, 'suggested', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf' -FROM tmp_dedup_events +FROM tmp_dedup_events_work WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id ON CONFLICT DO NOTHING; INSERT INTO acronyms(id, acronym) -SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events +SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events_work WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_acronym IS NOT NULL AND oa_acronym != '' ON CONFLICT DO NOTHING; INSERT INTO urls(id, url) -SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events +SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events_work WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_url IS NOT NULL AND oa_url != '' ON CONFLICT DO NOTHING; @@ -62,7 +66,7 @@ INSERT INTO other_ids(id, otherid, type) SELECT 'pending_org_::'||MD5(local_id), arr[1] AS otherid, arr[2] AS type FROM ( SELECT local_id, string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr - FROM tmp_dedup_events + FROM tmp_dedup_events_work WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id ) as c ON CONFLICT DO NOTHING; @@ -70,7 +74,7 @@ ON CONFLICT DO NOTHING; -- NEW ORGANIZATIONS (raw) INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by) SELECT oa_original_id, oa_name, oa_country, 'raw', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf' -FROM tmp_dedup_events +FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' ON CONFLICT(id) DO UPDATE SET (name, country, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, modification_date, modified_by) = @@ -78,13 +82,13 @@ ON CONFLICT(id) DO UPDATE SET INSERT INTO acronyms(id, acronym) SELECT oa_original_id, oa_acronym -FROM tmp_dedup_events +FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_acronym IS NOT NULL AND oa_acronym != '' ON CONFLICT DO NOTHING; INSERT INTO urls(id, url) SELECT oa_original_id, oa_url -FROM tmp_dedup_events +FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_url IS NOT NULL AND oa_url != '' ON CONFLICT DO NOTHING; @@ -93,7 +97,7 @@ SELECT oa_original_id, arr[1] AS otherid, arr[2] AS type FROM ( SELECT oa_original_id, string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr - FROM tmp_dedup_events + FROM tmp_dedup_events_work WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' ) as c ON CONFLICT DO NOTHING; @@ -101,23 +105,23 @@ ON CONFLICT DO NOTHING; -- DUPLICATES (relations to openorgs) INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by) SELECT local_id, oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf' -FROM tmp_dedup_events +FROM tmp_dedup_events_work WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' ON CONFLICT DO NOTHING; -- DUPLICATES (relations to suggested) INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by) SELECT 'pending_org_::'||MD5(local_id), oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf' -FROM tmp_dedup_events +FROM tmp_dedup_events_work WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' ON CONFLICT DO NOTHING; -- CONFLICTS (I generate all the couples) CREATE TEMPORARY TABLE tmp_conflict_groups AS - SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events + SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events_work WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != '' UNION - SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events + SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events_work WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != ''; INSERT INTO oa_conflicts (id1, id2, idgroup, created_by, modified_by) SELECT DISTINCT