149 lines
7.7 KiB
PL/PgSQL
149 lines
7.7 KiB
PL/PgSQL
BEGIN;
|
|
|
|
DELETE FROM oa_conflicts WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested';
|
|
DELETE FROM oa_duplicates WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and reltype = 'suggested';
|
|
DELETE FROM organizations WHERE created_by = 'dedupWf' and modified_by = 'dedupWf' and status = 'suggested';
|
|
-- IMPORTANT : DO NOT DELETE THE RAW ORGS TO AVOID THE 'ON CASCADE' DELETIONS
|
|
|
|
-- FIX ORIGINAL DATA
|
|
UPDATE organizations SET id = 'pending_org_::'||MD5(id) WHERE status = 'suggested' AND id NOT LIKE 'pending_org_::%';
|
|
|
|
-- FIX IMPORT DATA
|
|
DELETE FROM tmp_dedup_events WHERE oa_original_id = '' OR oa_original_id IS NULL;
|
|
UPDATE tmp_dedup_events SET local_id = oa_original_id WHERE local_id = '' OR local_id IS NULL;
|
|
UPDATE tmp_dedup_events SET oa_country = 'UNKNOWN' WHERE oa_country = '' OR oa_country IS NULL;
|
|
UPDATE tmp_dedup_events SET oa_name = oa_acronym WHERE oa_name = '' OR oa_name IS NULL;
|
|
DELETE FROM tmp_dedup_events WHERE oa_name = '' OR oa_name IS NULL;
|
|
|
|
-- delete invalid relations (a raw org can not be suggested as duplicate and as new org)
|
|
DELETE FROM tmp_dedup_events WHERE oa_original_id IN (
|
|
SELECT oa_original_id
|
|
FROM tmp_dedup_events
|
|
GROUP BY oa_original_id HAVING count(oa_original_id) > 1)
|
|
AND (local_id = '' OR local_id is NULL);
|
|
|
|
-- delete invalid relations (a raw org can not be suggested to multiple orgs)
|
|
DELETE FROM tmp_dedup_events WHERE oa_original_id IN (
|
|
SELECT oa_original_id
|
|
FROM tmp_dedup_events
|
|
GROUP BY oa_original_id HAVING count(oa_original_id) > 1)
|
|
AND local_id NOT LIKE 'openorgs____::%';
|
|
|
|
-- IMPORT MISSING TERMS
|
|
INSERT INTO id_types(val, name) SELECT distinct arr[2], arr[2] FROM (SELECT string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr FROM tmp_dedup_events WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%') as c ON CONFLICT DO NOTHING;
|
|
|
|
-- NEW ORGANIZATIONS (suggested)
|
|
INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by)
|
|
SELECT 'pending_org_::'||MD5(local_id), oa_name, oa_country, 'suggested', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf'
|
|
FROM tmp_dedup_events
|
|
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
INSERT INTO acronyms(id, acronym)
|
|
SELECT 'pending_org_::'||MD5(local_id), oa_acronym FROM tmp_dedup_events
|
|
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_acronym IS NOT NULL AND oa_acronym != ''
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
INSERT INTO urls(id, url)
|
|
SELECT 'pending_org_::'||MD5(local_id), oa_url FROM tmp_dedup_events
|
|
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id AND oa_url IS NOT NULL AND oa_url != ''
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
INSERT INTO other_ids(id, otherid, type)
|
|
SELECT 'pending_org_::'||MD5(local_id), arr[1] AS otherid, arr[2] AS type
|
|
FROM (
|
|
SELECT local_id, string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr
|
|
FROM tmp_dedup_events
|
|
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND local_id = oa_original_id
|
|
) as c
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
-- NEW ORGANIZATIONS (raw)
|
|
INSERT INTO organizations(id, name, country, status, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, created_by, modified_by)
|
|
SELECT oa_original_id, oa_name, oa_country, 'raw', ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, 'dedupWf', 'dedupWf'
|
|
FROM tmp_dedup_events
|
|
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
|
ON CONFLICT(id) DO UPDATE SET
|
|
(name, country, ec_legalbody, ec_legalperson, ec_nonprofit, ec_researchorganization, ec_highereducation, ec_internationalorganizationeurinterests, ec_internationalorganization, ec_enterprise, ec_smevalidated, ec_nutscode, modification_date, modified_by) =
|
|
(EXCLUDED.name, EXCLUDED.country, EXCLUDED.ec_legalbody, EXCLUDED.ec_legalperson, EXCLUDED.ec_nonprofit, EXCLUDED.ec_researchorganization, EXCLUDED.ec_highereducation, EXCLUDED.ec_internationalorganizationeurinterests, EXCLUDED.ec_internationalorganization, EXCLUDED.ec_enterprise, EXCLUDED.ec_smevalidated, EXCLUDED.ec_nutscode, now(), 'dedupWf');
|
|
|
|
INSERT INTO acronyms(id, acronym)
|
|
SELECT oa_original_id, oa_acronym
|
|
FROM tmp_dedup_events
|
|
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_acronym IS NOT NULL AND oa_acronym != ''
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
INSERT INTO urls(id, url)
|
|
SELECT oa_original_id, oa_url
|
|
FROM tmp_dedup_events
|
|
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_url IS NOT NULL AND oa_url != ''
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
INSERT INTO other_ids(id, otherid, type)
|
|
SELECT oa_original_id, arr[1] AS otherid, arr[2] AS type
|
|
FROM (
|
|
SELECT oa_original_id,
|
|
string_to_array(unnest(string_to_array(pid_list, '@@@')), '###') AS arr
|
|
FROM tmp_dedup_events
|
|
WHERE oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
|
) as c
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
-- DUPLICATES (relations to openorgs)
|
|
INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by)
|
|
SELECT local_id, oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf'
|
|
FROM tmp_dedup_events
|
|
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
-- DUPLICATES (relations to suggested)
|
|
INSERT INTO oa_duplicates (local_id, oa_original_id, oa_collectedfrom, created_by, modified_by)
|
|
SELECT 'pending_org_::'||MD5(local_id), oa_original_id, oa_collectedfrom, 'dedupWf', 'dedupWf'
|
|
FROM tmp_dedup_events
|
|
WHERE local_id NOT LIKE 'openorgs\_\_\_\_::%' AND oa_original_id NOT LIKE 'openorgs\_\_\_\_::%'
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
-- CONFLICTS (I generate all the couples)
|
|
CREATE TEMPORARY TABLE tmp_conflict_groups AS
|
|
SELECT DISTINCT group_id as gid, local_id oid from tmp_dedup_events
|
|
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != ''
|
|
UNION
|
|
SELECT DISTINCT group_id as gid, oa_original_id oid from tmp_dedup_events
|
|
WHERE local_id LIKE 'openorgs\_\_\_\_::%' AND oa_original_id LIKE 'openorgs\_\_\_\_::%' AND local_id != oa_original_id AND group_id IS NOT NULL AND group_id != '';
|
|
|
|
INSERT INTO oa_conflicts (id1, id2, idgroup, created_by, modified_by) SELECT DISTINCT
|
|
c1.oid, c2.oid, c1.gid, 'dedupWf', 'dedupWf'
|
|
FROM
|
|
tmp_conflict_groups c1
|
|
JOIN tmp_conflict_groups c2 ON (c1.gid = c2.gid)
|
|
WHERE
|
|
c1.oid != c2.oid
|
|
ON CONFLICT DO NOTHING;
|
|
|
|
|
|
-- CONSISTENCY (respect the order of the deletions)
|
|
|
|
-- remove the pending organizations that have been recently approved
|
|
DELETE FROM organizations
|
|
WHERE id in (
|
|
SELECT o1.id
|
|
FROM
|
|
oa_duplicates d1
|
|
JOIN organizations o1 ON (o1.id = d1.local_id)
|
|
JOIN oa_duplicates d2 on (d1.oa_original_id = d2.oa_original_id)
|
|
JOIN organizations o2 on (o2.id = d2.local_id)
|
|
WHERE d1.local_id != d2.local_id
|
|
AND o1.status = 'suggested'
|
|
AND o1.created_by = 'dedupWf'
|
|
AND o1.modified_by = 'dedupWf'
|
|
AND d1.reltype = 'suggested'
|
|
AND d2.reltype != 'is_different');
|
|
|
|
|
|
-- Remove invalid suggestions (an existing relation has already been approved)
|
|
DELETE FROM oa_duplicates d
|
|
USING oa_duplicates d1
|
|
WHERE d.oa_original_id = d1.oa_original_id AND d.reltype = 'suggested' AND d1.reltype = 'is_similar';
|
|
|
|
COMMIT;
|