Published on

Postgres SCD1 and SCD2 strategy comparison

In this experimentation, we will demonstrate that using one CTE which differentiate both update and insert improve significantly the merge processus for both SCD1 and SCD2

Experiment tables

drop table if exists target;
drop table if exists tmp;
create table target(id int, cd text, hash int, end_datetime timestamp);
create table tmp(id int, cd text, hash int);
-- TARGET
insert into target(id,cd,hash) SELECT g.* as id, 'jim' as cd, g.* as hash FROM generate_series(1,10000000) g;

-- TMP
insert into tmp(id,cd,hash) SELECT g.* as id, 'jim' as cd, g.* as hash FROM generate_series(4000000,5000000) g;
insert into tmp(id,cd,hash) SELECT g.* as id, 'john' as cd, g.* as hash FROM generate_series(4000000,5000000) g;
insert into tmp(id,cd,hash) SELECT g.* as id, 'john' as cd, g.* as hash FROM generate_series(14000000,15000000) g;

Atomic implementation scd1

BEGIN;
LOCK TABLE target IN EXCLUSIVE MODE;
explain analyse 
with fj as (
   select 
   tmp.*
   , case when targ.id is null then false else true end as upd
   from target targ
   full outer join tmp on (targ.id = tmp.id)
   where targ.id is null or targ.hash != tmp.hash
),
upd as (
update target targ set id=fj.id, cd=fj.cd, hash=fj.hash 
from fj 
where fj.id = targ.id 
and fj.upd is true)
insert into target select id, cd, hash from fj where fj.upd is false;
ROLLBACK;
--                                                                   QUERY PLAN                                                                  
-- ----------------------------------------------------------------------------------------------------------------------------------------------
--  Insert on target  (cost=1139239.61..1364236.44 rows=49999 width=48) (actual time=6858.761..6858.761 rows=0 loops=1)
--    CTE fj
--      ->  Hash Full Join  (cost=318113.85..515190.96 rows=9999859 width=44) (actual time=3340.633..5629.141 rows=1000001 loops=1)
--            Hash Cond: (tmp.id = targ.id)
--            Filter: ((targ.id IS NULL) OR (targ.hash <> tmp.hash))
--            Rows Removed by Filter: 11000001
--            ->  Seq Scan on tmp  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.007..227.686 rows=3000003 loops=1)
--            ->  Hash  (cost=154053.60..154053.60 rows=9999860 width=8) (actual time=2919.111..2919.111 rows=10000000 loops=1)
--                  Buckets: 131072  Batches: 256  Memory Usage: 2553kB
--                  ->  Seq Scan on target targ  (cost=0.00..154053.60 rows=9999860 width=8) (actual time=0.042..1421.153 rows=10000000 loops=1)
--    CTE upd
--      ->  Update on target targ_1  (cost=337644.85..624048.65 rows=49999 width=118) (actual time=109.975..109.975 rows=0 loops=1)
--            ->  Hash Join  (cost=337644.85..624048.65 rows=49999 width=118) (actual time=109.974..109.974 rows=0 loops=1)
--                  Hash Cond: (fj_1.id = targ_1.id)
--                  ->  CTE Scan on fj fj_1  (cost=0.00..224996.83 rows=49999 width=104) (actual time=109.973..109.973 rows=0 loops=1)
--                        Filter: (type = 'update'::text)
--                        Rows Removed by Filter: 1000001
--                  ->  Hash  (cost=154053.60..154053.60 rows=9999860 width=18) (never executed)
--                        ->  Seq Scan on target targ_1  (cost=0.00..154053.60 rows=9999860 width=18) (never executed)
--    ->  CTE Scan on fj  (cost=0.00..224996.83 rows=49999 width=48) (actual time=3340.639..6016.017 rows=1000001 loops=1)
--          Filter: (type = 'insert'::text)
--  Planning time: 0.518 ms
--  Execution time: 6974.737 ms
BEGIN;
explain analyse 
with u as (
UPDATE target
SET id=tmp.id, cd=tmp.cd, hash=tmp.hash
FROM tmp
WHERE tmp.id = target.id
)
INSERT INTO target
SELECT tmp.*
FROM tmp
LEFT OUTER JOIN target ON (target.id = tmp.id)
WHERE target.id IS NULL;
ROLLBACK;
--                                                                         QUERY PLAN                                                                        
-- ----------------------------------------------------------------------------------------------------------------------------------------------------------
--  Insert on target  (cost=945906.04..1113139.12 rows=1 width=20) (actual time=5206.659..5206.659 rows=0 loops=1)
--    CTE u
--      ->  Update on target target_2  (cost=371406.47..595983.57 rows=3000003 width=32) (actual time=8586.434..8586.434 rows=0 loops=1)
--            ->  Hash Join  (cost=371406.47..595983.57 rows=3000003 width=32) (actual time=2899.041..5459.650 rows=2000002 loops=1)
--                  Hash Cond: (tmp_1.id = target_2.id)
--                  ->  Seq Scan on tmp tmp_1  (cost=0.00..46217.03 rows=3000003 width=18) (actual time=0.024..403.061 rows=3000003 loops=1)
--                  ->  Hash  (cost=169457.54..169457.54 rows=10999754 width=18) (actual time=2896.588..2896.588 rows=10000000 loops=1)
--                        Buckets: 65536  Batches: 256  Memory Usage: 2195kB
--                        ->  Seq Scan on target target_2  (cost=0.00..169457.54 rows=10999754 width=18) (actual time=0.012..1319.621 rows=10000000 loops=1)
--    ->  Hash Anti Join  (cost=349922.47..517155.54 rows=1 width=20) (actual time=3003.185..4367.119 rows=1000001 loops=1)
--          Hash Cond: (tmp.id = target_1.id)
--          ->  Seq Scan on tmp  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.027..220.187 rows=3000003 loops=1)
--          ->  Hash  (cost=169457.54..169457.54 rows=10999754 width=4) (actual time=2591.285..2591.285 rows=10000000 loops=1)
--                Buckets: 131072  Batches: 256  Memory Usage: 2400kB
--                ->  Seq Scan on target target_1  (cost=0.00..169457.54 rows=10999754 width=4) (actual time=0.035..1162.839 rows=10000000 loops=1)
--  Planning time: 0.206 ms
--  Execution time: 13793.154 ms
BEGIN;
explain analyse
with u as (
UPDATE target
SET id=tmp.id, cd=tmp.cd, hash=tmp.hash
FROM tmp
WHERE tmp.id = target.id
returning target.id
),
i as (
select tmp.*
from tmp
left join u using (id)
where u.id is null
)
INSERT INTO target
SELECT i.*
FROM i
LEFT OUTER JOIN target ON (target.id = i.id)
WHERE target.id IS NULL;
ROLLBACK;
--                                                                         QUERY PLAN                                                                        
-- ----------------------------------------------------------------------------------------------------------------------------------------------------------
--  Insert on target  (cost=2111532.64..2235532.95 rows=750001 width=48) (actual time=17158.676..17158.676 rows=0 loops=1)
--    CTE u
--      ->  Update on target target_2  (cost=438935.84..682731.95 rows=3000003 width=32) (actual time=2858.907..8387.049 rows=1000001 loops=1)
--            ->  Hash Join  (cost=438935.84..682731.95 rows=3000003 width=32) (actual time=2858.892..5487.211 rows=2000002 loops=1)
--                  Hash Cond: (tmp.id = target_2.id)
--                  ->  Seq Scan on tmp  (cost=0.00..46217.03 rows=3000003 width=18) (actual time=0.026..376.514 rows=3000003 loops=1)
--                  ->  Hash  (cost=200268.26..200268.26 rows=12999726 width=18) (actual time=2856.471..2856.471 rows=10000000 loops=1)
--                        Buckets: 65536  Batches: 256  Memory Usage: 2195kB
--                        ->  Seq Scan on target target_2  (cost=0.00..200268.26 rows=12999726 width=18) (actual time=0.014..1352.460 rows=10000000 loops=1)
--    CTE i
--      ->  Merge Anti Join  (cost=936289.71..1015254.85 rows=1500002 width=12) (actual time=10701.594..10942.156 rows=1000001 loops=1)
--            Merge Cond: (tmp_1.id = u.id)
--            ->  Sort  (cost=471508.34..479008.35 rows=3000003 width=12) (actual time=1143.032..1468.744 rows=3000003 loops=1)
--                  Sort Key: tmp_1.id
--                  Sort Method: external merge  Disk: 72464kB
--                  ->  Seq Scan on tmp tmp_1  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.075..266.799 rows=3000003 loops=1)
--            ->  Sort  (cost=464781.37..472281.38 rows=3000003 width=4) (actual time=8993.435..9113.745 rows=1000001 loops=1)
--                  Sort Key: u.id
--                  Sort Method: external merge  Disk: 13800kB
--                  ->  CTE Scan on u  (cost=0.00..60000.06 rows=3000003 width=4) (actual time=2858.909..8670.080 rows=1000001 loops=1)
--    ->  Hash Anti Join  (cost=413545.84..537546.15 rows=750001 width=48) (actual time=13088.538..15120.417 rows=1000001 loops=1)
--          Hash Cond: (i.id = target_1.id)
--          ->  CTE Scan on i  (cost=0.00..30000.04 rows=1500002 width=40) (actual time=10701.598..11209.117 rows=1000001 loops=1)
--          ->  Hash  (cost=200268.26..200268.26 rows=12999726 width=4) (actual time=2386.139..2386.139 rows=10000000 loops=1)
--                Buckets: 131072  Batches: 256  Memory Usage: 2400kB
--                ->  Seq Scan on target target_1  (cost=0.00..200268.26 rows=12999726 width=4) (actual time=0.025..1091.998 rows=10000000 loops=1)
--  Planning time: 0.474 ms
--  Execution time: 17204.783 ms
BEGIN;
explain ANALYSE with u as (
INSERT INTO target
SELECT tmp.*
FROM tmp
LEFT OUTER JOIN target ON (target.id = tmp.id)
WHERE target.id IS NULL
)
UPDATE target
SET id=tmp.id, cd=tmp.cd, hash=tmp.hash
FROM tmp
WHERE tmp.id = target.id;
ROLLBACK;
--                                                                        QUERY PLAN                                                                        
-- ---------------------------------------------------------------------------------------------------------------------------------------------------------
--  Update on target  (cost=1181490.50..1444504.63 rows=3000003 width=32) (actual time=8281.352..8281.352 rows=0 loops=1)
--    CTE u
--      ->  Insert on target target_1  (cost=477168.21..675026.30 rows=1 width=20) (actual time=5430.576..5430.576 rows=0 loops=1)
--            ->  Hash Anti Join  (cost=477168.21..675026.30 rows=1 width=20) (actual time=2954.507..4443.851 rows=1000001 loops=1)
--                  Hash Cond: (tmp_1.id = target_2.id)
--                  ->  Seq Scan on tmp tmp_1  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.019..256.940 rows=3000003 loops=1)
--                  ->  Hash  (cost=231078.98..231078.98 rows=14999698 width=4) (actual time=2520.272..2520.272 rows=10000000 loops=1)
--                        Buckets: 131072  Batches: 256  Memory Usage: 2400kB
--                        ->  Seq Scan on target target_2  (cost=0.00..231078.98 rows=14999698 width=4) (actual time=0.014..1175.381 rows=10000000 loops=1)
--    ->  Hash Join  (cost=506464.21..769478.33 rows=3000003 width=32) (actual time=2964.008..5611.788 rows=2000002 loops=1)
--          Hash Cond: (tmp.id = target.id)
--          ->  Seq Scan on tmp  (cost=0.00..46217.03 rows=3000003 width=18) (actual time=0.071..429.298 rows=3000003 loops=1)
--          ->  Hash  (cost=231078.98..231078.98 rows=14999698 width=18) (actual time=2961.366..2961.366 rows=10000000 loops=1)
--                Buckets: 65536  Batches: 256  Memory Usage: 2195kB
--                ->  Seq Scan on target  (cost=0.00..231078.98 rows=14999698 width=18) (actual time=0.034..1409.287 rows=10000000 loops=1)
--  Planning time: 0.424 ms
--  Execution time: 13712.030 ms
\timing
BEGIN;
UPDATE target
SET id=tmp.id, cd=tmp.cd, hash=tmp.hash
FROM tmp
WHERE tmp.id = target.id;
INSERT INTO target
SELECT tmp.*
FROM tmp
LEFT OUTER JOIN target ON (target.id = tmp.id)
WHERE target.id IS NULL;
ROLLBACK;
-- Time: 5329,939 ms (00:05,330)
-- Time: 12554,331 ms (00:12,554)

Atomic implementation scd2

begin;
explain analyse
with i as (
INSERT INTO target
SELECT tmp.*
FROM tmp
LEFT OUTER JOIN target ON (target.id = tmp.id)
WHERE target.id IS NULL
), 
u as (
INSERT INTO target
SELECT tmp.*
FROM tmp
JOIN target ON (target.id = tmp.id)
WHERE target.hash is distinct from tmp.hash
returning target.id
)
UPDATE target
SET end_datetime = now()
FROM u
WHERE u.id = target.id;
rollback;
--                                                                        QUERY PLAN                                                                       
-- --------------------------------------------------------------------------------------------------------------------------------------------------------
--  Update on target  (cost=1560116.39..1800721.53 rows=3000003 width=54) (actual time=6156.396..6156.396 rows=0 loops=1)
--    CTE i
--      ->  Insert on target target_1  (cost=400062.40..552387.47 rows=1 width=20) (actual time=8926.725..8926.725 rows=0 loops=1)
--            ->  Hash Anti Join  (cost=400062.40..552387.47 rows=1 width=20) (actual time=3602.728..5597.313 rows=1000001 loops=1)
--                  Hash Cond: (tmp.id = target_2.id)
--                  ->  Seq Scan on tmp  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.450..300.704 rows=3000003 loops=1)
--                  ->  Hash  (cost=250713.29..250713.29 rows=9103129 width=4) (actual time=3092.523..3092.523 rows=10000000 loops=1)
--                        Buckets: 131072  Batches: 128  Memory Usage: 3781kB
--                        ->  Seq Scan on target target_2  (cost=0.00..250713.29 rows=9103129 width=4) (actual time=0.010..1519.702 rows=10000000 loops=1)
--    CTE u
--      ->  Insert on target target_3  (cost=400062.40..589887.51 rows=3000003 width=20) (actual time=6156.392..6156.393 rows=0 loops=1)
--            ->  Hash Join  (cost=400062.40..589887.51 rows=3000003 width=20) (actual time=6156.391..6156.391 rows=0 loops=1)
--                  Hash Cond: (tmp_1.id = target_4.id)
--                  Join Filter: (target_4.hash IS DISTINCT FROM tmp_1.hash)
--                  Rows Removed by Join Filter: 2000002
--                  ->  Seq Scan on tmp tmp_1  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.013..334.453 rows=3000003 loops=1)
--                  ->  Hash  (cost=250713.29..250713.29 rows=9103129 width=8) (actual time=3359.346..3359.346 rows=10000000 loops=1)
--                        Buckets: 131072 (originally 131072)  Batches: 256 (originally 128)  Memory Usage: 4087kB
--                        ->  Seq Scan on target target_4  (cost=0.00..250713.29 rows=9103129 width=8) (actual time=0.014..1677.709 rows=10000000 loops=1)
--    ->  Hash Join  (cost=417841.40..658446.55 rows=3000003 width=54) (actual time=6156.394..6156.395 rows=0 loops=1)
--          Hash Cond: (u.id = target.id)
--          ->  CTE Scan on u  (cost=0.00..60000.06 rows=3000003 width=32) (actual time=6156.394..6156.394 rows=0 loops=1)
--          ->  Hash  (cost=250713.29..250713.29 rows=9103129 width=18) (never executed)
--                ->  Seq Scan on target  (cost=0.00..250713.29 rows=9103129 width=18) (never executed)
--  Planning time: 0.220 ms
--  Execution time: 15083.304 ms
begin;
explain analyze
with tmp as (
select 
tmp.*
, case when targ.id is null then false else true end as upd
from target targ
full outer join tmp on (targ.id = tmp.id)
where targ.id is null or targ.hash != tmp.hash
 ), 
 u as (
 UPDATE target
 SET end_datetime = now()
 FROM tmp
 where 
 target.id = tmp.id
 AND tmp.upd is true
 )
 INSERT INTO target (id, cd, hash)
 SELECT id, cd, hash
 FROM tmp;
rollback;
--                                                                   QUERY PLAN                                                                  
-- ----------------------------------------------------------------------------------------------------------------------------------------------
--  Insert on target  (cost=1166655991509.07..1166656191396.43 rows=9994368 width=48) (actual time=8666.204..8666.204 rows=0 loops=1)
--    CTE tmp
--      ->  Hash Full Join  (cost=438165.30..635221.42 rows=9994368 width=13) (actual time=4531.084..7278.771 rows=1000001 loops=1)
--            Hash Cond: (tmp_1.id = targ.id)
--            Filter: ((targ.id IS NULL) OR (targ.hash <> tmp_1.hash))
--            Rows Removed by Filter: 11000001
--            ->  Seq Scan on tmp tmp_1  (cost=0.00..46217.03 rows=3000003 width=12) (actual time=0.140..307.256 rows=3000003 loops=1)
--            ->  Hash  (cost=274194.69..274194.69 rows=9994369 width=8) (actual time=3974.719..3974.719 rows=10000000 loops=1)
--                  Buckets: 131072  Batches: 256  Memory Usage: 2553kB
--                  ->  Seq Scan on target targ  (cost=0.00..274194.69 rows=9994369 width=8) (actual time=0.059..2009.929 rows=10000000 loops=1)
--    CTE u
--      ->  Update on target target_1  (cost=0.00..1166655356287.65 rows=49943700856896 width=50) (actual time=98.574..98.574 rows=0 loops=1)
--            ->  Nested Loop  (cost=0.00..1166655356287.65 rows=49943700856896 width=50) (actual time=98.573..98.573 rows=0 loops=1)
--                  ->  CTE Scan on tmp tmp_2  (cost=0.00..199887.36 rows=4997184 width=24) (actual time=98.572..98.572 rows=0 loops=1)
--                        Filter: (upd IS TRUE)
--                        Rows Removed by Filter: 1000001
--                  ->  Materialize  (cost=0.00..382727.54 rows=9994369 width=18) (never executed)
--                        ->  Seq Scan on target target_1  (cost=0.00..274194.69 rows=9994369 width=18) (never executed)
--    ->  CTE Scan on tmp  (cost=0.00..199887.36 rows=9994368 width=48) (actual time=4531.092..7663.395 rows=1000001 loops=1)
--  Planning time: 0.137 ms
--  Execution time: 8769.648 ms
This page was last modified: