Skip to content

Commit 9f8a050

Browse files
committed
Add minimal tests for recovery conflict handling.
Previously none of our tests triggered recovery conflicts. The test is primarily motivated by needing tests for recovery conflict stats for shared memory based pgstats. But it's also a decent start for recovery conflict handling in general. The only type of recovery conflict not tested yet are rcovery deadlock conflicts. By configuring log_recovery_conflict_waits the test adds some very minimal testing for that path as well. Author: Melanie Plageman <[email protected]> Author: Andres Freund <[email protected]> Discussion: https://postgr.es/m/[email protected]
1 parent 53b9cd2 commit 9f8a050

File tree

1 file changed

+296
-0
lines changed

1 file changed

+296
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
# Copyright (c) 2021-2022, PostgreSQL Global Development Group
2+
3+
# Test that connections to a hot standby are correctly canceled when a
4+
# recovery conflict is detected Also, test that statistics in
5+
# pg_stat_database_conflicts are populated correctly
6+
7+
# TODO: add a test for deadlock recovery conflicts.
8+
9+
use strict;
10+
use warnings;
11+
use PostgreSQL::Test::Cluster;
12+
use PostgreSQL::Test::Utils;
13+
use Test::More;
14+
15+
16+
# Set up nodes
17+
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
18+
$node_primary->init(allows_streaming => 1);
19+
20+
my $tablespace1 = "test_recovery_conflict_tblspc";
21+
22+
$node_primary->append_conf(
23+
'postgresql.conf', qq[
24+
allow_in_place_tablespaces = on
25+
log_temp_files = 0
26+
27+
# wait some to test the wait paths as well, but not long for obvious reasons
28+
max_standby_streaming_delay = 50ms
29+
30+
temp_tablespaces = $tablespace1
31+
# Some of the recovery conflict logging code only gets exercised after
32+
# deadlock_timeout. The test doesn't rely on that additional output, but it's
33+
# nice to get some minimal coverage of that code.
34+
log_recovery_conflict_waits = on
35+
deadlock_timeout = 10ms
36+
]);
37+
$node_primary->start;
38+
39+
my $backup_name = 'my_backup';
40+
41+
$node_primary->safe_psql('postgres',
42+
qq[CREATE TABLESPACE $tablespace1 LOCATION '']);
43+
44+
$node_primary->backup($backup_name);
45+
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
46+
$node_standby->init_from_backup($node_primary, $backup_name,
47+
has_streaming => 1);
48+
49+
$node_standby->start;
50+
51+
my $test_db = "test_db";
52+
53+
# use a new database, to trigger database recovery conflict
54+
$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");
55+
56+
# test schema / data
57+
my $table1 = "test_recovery_conflict_table1";
58+
$node_primary->safe_psql($test_db, qq[CREATE TABLE ${table1}(a int, b int);]);
59+
$node_primary->safe_psql($test_db,
60+
qq[INSERT INTO $table1 SELECT i % 3, 0 FROM generate_series(1,20) i]);
61+
my $primary_lsn = $node_primary->lsn('flush');
62+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
63+
64+
65+
# a longrunning psql that we can use to trigger conflicts
66+
my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default);
67+
my %psql_standby = ('stdin' => '', 'stdout' => '');
68+
$psql_standby{run} =
69+
$node_standby->background_psql($test_db, \$psql_standby{stdin},
70+
\$psql_standby{stdout},
71+
$psql_timeout);
72+
$psql_standby{stdout} = '';
73+
74+
my $expected_conflicts = 0;
75+
76+
77+
## RECOVERY CONFLICT 1: Buffer pin conflict
78+
my $sect = "buffer pin conflict";
79+
$expected_conflicts++;
80+
81+
# Aborted INSERT on primary that will be cleaned up by vacuum. Has to be old
82+
# enough so that there's not a snapshot conflict before the buffer pin
83+
# conflict.
84+
85+
$node_primary->safe_psql(
86+
$test_db,
87+
qq[
88+
BEGIN;
89+
INSERT INTO $table1 VALUES (1,0);
90+
ROLLBACK;
91+
-- ensure flush, rollback doesn't do so
92+
BEGIN; LOCK $table1; COMMIT;
93+
]);
94+
95+
$primary_lsn = $node_primary->lsn('flush');
96+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
97+
98+
my $cursor1 = "test_recovery_conflict_cursor";
99+
100+
# DECLARE and use a cursor on standby, causing buffer with the only block of
101+
# the relation to be pinned on the standby
102+
$psql_standby{stdin} .= qq[
103+
BEGIN;
104+
DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
105+
FETCH FORWARD FROM $cursor1;
106+
];
107+
# FETCH FORWARD should have returned a 0 since all values of b in the table
108+
# are 0
109+
ok(pump_until_standby(qr/^0$/m),
110+
"$sect: cursor with conflicting pin established");
111+
112+
# to check the log starting now for recovery conflict messages
113+
my $log_location = -s $node_standby->logfile;
114+
115+
# VACUUM on the primary
116+
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
117+
118+
# Wait for catchup. Existing connection will be terminated before replay is
119+
# finished, so waiting for catchup ensures that there is no race between
120+
# encountering the recovery conflict which causes the disconnect and checking
121+
# the logfile for the terminated connection.
122+
$primary_lsn = $node_primary->lsn('flush');
123+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
124+
125+
check_conflict_log("User was holding shared buffer pin for too long");
126+
reconnect_and_clear();
127+
check_conflict_stat("bufferpin");
128+
129+
130+
## RECOVERY CONFLICT 2: Snapshot conflict
131+
$sect = "snapshot conflict";
132+
$expected_conflicts++;
133+
134+
$node_primary->safe_psql($test_db,
135+
qq[INSERT INTO $table1 SELECT i, 0 FROM generate_series(1,20) i]);
136+
$primary_lsn = $node_primary->lsn('flush');
137+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
138+
139+
# DECLARE and FETCH from cursor on the standby
140+
$psql_standby{stdin} .= qq[
141+
BEGIN;
142+
DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
143+
FETCH FORWARD FROM $cursor1;
144+
];
145+
ok( pump_until(
146+
$psql_standby{run}, $psql_timeout,
147+
\$psql_standby{stdout}, qr/^0$/m,),
148+
"$sect: cursor with conflicting snapshot established");
149+
150+
# Do some HOT updates
151+
$node_primary->safe_psql($test_db,
152+
qq[UPDATE $table1 SET a = a + 1 WHERE a > 2;]);
153+
154+
# VACUUM, pruning those dead tuples
155+
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
156+
157+
# Wait for attempted replay of PRUNE records
158+
$primary_lsn = $node_primary->lsn('flush');
159+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
160+
161+
check_conflict_log(
162+
"User query might have needed to see row versions that must be removed");
163+
reconnect_and_clear();
164+
check_conflict_stat("snapshot");
165+
166+
167+
## RECOVERY CONFLICT 3: Lock conflict
168+
$sect = "lock conflict";
169+
$expected_conflicts++;
170+
171+
# acquire lock to conflict with
172+
$psql_standby{stdin} .= qq[
173+
BEGIN;
174+
LOCK TABLE $table1 IN ACCESS SHARE MODE;
175+
SELECT 1;
176+
];
177+
ok(pump_until_standby(qr/^1$/m), "$sect: conflicting lock acquired");
178+
179+
# DROP TABLE containing block which standby has in a pinned buffer
180+
$node_primary->safe_psql($test_db, qq[DROP TABLE $table1;]);
181+
182+
$primary_lsn = $node_primary->lsn('flush');
183+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
184+
185+
check_conflict_log("User was holding a relation lock for too long");
186+
reconnect_and_clear();
187+
check_conflict_stat("lock");
188+
189+
190+
## RECOVERY CONFLICT 4: Tablespace conflict
191+
$sect = "tablespace conflict";
192+
$expected_conflicts++;
193+
194+
# DECLARE a cursor for a query which, with sufficiently low work_mem, will
195+
# spill tuples into temp files in the temporary tablespace created during
196+
# setup.
197+
$psql_standby{stdin} .= qq[
198+
BEGIN;
199+
SET work_mem = '64kB';
200+
DECLARE $cursor1 CURSOR FOR
201+
SELECT count(*) FROM generate_series(1,6000);
202+
FETCH FORWARD FROM $cursor1;
203+
];
204+
ok(pump_until_standby(qr/^6000$/m),
205+
"$sect: cursor with conflicting temp file established");
206+
207+
# Drop the tablespace currently containing spill files for the query on the
208+
# standby
209+
$node_primary->safe_psql($test_db, qq[DROP TABLESPACE $tablespace1;]);
210+
211+
$primary_lsn = $node_primary->lsn('flush');
212+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
213+
214+
check_conflict_log(
215+
"User was or might have been using tablespace that must be dropped");
216+
reconnect_and_clear();
217+
check_conflict_stat("tablespace");
218+
219+
220+
# Check that expected number of conflicts show in pg_stat_database. Needs to
221+
# be tested before database is dropped, for obvious reasons.
222+
is( $node_standby->safe_psql(
223+
$test_db,
224+
qq[SELECT conflicts FROM pg_stat_database WHERE datname='$test_db';]),
225+
$expected_conflicts,
226+
qq[$expected_conflicts recovery conflicts shown in pg_stat_database]);
227+
228+
229+
## RECOVERY CONFLICT 5: Database conflict
230+
$sect = "database conflict";
231+
232+
$node_primary->safe_psql('postgres', qq[DROP DATABASE $test_db;]);
233+
234+
$primary_lsn = $node_primary->lsn('flush');
235+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
236+
237+
check_conflict_log("User was connected to a database that must be dropped");
238+
239+
240+
# explicitly shut down psql instances gracefully - to avoid hangs or worse on
241+
# windows
242+
$psql_standby{stdin} .= "\\q\n";
243+
$psql_standby{run}->finish;
244+
245+
$node_standby->stop();
246+
$node_primary->stop();
247+
248+
249+
done_testing();
250+
251+
252+
sub pump_until_standby
253+
{
254+
my $match = shift;
255+
256+
return pump_until($psql_standby{run}, $psql_timeout,
257+
\$psql_standby{stdout}, $match);
258+
}
259+
260+
sub reconnect_and_clear
261+
{
262+
$psql_standby{stdin} .= "\\q\n";
263+
$psql_standby{run}->finish;
264+
265+
# restart
266+
$psql_standby{run}->run();
267+
$psql_standby{stdin} = '';
268+
$psql_standby{stdout} = '';
269+
270+
# Run query to ensure connection has finished re-establishing
271+
$psql_standby{stdin} .= qq[SELECT 1;\n];
272+
die unless pump_until_standby(qr/^1$/m);
273+
$psql_standby{stdout} = '';
274+
}
275+
276+
sub check_conflict_log
277+
{
278+
my $message = shift;
279+
my $old_log_location = $log_location;
280+
281+
$log_location = $node_standby->wait_for_log(qr/$message/, $log_location);
282+
283+
cmp_ok($log_location, '>', $old_log_location,
284+
"$sect: logfile contains terminated connection due to recovery conflict"
285+
);
286+
}
287+
288+
sub check_conflict_stat
289+
{
290+
my $conflict_type = shift;
291+
my $count = $node_standby->safe_psql($test_db,
292+
qq[SELECT confl_$conflict_type FROM pg_stat_database_conflicts WHERE datname='$test_db';]
293+
);
294+
295+
is($count, 1, "$sect: stats show conflict on standby");
296+
}

0 commit comments

Comments
 (0)