Skip to content

Commit 32a9c0b

Browse files
committed Oct 6, 2020
postgres_fdw: reestablish new connection if cached one is detected as broken.
In postgres_fdw, once remote connections are established, they are cached and re-used for subsequent queries and transactions. There can be some cases where those cached connections are unavailable, for example, because the remote server was restarted. In these cases, previously an error was reported and the query accessing the remote server failed if a new remote transaction failed to start because the cached connection was broken. This commit improves postgres_fdw so that a new connection is made if a broken connection is detected when starting a new remote transaction. This is useful to avoid unnecessary failure of queries when the connection is broken but can be reestablished. Author: Bharath Rupireddy, tweaked a bit by Fujii Masao Reviewed-by: Ashutosh Bapat, Tatsuhito Kasahara, Fujii Masao Discussion: https://postgr.es/m/CALj2ACUAi23vf1WiHNar_LksM9EDOWXcbHCo-fD4Mbr1d=78YQ@mail.gmail.com

File tree

3 files changed

+131
-12
lines changed

3 files changed

+131
-12
lines changed
 

‎contrib/postgres_fdw/connection.c

+42-12
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ PGconn *
108108
GetConnection(UserMapping *user, bool will_prep_stmt)
109109
{
110110
bool found;
111+
volatile bool retry_conn = false;
111112
ConnCacheEntry *entry;
112113
ConnCacheKey key;
113114

@@ -159,23 +160,26 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
159160
/* Reject further use of connections which failed abort cleanup. */
160161
pgfdw_reject_incomplete_xact_state_change(entry);
161162

163+
retry:
164+
162165
/*
163166
* If the connection needs to be remade due to invalidation, disconnect as
164-
* soon as we're out of all transactions.
167+
* soon as we're out of all transactions. Also, if previous attempt to
168+
* start new remote transaction failed on the cached connection,
169+
* disconnect it to retry a new connection.
165170
*/
166-
if (entry->conn != NULL && entry->invalidated && entry->xact_depth == 0)
171+
if ((entry->conn != NULL && entry->invalidated &&
172+
entry->xact_depth == 0) || retry_conn)
167173
{
168-
elog(DEBUG3, "closing connection %p for option changes to take effect",
169-
entry->conn);
174+
if (retry_conn)
175+
elog(DEBUG3, "closing connection %p to reestablish a new one",
176+
entry->conn);
177+
else
178+
elog(DEBUG3, "closing connection %p for option changes to take effect",
179+
entry->conn);
170180
disconnect_pg_server(entry);
171181
}
172182

173-
/*
174-
* We don't check the health of cached connection here, because it would
175-
* require some overhead. Broken connection will be detected when the
176-
* connection is actually used.
177-
*/
178-
179183
/*
180184
* If cache entry doesn't have a connection, we have to establish a new
181185
* connection. (If connect_pg_server throws an error, the cache entry
@@ -206,9 +210,35 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
206210
}
207211

208212
/*
209-
* Start a new transaction or subtransaction if needed.
213+
* We check the health of the cached connection here when starting a new
214+
* remote transaction. If a broken connection is detected in the first
215+
* attempt, we try to reestablish a new connection. If broken connection
216+
* is detected again here, we give up getting a connection.
210217
*/
211-
begin_remote_xact(entry);
218+
PG_TRY();
219+
{
220+
/* Start a new transaction or subtransaction if needed. */
221+
begin_remote_xact(entry);
222+
retry_conn = false;
223+
}
224+
PG_CATCH();
225+
{
226+
if (PQstatus(entry->conn) != CONNECTION_BAD ||
227+
entry->xact_depth > 0 ||
228+
retry_conn)
229+
PG_RE_THROW();
230+
retry_conn = true;
231+
}
232+
PG_END_TRY();
233+
234+
if (retry_conn)
235+
{
236+
ereport(DEBUG3,
237+
(errmsg_internal("could not start remote transaction on connection %p",
238+
entry->conn)),
239+
errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn))));
240+
goto retry;
241+
}
212242

213243
/* Remember if caller will prepare statements */
214244
entry->have_prep_stmt |= will_prep_stmt;

‎contrib/postgres_fdw/expected/postgres_fdw.out

+48
Original file line numberDiff line numberDiff line change
@@ -8987,3 +8987,51 @@ PREPARE TRANSACTION 'fdw_tpc';
89878987
ERROR: cannot PREPARE a transaction that has operated on postgres_fdw foreign tables
89888988
ROLLBACK;
89898989
WARNING: there is no transaction in progress
8990+
-- ===================================================================
8991+
-- reestablish new connection
8992+
-- ===================================================================
8993+
-- Terminate the backend having the specified application_name and wait for
8994+
-- the termination to complete.
8995+
CREATE OR REPLACE PROCEDURE terminate_backend_and_wait(appname text) AS $$
8996+
BEGIN
8997+
PERFORM pg_terminate_backend(pid) FROM pg_stat_activity
8998+
WHERE application_name = appname;
8999+
LOOP
9000+
PERFORM * FROM pg_stat_activity WHERE application_name = appname;
9001+
EXIT WHEN NOT FOUND;
9002+
PERFORM pg_sleep(1), pg_stat_clear_snapshot();
9003+
END LOOP;
9004+
END;
9005+
$$ LANGUAGE plpgsql;
9006+
-- Change application_name of remote connection to special one
9007+
-- so that we can easily terminate the connection later.
9008+
ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
9009+
SELECT 1 FROM ft1 LIMIT 1;
9010+
?column?
9011+
----------
9012+
1
9013+
(1 row)
9014+
9015+
-- Terminate the remote connection.
9016+
CALL terminate_backend_and_wait('fdw_retry_check');
9017+
-- This query should detect the broken connection when starting new remote
9018+
-- transaction, reestablish new connection, and then succeed.
9019+
BEGIN;
9020+
SELECT 1 FROM ft1 LIMIT 1;
9021+
?column?
9022+
----------
9023+
1
9024+
(1 row)
9025+
9026+
-- If the query detects the broken connection when starting new remote
9027+
-- subtransaction, it doesn't reestablish new connection and should fail.
9028+
CALL terminate_backend_and_wait('fdw_retry_check');
9029+
SAVEPOINT s;
9030+
SELECT 1 FROM ft1 LIMIT 1; -- should fail
9031+
ERROR: server closed the connection unexpectedly
9032+
This probably means the server terminated abnormally
9033+
before or while processing the request.
9034+
CONTEXT: remote SQL command: SAVEPOINT s2
9035+
COMMIT;
9036+
-- Clean up
9037+
DROP PROCEDURE terminate_backend_and_wait(text);

‎contrib/postgres_fdw/sql/postgres_fdw.sql

+41
Original file line numberDiff line numberDiff line change
@@ -2653,3 +2653,44 @@ SELECT count(*) FROM ft1;
26532653
-- error here
26542654
PREPARE TRANSACTION 'fdw_tpc';
26552655
ROLLBACK;
2656+
2657+
-- ===================================================================
2658+
-- reestablish new connection
2659+
-- ===================================================================
2660+
2661+
-- Terminate the backend having the specified application_name and wait for
2662+
-- the termination to complete.
2663+
CREATE OR REPLACE PROCEDURE terminate_backend_and_wait(appname text) AS $$
2664+
BEGIN
2665+
PERFORM pg_terminate_backend(pid) FROM pg_stat_activity
2666+
WHERE application_name = appname;
2667+
LOOP
2668+
PERFORM * FROM pg_stat_activity WHERE application_name = appname;
2669+
EXIT WHEN NOT FOUND;
2670+
PERFORM pg_sleep(1), pg_stat_clear_snapshot();
2671+
END LOOP;
2672+
END;
2673+
$$ LANGUAGE plpgsql;
2674+
2675+
-- Change application_name of remote connection to special one
2676+
-- so that we can easily terminate the connection later.
2677+
ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
2678+
SELECT 1 FROM ft1 LIMIT 1;
2679+
2680+
-- Terminate the remote connection.
2681+
CALL terminate_backend_and_wait('fdw_retry_check');
2682+
2683+
-- This query should detect the broken connection when starting new remote
2684+
-- transaction, reestablish new connection, and then succeed.
2685+
BEGIN;
2686+
SELECT 1 FROM ft1 LIMIT 1;
2687+
2688+
-- If the query detects the broken connection when starting new remote
2689+
-- subtransaction, it doesn't reestablish new connection and should fail.
2690+
CALL terminate_backend_and_wait('fdw_retry_check');
2691+
SAVEPOINT s;
2692+
SELECT 1 FROM ft1 LIMIT 1; -- should fail
2693+
COMMIT;
2694+
2695+
-- Clean up
2696+
DROP PROCEDURE terminate_backend_and_wait(text);

0 commit comments

Comments
 (0)
Please sign in to comment.