Diffstat (limited to 'doc/src')
-rw-r--r--  doc/src/sgml/catalogs.sgml              2
-rw-r--r--  doc/src/sgml/datatype.sgml              2
-rw-r--r--  doc/src/sgml/func.sgml                  2
-rw-r--r--  doc/src/sgml/libpq.sgml                20
-rw-r--r--  doc/src/sgml/logical-replication.sgml  62
-rw-r--r--  doc/src/sgml/logicaldecoding.sgml      92
-rw-r--r--  doc/src/sgml/pageinspect.sgml          12
-rw-r--r--  doc/src/sgml/pglogicalinspect.sgml      4
-rw-r--r--  doc/src/sgml/pgwalinspect.sgml         12
-rw-r--r--  doc/src/sgml/ref/pgtesttiming.sgml    279
-rw-r--r--  doc/src/sgml/test-decoding.sgml        44
11 files changed, 225 insertions, 306 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 4f9192316e0..aa5b8772436 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -7971,7 +7971,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
</para>
<para>
Finish LSN of the transaction whose changes are to be skipped, if a valid
- LSN; otherwise <literal>0/0</literal>.
+ LSN; otherwise <literal>0/00000000</literal>.
</para></entry>
</row>
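To see the value this hunk documents, a minimal psql sketch (assuming only that some subscriptions exist locally) that reads the column straight from the catalog:

    -- each subscription and the finish LSN it is currently set to skip;
    -- 0/00000000 means no skip is pending
    SELECT subname, subskiplsn FROM pg_subscription;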
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index 49a7c180a80..0994e089311 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -5121,7 +5121,7 @@ WHERE ...
<literal>+(pg_lsn,numeric)</literal> and
<literal>-(pg_lsn,numeric)</literal> operators, respectively. Note that
the calculated LSN should be in the range of <type>pg_lsn</type> type,
- i.e., between <literal>0/0</literal> and
+ i.e., between <literal>0/00000000</literal> and
<literal>FFFFFFFF/FFFFFFFF</literal>.
</para>
</sect1>
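A minimal psql sketch of the pg_lsn arithmetic described above (the byte offsets are arbitrary illustrations):

    -- add or subtract a byte count; the result must stay within
    -- 0/00000000 .. FFFFFFFF/FFFFFFFF or an error is raised
    SELECT '0/00000000'::pg_lsn + 16::numeric        AS forward,
           'FFFFFFFF/FFFFFFFF'::pg_lsn - 16::numeric AS backward;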
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 810b2b50f0d..c28aa71f570 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -28521,7 +28521,7 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres}
Returns information about the progress of the WAL summarizer. If the
WAL summarizer has never run since the instance was started, then
<literal>summarized_tli</literal> and <literal>summarized_lsn</literal>
- will be <literal>0</literal> and <literal>0/0</literal> respectively;
+ will be <literal>0</literal> and <literal>0/00000000</literal> respectively;
otherwise, they will be the TLI and ending LSN of the last WAL summary
file written to disk. If the WAL summarizer is currently running,
<literal>pending_lsn</literal> will be the ending LSN of the last
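Assuming the function behind this paragraph is pg_get_wal_summarizer_state() (the hunk itself never names it), a hedged sketch of reading the fields it describes:

    -- summarized_tli and summarized_lsn stay at 0 and 0/00000000 until the
    -- WAL summarizer has run at least once since startup
    SELECT summarized_tli, summarized_lsn, pending_lsn
    FROM pg_get_wal_summarizer_state();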
diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml
index 298c4b38ef9..b2c2cf9eac8 100644
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -2740,26 +2740,6 @@ char *PQport(const PGconn *conn);
</listitem>
</varlistentry>
- <varlistentry id="libpq-PQservice">
- <term><function>PQservice</function><indexterm><primary>PQservice</primary></indexterm></term>
-
- <listitem>
- <para>
- Returns the service of the active connection.
-
-<synopsis>
-char *PQservice(const PGconn *conn);
-</synopsis>
- </para>
-
- <para>
- <xref linkend="libpq-PQservice"/> returns <symbol>NULL</symbol> if the
- <parameter>conn</parameter> argument is <symbol>NULL</symbol>.
- Otherwise, if there was no service provided, it returns an empty string.
- </para>
- </listitem>
- </varlistentry>
-
<varlistentry id="libpq-PQtty">
<term><function>PQtty</function><indexterm><primary>PQtty</primary></indexterm></term>
diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml
index c32e6bc000d..e26f7f59d4a 100644
--- a/doc/src/sgml/logical-replication.sgml
+++ b/doc/src/sgml/logical-replication.sgml
@@ -575,8 +575,8 @@ HINT: To initiate replication, you must manually create the replication slot, e
<programlisting>
/* pub # */ SELECT * FROM pg_create_logical_replication_slot('sub1', 'pgoutput');
slot_name | lsn
------------+-----------
- sub1 | 0/19404D0
+-----------+------------
+ sub1 | 0/019404D0
(1 row)
</programlisting></para>
</listitem>
@@ -617,8 +617,8 @@ HINT: To initiate replication, you must manually create the replication slot, e
<programlisting>
/* pub # */ SELECT * FROM pg_create_logical_replication_slot('myslot', 'pgoutput');
slot_name | lsn
------------+-----------
- myslot | 0/19059A0
+-----------+------------
+ myslot | 0/019059A0
(1 row)
</programlisting></para>
</listitem>
@@ -655,8 +655,8 @@ HINT: To initiate replication, you must manually create the replication slot, e
<programlisting>
/* pub # */ SELECT * FROM pg_create_logical_replication_slot('myslot', 'pgoutput');
slot_name | lsn
------------+-----------
- myslot | 0/1905930
+-----------+------------
+ myslot | 0/01905930
(1 row)
</programlisting></para>
</listitem>
@@ -709,8 +709,8 @@ HINT: To initiate replication, you must manually create the replication slot, e
</para>
<para>
- To confirm that the standby server is indeed ready for failover, follow these
- steps to verify that all necessary logical replication slots have been
+ To confirm that the standby server is indeed ready for failover for a given subscriber, follow these
+ steps to verify that all the logical replication slots required by that subscriber have been
synchronized to the standby server:
</para>
@@ -764,7 +764,7 @@ HINT: To initiate replication, you must manually create the replication slot, e
Check that the logical replication slots identified above exist on
the standby server and are ready for failover.
<programlisting>
-/* standby # */ SELECT slot_name, (synced AND NOT temporary AND NOT conflicting) AS failover_ready
+/* standby # */ SELECT slot_name, (synced AND NOT temporary AND invalidation_reason IS NULL) AS failover_ready
FROM pg_replication_slots
WHERE slot_name IN
('sub1','sub2','sub3', 'pg_16394_sync_16385_7394666715149055164');
@@ -782,10 +782,42 @@ HINT: To initiate replication, you must manually create the replication slot, e
<para>
If all the slots are present on the standby server and the result
(<literal>failover_ready</literal>) of the above SQL query is true, then
- existing subscriptions can continue subscribing to publications now on the
- new primary server.
+ existing subscriptions can continue subscribing to publications on the new
+ primary server.
+ </para>
+
+ <para>
+ The first two steps in the above procedure are meant for a
+ <productname>PostgreSQL</productname> subscriber. It is recommended to run
+ these steps on each subscriber node that will be served by the designated
+ standby after failover, to obtain the complete list of replication
+ slots. This list can then be verified in Step 3 to ensure failover readiness.
+ Non-<productname>PostgreSQL</productname> subscribers, on the other hand, may
+ use their own methods to identify the replication slots used by their
+ respective subscriptions.
+ </para>
+
+ <para>
+ In some cases, such as during a planned failover, it is necessary to confirm
+ that all subscribers, whether <productname>PostgreSQL</productname> or
+ non-<productname>PostgreSQL</productname>, will be able to continue
+ replication after failover to a given standby server. In such cases, use the
+ following SQL, instead of performing the first two steps above, to identify
+ which replication slots on the primary need to be synced to the standby that
+ is intended for promotion. This query returns the relevant replication slots
+ associated with all the failover-enabled subscriptions.
</para>
+ <para>
+<programlisting>
+/* primary # */ SELECT array_agg(quote_literal(r.slot_name)) AS slots
+ FROM pg_replication_slots r
+ WHERE r.failover AND NOT r.temporary;
+ slots
+-------
+ {'sub1','sub2','sub3', 'pg_16394_sync_16385_7394666715149055164'}
+(1 row)
+</programlisting></para>
</sect1>
<sect1 id="logical-replication-row-filter">
@@ -1965,15 +1997,15 @@ DETAIL: <replaceable class="parameter">detailed_explanation</replaceable>.
ERROR: conflict detected on relation "public.test": conflict=insert_exists
DETAIL: Key already exists in unique index "t_pkey", which was modified locally in transaction 740 at 2024-06-26 10:47:04.727375+08.
Key (c)=(1); existing local tuple (1, 'local'); remote tuple (1, 'remote').
-CONTEXT: processing remote data for replication origin "pg_16395" during "INSERT" for replication target relation "public.test" in transaction 725 finished at 0/14C0378
+CONTEXT: processing remote data for replication origin "pg_16395" during "INSERT" for replication target relation "public.test" in transaction 725 finished at 0/014C0378
</screen>
The LSN of the transaction that contains the change violating the constraint and
- the replication origin name can be found from the server log (LSN 0/14C0378 and
+ the replication origin name can be found from the server log (LSN 0/014C0378 and
replication origin <literal>pg_16395</literal> in the above case). The
transaction that produced the conflict can be skipped by using
<link linkend="sql-altersubscription-params-skip"><command>ALTER SUBSCRIPTION ... SKIP</command></link>
with the finish LSN
- (i.e., LSN 0/14C0378). The finish LSN could be an LSN at which the transaction
+ (i.e., LSN 0/014C0378). The finish LSN could be an LSN at which the transaction
is committed or prepared on the publisher. Alternatively, the transaction can
also be skipped by calling the <link linkend="pg-replication-origin-advance">
<function>pg_replication_origin_advance()</function></link> function.
@@ -1984,7 +2016,7 @@ CONTEXT: processing remote data for replication origin "pg_16395" during "INSER
<link linkend="sql-createsubscription-params-with-disable-on-error"><literal>disable_on_error</literal></link>
option. Then, you can use <function>pg_replication_origin_advance()</function>
function with the <parameter>node_name</parameter> (i.e., <literal>pg_16395</literal>)
- and the next LSN of the finish LSN (i.e., 0/14C0379). The current position of
+ and the next LSN of the finish LSN (i.e., 0/014C0379). The current position of
origins can be seen in the <link linkend="view-pg-replication-origin-status">
<structname>pg_replication_origin_status</structname></link> system view.
Please note that skipping the whole transaction includes skipping changes that
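For the skip procedure covered in the last two hunks, a hedged sketch using the finish LSN from the example log message (the subscription name mysub is made up):

    -- skip the remote transaction that finished at the reported LSN
    ALTER SUBSCRIPTION mysub SKIP (lsn = '0/014C0378');

    -- or, after disabling the subscription (e.g. via disable_on_error),
    -- advance the origin just past the finish LSN instead
    SELECT pg_replication_origin_advance('pg_16395', '0/014C0379');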
diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml
index a45a1412416..593f784b69d 100644
--- a/doc/src/sgml/logicaldecoding.sgml
+++ b/doc/src/sgml/logicaldecoding.sgml
@@ -57,14 +57,14 @@
postgres=# -- Create a slot named 'regression_slot' using the output plugin 'test_decoding'
postgres=# SELECT * FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding', false, true);
slot_name | lsn
------------------+-----------
- regression_slot | 0/16B1970
+-----------------+------------
+ regression_slot | 0/016B1970
(1 row)
postgres=# SELECT slot_name, plugin, slot_type, database, active, restart_lsn, confirmed_flush_lsn FROM pg_replication_slots;
slot_name | plugin | slot_type | database | active | restart_lsn | confirmed_flush_lsn
------------------+---------------+-----------+----------+--------+-------------+-----------------
- regression_slot | test_decoding | logical | postgres | f | 0/16A4408 | 0/16A4440
+-----------------+---------------+-----------+----------+--------+-------------+---------------------
+ regression_slot | test_decoding | logical | postgres | f | 0/016A4408 | 0/016A4440
(1 row)
postgres=# -- There are no changes to see yet
@@ -78,10 +78,10 @@ CREATE TABLE
postgres=# -- DDL isn't replicated, so all you'll see is the transaction
postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-------+--------------
- 0/BA2DA58 | 10297 | BEGIN 10297
- 0/BA5A5A0 | 10297 | COMMIT 10297
+ lsn | xid | data
+------------+-------+--------------
+ 0/0BA2DA58 | 10297 | BEGIN 10297
+ 0/0BA5A5A0 | 10297 | COMMIT 10297
(2 rows)
postgres=# -- Once changes are read, they're consumed and not emitted
@@ -97,41 +97,41 @@ postgres=*# INSERT INTO data(data) VALUES('2');
postgres=*# COMMIT;
postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-------+---------------------------------------------------------
- 0/BA5A688 | 10298 | BEGIN 10298
- 0/BA5A6F0 | 10298 | table public.data: INSERT: id[integer]:1 data[text]:'1'
- 0/BA5A7F8 | 10298 | table public.data: INSERT: id[integer]:2 data[text]:'2'
- 0/BA5A8A8 | 10298 | COMMIT 10298
+ lsn | xid | data
+------------+-------+---------------------------------------------------------
+ 0/0BA5A688 | 10298 | BEGIN 10298
+ 0/0BA5A6F0 | 10298 | table public.data: INSERT: id[integer]:1 data[text]:'1'
+ 0/0BA5A7F8 | 10298 | table public.data: INSERT: id[integer]:2 data[text]:'2'
+ 0/0BA5A8A8 | 10298 | COMMIT 10298
(4 rows)
postgres=# INSERT INTO data(data) VALUES('3');
postgres=# -- You can also peek ahead in the change stream without consuming changes
postgres=# SELECT * FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-------+---------------------------------------------------------
- 0/BA5A8E0 | 10299 | BEGIN 10299
- 0/BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3'
- 0/BA5A990 | 10299 | COMMIT 10299
+ lsn | xid | data
+------------+-------+---------------------------------------------------------
+ 0/0BA5A8E0 | 10299 | BEGIN 10299
+ 0/0BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3'
+ 0/0BA5A990 | 10299 | COMMIT 10299
(3 rows)
postgres=# -- The next call to pg_logical_slot_peek_changes() returns the same changes again
postgres=# SELECT * FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-------+---------------------------------------------------------
- 0/BA5A8E0 | 10299 | BEGIN 10299
- 0/BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3'
- 0/BA5A990 | 10299 | COMMIT 10299
+ lsn | xid | data
+------------+-------+---------------------------------------------------------
+ 0/0BA5A8E0 | 10299 | BEGIN 10299
+ 0/0BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3'
+ 0/0BA5A990 | 10299 | COMMIT 10299
(3 rows)
postgres=# -- options can be passed to output plugin, to influence the formatting
postgres=# SELECT * FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL, 'include-timestamp', 'on');
- lsn | xid | data
------------+-------+---------------------------------------------------------
- 0/BA5A8E0 | 10299 | BEGIN 10299
- 0/BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3'
- 0/BA5A990 | 10299 | COMMIT 10299 (at 2017-05-10 12:07:21.272494-04)
+ lsn | xid | data
+------------+-------+---------------------------------------------------------
+ 0/0BA5A8E0 | 10299 | BEGIN 10299
+ 0/0BA5A8E0 | 10299 | table public.data: INSERT: id[integer]:3 data[text]:'3'
+ 0/0BA5A990 | 10299 | COMMIT 10299 (at 2017-05-10 12:07:21.272494-04)
(3 rows)
postgres=# -- Remember to destroy a slot you no longer need to stop it consuming
@@ -200,18 +200,18 @@ postgres=*# INSERT INTO data(data) VALUES('5');
postgres=*# PREPARE TRANSACTION 'test_prepared1';
postgres=# SELECT * FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-----+---------------------------------------------------------
- 0/1689DC0 | 529 | BEGIN 529
- 0/1689DC0 | 529 | table public.data: INSERT: id[integer]:3 data[text]:'5'
- 0/1689FC0 | 529 | PREPARE TRANSACTION 'test_prepared1', txid 529
+ lsn | xid | data
+------------+-----+---------------------------------------------------------
+ 0/01689DC0 | 529 | BEGIN 529
+ 0/01689DC0 | 529 | table public.data: INSERT: id[integer]:3 data[text]:'5'
+ 0/01689FC0 | 529 | PREPARE TRANSACTION 'test_prepared1', txid 529
(3 rows)
postgres=# COMMIT PREPARED 'test_prepared1';
postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-----+--------------------------------------------
- 0/168A060 | 529 | COMMIT PREPARED 'test_prepared1', txid 529
+ lsn | xid | data
+------------+-----+--------------------------------------------
+ 0/0168A060 | 529 | COMMIT PREPARED 'test_prepared1', txid 529
(4 row)
postgres=#-- you can also rollback a prepared transaction
@@ -219,18 +219,18 @@ postgres=# BEGIN;
postgres=*# INSERT INTO data(data) VALUES('6');
postgres=*# PREPARE TRANSACTION 'test_prepared2';
postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-----+---------------------------------------------------------
- 0/168A180 | 530 | BEGIN 530
- 0/168A1E8 | 530 | table public.data: INSERT: id[integer]:4 data[text]:'6'
- 0/168A430 | 530 | PREPARE TRANSACTION 'test_prepared2', txid 530
+ lsn | xid | data
+------------+-----+---------------------------------------------------------
+ 0/0168A180 | 530 | BEGIN 530
+ 0/0168A1E8 | 530 | table public.data: INSERT: id[integer]:4 data[text]:'6'
+ 0/0168A430 | 530 | PREPARE TRANSACTION 'test_prepared2', txid 530
(3 rows)
postgres=# ROLLBACK PREPARED 'test_prepared2';
postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NULL);
- lsn | xid | data
------------+-----+----------------------------------------------
- 0/168A4B8 | 530 | ROLLBACK PREPARED 'test_prepared2', txid 530
+ lsn | xid | data
+------------+-----+----------------------------------------------
+ 0/0168A4B8 | 530 | ROLLBACK PREPARED 'test_prepared2', txid 530
(1 row)
</programlisting>
</sect1>
@@ -431,7 +431,7 @@ postgres=# select * from pg_logical_slot_get_changes('regression_slot', NULL, NU
cases, the following log message may appear:
<programlisting>
LOG: could not synchronize replication slot "failover_slot"
- DETAIL: Synchronization could lead to data loss as the remote slot needs WAL at LSN 0/3003F28 and catalog xmin 754, but the standby has LSN 0/3003F28 and catalog xmin 756
+ DETAIL: Synchronization could lead to data loss as the remote slot needs WAL at LSN 0/03003F28 and catalog xmin 754, but the standby has LSN 0/03003F28 and catalog xmin 756
</programlisting>
If the logical replication slot is actively used by a consumer, no
manual intervention is needed; the slot will advance automatically,
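To check on the standby whether the slot named in that message has since synchronized, a short sketch against pg_replication_slots (using the same columns as earlier in this patch):

    -- a synced slot with no invalidation reason is usable after failover
    SELECT slot_name, synced, invalidation_reason
    FROM pg_replication_slots
    WHERE slot_name = 'failover_slot';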
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 487c5d758ff..12929333665 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -73,9 +73,9 @@
passed as argument. For example:
<screen>
test=# SELECT * FROM page_header(get_raw_page('pg_class', 0));
- lsn | checksum | flags | lower | upper | special | pagesize | version | prune_xid
------------+----------+--------+-------+-------+---------+----------+---------+-----------
- 0/24A1B50 | 0 | 1 | 232 | 368 | 8192 | 8192 | 4 | 0
+ lsn | checksum | flags | lower | upper | special | pagesize | version | prune_xid
+------------+----------+--------+-------+-------+---------+----------+---------+-----------
+ 0/024A1B50 | 0 | 1 | 232 | 368 | 8192 | 8192 | 4 | 0
</screen>
The returned columns correspond to the fields in the
<structname>PageHeaderData</structname> struct.
@@ -741,9 +741,9 @@ test=# SELECT first_tid, nbytes, tids[0:5] AS some_tids
For example:
<screen>
test=# SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
- lsn | nsn | rightlink | flags
------+-----+-----------+--------
- 0/1 | 0/0 | 1 | {leaf}
+ lsn | nsn | rightlink | flags
+------------+------------+-----------+--------
+ 0/0B5FE088 | 0/00000000 | 1 | {leaf}
(1 row)
</screen>
</para>
diff --git a/doc/src/sgml/pglogicalinspect.sgml b/doc/src/sgml/pglogicalinspect.sgml
index 4b111f96113..1c1a9d14e51 100644
--- a/doc/src/sgml/pglogicalinspect.sgml
+++ b/doc/src/sgml/pglogicalinspect.sgml
@@ -95,7 +95,7 @@ two_phase_at | 0/40796AF8
initial_xmin_horizon | 0
building_full_snapshot | f
in_slot_creation | f
-last_serialized_snapshot | 0/0
+last_serialized_snapshot | 0/00000000
next_phase_at | 0
committed_count | 0
committed_xip |
@@ -114,7 +114,7 @@ two_phase_at | 0/40796AF8
initial_xmin_horizon | 0
building_full_snapshot | f
in_slot_creation | f
-last_serialized_snapshot | 0/0
+last_serialized_snapshot | 0/00000000
next_phase_at | 0
committed_count | 0
committed_xip |
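The two hunks above show pg_logicalinspect output without the call that produced it; a hedged sketch, assuming pg_get_logical_snapshot_info() accepts a snapshot file name such as those listed by pg_ls_logicalsnapdir():

    -- inspect the first serialized logical snapshot found on disk
    SELECT info.*
    FROM pg_ls_logicalsnapdir() AS d,
         LATERAL pg_get_logical_snapshot_info(d.name) AS info
    LIMIT 1;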
diff --git a/doc/src/sgml/pgwalinspect.sgml b/doc/src/sgml/pgwalinspect.sgml
index 3a8121c70f1..79c3ead40bc 100644
--- a/doc/src/sgml/pgwalinspect.sgml
+++ b/doc/src/sgml/pgwalinspect.sgml
@@ -73,9 +73,9 @@
<screen>
postgres=# SELECT * FROM pg_get_wal_record_info('0/E419E28');
-[ RECORD 1 ]----+-------------------------------------------------
-start_lsn | 0/E419E28
-end_lsn | 0/E419E68
-prev_lsn | 0/E419D78
+start_lsn | 0/0E419E28
+end_lsn | 0/0E419E68
+prev_lsn | 0/0E419D78
xid | 0
resource_manager | Heap2
record_type | VACUUM
@@ -146,9 +146,9 @@ block_ref |
<screen>
postgres=# SELECT * FROM pg_get_wal_block_info('0/1230278', '0/12302B8');
-[ RECORD 1 ]-----+-----------------------------------
-start_lsn | 0/1230278
-end_lsn | 0/12302B8
-prev_lsn | 0/122FD40
+start_lsn | 0/01230278
+end_lsn | 0/012302B8
+prev_lsn | 0/0122FD40
block_id | 0
reltablespace | 1663
reldatabase | 1
diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml
index a5eb3aa25e0..1fcdbf7f06e 100644
--- a/doc/src/sgml/ref/pgtesttiming.sgml
+++ b/doc/src/sgml/ref/pgtesttiming.sgml
@@ -30,11 +30,23 @@ PostgreSQL documentation
<title>Description</title>
<para>
- <application>pg_test_timing</application> is a tool to measure the timing overhead
- on your system and confirm that the system time never moves backwards.
+ <application>pg_test_timing</application> is a tool to measure the
+ timing overhead on your system and confirm that the system time never
+ moves backwards. It simply reads the system clock over and over again
+ as fast as it can for a specified length of time, and then prints
+ statistics about the observed differences in successive clock readings.
+ </para>
+ <para>
+ Smaller (but not zero) differences are better, since they imply both
+ more-precise clock hardware and less overhead to collect a clock reading.
Systems that are slow to collect timing data can give less accurate
<command>EXPLAIN ANALYZE</command> results.
</para>
+ <para>
+ This tool is also helpful to determine if
+ the <varname>track_io_timing</varname> configuration parameter is likely
+ to produce useful results.
+ </para>
</refsect1>
<refsect1>
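As a hedged illustration of the track_io_timing point (standard parameters; the query is arbitrary), the per-I/O clock readings below are only as cheap and as trustworthy as the overhead pg_test_timing reports:

    -- every buffer read/write gets its own pair of clock readings when this is on
    SET track_io_timing = on;
    -- the I/O Timings line in the plan depends on clock overhead and precision
    EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM pg_class;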
@@ -60,6 +72,21 @@ PostgreSQL documentation
</varlistentry>
<varlistentry>
+ <term><option>-c <replaceable class="parameter">cutoff</replaceable></option></term>
+ <term><option>--cutoff=<replaceable class="parameter">cutoff</replaceable></option></term>
+ <listitem>
+ <para>
+ Specifies the cutoff percentage for the list of exact observed
+ timing durations (that is, the changes in the system clock value
+ from one reading to the next). The list will end once the running
+ percentage total reaches or exceeds this value, except that the
+ largest observed duration will always be printed. The default
+ cutoff is 99.99.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><option>-V</option></term>
<term><option>--version</option></term>
<listitem>
@@ -92,205 +119,83 @@ PostgreSQL documentation
<title>Interpreting Results</title>
<para>
- Good results will show most (>90%) individual timing calls take less than
- one microsecond. Average per loop overhead will be even lower, below 100
- nanoseconds. This example from an Intel i7-860 system using a TSC clock
- source shows excellent performance:
-
-<screen><![CDATA[
-Testing timing overhead for 3 seconds.
-Per loop time including overhead: 35.96 ns
-Histogram of timing durations:
- < us % of total count
- 1 96.40465 80435604
- 2 3.59518 2999652
- 4 0.00015 126
- 8 0.00002 13
- 16 0.00000 2
-]]></screen>
+ The first block of output has four columns, with rows showing a
+ shifted-by-one log2(ns) histogram of timing durations (that is, the
+ differences between successive clock readings). This is not the
+ classic log2(n+1) histogram as it counts zeros separately and then
+ switches to log2(ns) starting from value 1.
</para>
-
<para>
- Note that different units are used for the per loop time than the
- histogram. The loop can have resolution within a few nanoseconds (ns),
- while the individual timing calls can only resolve down to one microsecond
- (us).
+ The columns are:
+ <itemizedlist spacing="compact">
+ <listitem>
+ <simpara>nanosecond value that is &gt;= the durations in this
+ bucket</simpara>
+ </listitem>
+ <listitem>
+ <simpara>percentage of durations in this bucket</simpara>
+ </listitem>
+ <listitem>
+ <simpara>running-sum percentage of durations in this and previous
+ buckets</simpara>
+ </listitem>
+ <listitem>
+ <simpara>count of durations in this bucket</simpara>
+ </listitem>
+ </itemizedlist>
</para>
-
- </refsect2>
- <refsect2>
- <title>Measuring Executor Timing Overhead</title>
-
<para>
- When the query executor is running a statement using
- <command>EXPLAIN ANALYZE</command>, individual operations are timed as well
- as showing a summary. The overhead of your system can be checked by
- counting rows with the <application>psql</application> program:
-
-<screen>
-CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
-\timing
-SELECT COUNT(*) FROM t;
-EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
-</screen>
+ The second block of output goes into more detail, showing the exact
+ timing differences observed. For brevity this list is cut off when the
+ running-sum percentage exceeds the user-selectable cutoff value.
+ However, the largest observed difference is always shown.
</para>
-
<para>
- The i7-860 system measured runs the count query in 9.8 ms while
- the <command>EXPLAIN ANALYZE</command> version takes 16.6 ms, each
- processing just over 100,000 rows. That 6.8 ms difference means the timing
- overhead per row is 68 ns, about twice what pg_test_timing estimated it
- would be. Even that relatively small amount of overhead is making the fully
- timed count statement take almost 70% longer. On more substantial queries,
- the timing overhead would be less problematic.
+ The example results below show that 99.99% of timing loops took between
+ 8 and 31 nanoseconds, with the worst case somewhere between 32768 and
+ 65535 nanoseconds. In the second block, we can see that typical loop
+ time is 16 nanoseconds, and the readings appear to have full nanosecond
+ precision.
</para>
- </refsect2>
-
- <refsect2>
- <title>Changing Time Sources</title>
<para>
- On some newer Linux systems, it's possible to change the clock source used
- to collect timing data at any time. A second example shows the slowdown
- possible from switching to the slower acpi_pm time source, on the same
- system used for the fast results above:
-
<screen><![CDATA[
-# cat /sys/devices/system/clocksource/clocksource0/available_clocksource
-tsc hpet acpi_pm
-# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource
-# pg_test_timing
-Per loop time including overhead: 722.92 ns
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 16.40 ns
Histogram of timing durations:
- < us % of total count
- 1 27.84870 1155682
- 2 72.05956 2990371
- 4 0.07810 3241
- 8 0.01357 563
- 16 0.00007 3
+ <= ns % of total running % count
+ 0 0.0000 0.0000 0
+ 1 0.0000 0.0000 0
+ 3 0.0000 0.0000 0
+ 7 0.0000 0.0000 0
+ 15 4.5452 4.5452 8313178
+ 31 95.4527 99.9979 174581501
+ 63 0.0001 99.9981 253
+ 127 0.0001 99.9982 165
+ 255 0.0000 99.9982 35
+ 511 0.0000 99.9982 1
+ 1023 0.0013 99.9994 2300
+ 2047 0.0004 99.9998 690
+ 4095 0.0000 99.9998 9
+ 8191 0.0000 99.9998 8
+ 16383 0.0002 100.0000 337
+ 32767 0.0000 100.0000 2
+ 65535 0.0000 100.0000 1
+
+Observed timing durations up to 99.9900%:
+ ns % of total running % count
+ 15 4.5452 4.5452 8313178
+ 16 58.3785 62.9237 106773354
+ 17 33.6840 96.6078 61607584
+ 18 3.1151 99.7229 5697480
+ 19 0.2638 99.9867 482570
+ 20 0.0093 99.9960 17054
+...
+ 38051 0.0000 100.0000 1
]]></screen>
</para>
- <para>
- In this configuration, the sample <command>EXPLAIN ANALYZE</command> above
- takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple
- of what's measured directly by this utility. That much timing overhead
- means the actual query itself is only taking a tiny fraction of the
- accounted for time, most of it is being consumed in overhead instead. In
- this configuration, any <command>EXPLAIN ANALYZE</command> totals involving
- many timed operations would be inflated significantly by timing overhead.
- </para>
-
- <para>
- FreeBSD also allows changing the time source on the fly, and it logs
- information about the timer selected during boot:
-
-<screen>
-# dmesg | grep "Timecounter"
-Timecounter "ACPI-fast" frequency 3579545 Hz quality 900
-Timecounter "i8254" frequency 1193182 Hz quality 0
-Timecounters tick every 10.000 msec
-Timecounter "TSC" frequency 2531787134 Hz quality 800
-# sysctl kern.timecounter.hardware=TSC
-kern.timecounter.hardware: ACPI-fast -> TSC
-</screen>
- </para>
-
- <para>
- Other systems may only allow setting the time source on boot. On older
- Linux systems the "clock" kernel setting is the only way to make this sort
- of change. And even on some more recent ones, the only option you'll see
- for a clock source is "jiffies". Jiffies are the older Linux software clock
- implementation, which can have good resolution when it's backed by fast
- enough timing hardware, as in this example:
-
-<screen><![CDATA[
-$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource
-jiffies
-$ dmesg | grep time.c
-time.c: Using 3.579545 MHz WALL PM GTOD PIT/TSC timer.
-time.c: Detected 2400.153 MHz processor.
-$ pg_test_timing
-Testing timing overhead for 3 seconds.
-Per timing duration including loop overhead: 97.75 ns
-Histogram of timing durations:
- < us % of total count
- 1 90.23734 27694571
- 2 9.75277 2993204
- 4 0.00981 3010
- 8 0.00007 22
- 16 0.00000 1
- 32 0.00000 1
-]]></screen></para>
-
</refsect2>
-
- <refsect2>
- <title>Clock Hardware and Timing Accuracy</title>
-
- <para>
- Collecting accurate timing information is normally done on computers using
- hardware clocks with various levels of accuracy. With some hardware the
- operating systems can pass the system clock time almost directly to
- programs. A system clock can also be derived from a chip that simply
- provides timing interrupts, periodic ticks at some known time interval. In
- either case, operating system kernels provide a clock source that hides
- these details. But the accuracy of that clock source and how quickly it can
- return results varies based on the underlying hardware.
- </para>
-
- <para>
- Inaccurate time keeping can result in system instability. Test any change
- to the clock source very carefully. Operating system defaults are sometimes
- made to favor reliability over best accuracy. And if you are using a virtual
- machine, look into the recommended time sources compatible with it. Virtual
- hardware faces additional difficulties when emulating timers, and there are
- often per operating system settings suggested by vendors.
- </para>
-
- <para>
- The Time Stamp Counter (TSC) clock source is the most accurate one available
- on current generation CPUs. It's the preferred way to track the system time
- when it's supported by the operating system and the TSC clock is
- reliable. There are several ways that TSC can fail to provide an accurate
- timing source, making it unreliable. Older systems can have a TSC clock that
- varies based on the CPU temperature, making it unusable for timing. Trying
- to use TSC on some older multicore CPUs can give a reported time that's
- inconsistent among multiple cores. This can result in the time going
- backwards, a problem this program checks for. And even the newest systems
- can fail to provide accurate TSC timing with very aggressive power saving
- configurations.
- </para>
-
- <para>
- Newer operating systems may check for the known TSC problems and switch to a
- slower, more stable clock source when they are seen. If your system
- supports TSC time but doesn't default to that, it may be disabled for a good
- reason. And some operating systems may not detect all the possible problems
- correctly, or will allow using TSC even in situations where it's known to be
- inaccurate.
- </para>
-
- <para>
- The High Precision Event Timer (HPET) is the preferred timer on systems
- where it's available and TSC is not accurate. The timer chip itself is
- programmable to allow up to 100 nanosecond resolution, but you may not see
- that much accuracy in your system clock.
- </para>
-
- <para>
- Advanced Configuration and Power Interface (ACPI) provides a Power
- Management (PM) Timer, which Linux refers to as the acpi_pm. The clock
- derived from acpi_pm will at best provide 300 nanosecond resolution.
- </para>
-
- <para>
- Timers used on older PC hardware include the 8254 Programmable Interval
- Timer (PIT), the real-time clock (RTC), the Advanced Programmable Interrupt
- Controller (APIC) timer, and the Cyclone timer. These timers aim for
- millisecond resolution.
- </para>
- </refsect2>
</refsect1>
<refsect1>
@@ -298,6 +203,8 @@ Histogram of timing durations:
<simplelist type="inline">
<member><xref linkend="sql-explain"/></member>
+ <member><ulink url="https://wiki.postgresql.org/wiki/Pg_test_timing">Wiki
+ discussion about timing</ulink></member>
</simplelist>
</refsect1>
</refentry>
diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml
index 5d1ae8f4f52..7d3d590471a 100644
--- a/doc/src/sgml/test-decoding.sgml
+++ b/doc/src/sgml/test-decoding.sgml
@@ -25,16 +25,16 @@
<programlisting>
postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'include-xids', '0');
- lsn | xid | data
------------+-----+--------------------------------------------------
- 0/16D30F8 | 691 | BEGIN
- 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg'
- 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo'
- 0/16D32A0 | 691 | COMMIT
- 0/16D32D8 | 692 | BEGIN
- 0/16D3398 | 692 | table public.data: DELETE: id[int4]:2
- 0/16D3398 | 692 | table public.data: DELETE: id[int4]:3
- 0/16D3398 | 692 | COMMIT
+ lsn | xid | data
+------------+-----+--------------------------------------------------
+ 0/016D30F8 | 691 | BEGIN
+ 0/016D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg'
+ 0/016D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo'
+ 0/016D32A0 | 691 | COMMIT
+ 0/016D32D8 | 692 | BEGIN
+ 0/016D3398 | 692 | table public.data: DELETE: id[int4]:2
+ 0/016D3398 | 692 | table public.data: DELETE: id[int4]:3
+ 0/016D3398 | 692 | COMMIT
(8 rows)
</programlisting>
</para>
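For completeness, a sketch of how the test_slot consumed above would have been created, mirroring the logicaldecoding example earlier in this patch:

    -- create a test_decoding slot to feed pg_logical_slot_get_changes()
    SELECT * FROM pg_create_logical_replication_slot('test_slot', 'test_decoding');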
@@ -45,18 +45,18 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'i
<programlisting>
postgres[33712]=#* SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'stream-changes', '1');
- lsn | xid | data
------------+-----+--------------------------------------------------
- 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503
- 0/16B21F8 | 503 | streaming change for TXN 503
- 0/16B2300 | 503 | streaming change for TXN 503
- 0/16B2408 | 503 | streaming change for TXN 503
- 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503
- 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503
- 0/16BECA8 | 503 | streaming change for TXN 503
- 0/16BEDB0 | 503 | streaming change for TXN 503
- 0/16BEEB8 | 503 | streaming change for TXN 503
- 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503
+ lsn | xid | data
+------------+-----+--------------------------------------------------
+ 0/016B21F8 | 503 | opening a streamed block for transaction TXN 503
+ 0/016B21F8 | 503 | streaming change for TXN 503
+ 0/016B2300 | 503 | streaming change for TXN 503
+ 0/016B2408 | 503 | streaming change for TXN 503
+ 0/016BEBA0 | 503 | closing a streamed block for transaction TXN 503
+ 0/016B21F8 | 503 | opening a streamed block for transaction TXN 503
+ 0/016BECA8 | 503 | streaming change for TXN 503
+ 0/016BEDB0 | 503 | streaming change for TXN 503
+ 0/016BEEB8 | 503 | streaming change for TXN 503
+ 0/016BEBA0 | 503 | closing a streamed block for transaction TXN 503
(10 rows)
</programlisting>
</para>