-rwxr-xr-x  configure                                        17
-rw-r--r--  configure.ac                                      7
-rw-r--r--  contrib/xml2/xpath.c                             49
-rw-r--r--  contrib/xml2/xslt_proc.c                         10
-rw-r--r--  doc/src/sgml/datatype.sgml                        2
-rw-r--r--  doc/src/sgml/func.sgml                            2
-rw-r--r--  doc/src/sgml/libpq.sgml                          20
-rw-r--r--  doc/src/sgml/logical-replication.sgml            42
-rw-r--r--  doc/src/sgml/pageinspect.sgml                     6
-rw-r--r--  doc/src/sgml/ref/pgtesttiming.sgml              279
-rw-r--r--  meson.build                                      14
-rw-r--r--  src/backend/optimizer/path/costsize.c            54
-rw-r--r--  src/backend/optimizer/plan/createplan.c          66
-rw-r--r--  src/backend/optimizer/util/pathnode.c            73
-rw-r--r--  src/backend/storage/aio/method_io_uring.c       210
-rw-r--r--  src/backend/utils/adt/xml.c                      21
-rw-r--r--  src/bin/pg_test_timing/pg_test_timing.c         172
-rw-r--r--  src/bin/pg_test_timing/t/001_basic.pl            17
-rw-r--r--  src/bin/pg_walsummary/t/002_blocks.pl             9
-rw-r--r--  src/bin/psql/command.c                            7
-rw-r--r--  src/bin/psql/common.c                            35
-rw-r--r--  src/bin/psql/common.h                             1
-rw-r--r--  src/bin/psql/prompt.c                             8
-rw-r--r--  src/include/optimizer/cost.h                      2
-rw-r--r--  src/include/pg_config.h.in                        3
-rw-r--r--  src/interfaces/libpq/exports.txt                 11
-rw-r--r--  src/interfaces/libpq/fe-connect.c                 8
-rw-r--r--  src/interfaces/libpq/libpq-fe.h                   1
-rw-r--r--  src/test/regress/expected/incremental_sort.out   40
-rw-r--r--  src/test/regress/expected/inherit.out            10
-rw-r--r--  src/test/regress/sql/incremental_sort.sql        24
-rw-r--r--  src/tools/pgindent/typedefs.list                  1
32 files changed, 852 insertions(+), 369 deletions(-)
diff --git a/configure b/configure
index 16ef5b58d1a..cfaf3757dd7 100755
--- a/configure
+++ b/configure
@@ -13309,6 +13309,23 @@ fi
fi
+if test "$with_liburing" = yes; then
+ _LIBS="$LIBS"
+ LIBS="$LIBURING_LIBS $LIBS"
+ for ac_func in io_uring_queue_init_mem
+do :
+ ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem"
+if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_IO_URING_QUEUE_INIT_MEM 1
+_ACEOF
+
+fi
+done
+
+ LIBS="$_LIBS"
+fi
+
if test "$with_lz4" = yes ; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }
diff --git a/configure.ac b/configure.ac
index b3efc49c97a..c2877e36935 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then
AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
fi
+if test "$with_liburing" = yes; then
+ _LIBS="$LIBS"
+ LIBS="$LIBURING_LIBS $LIBS"
+ AC_CHECK_FUNCS([io_uring_queue_init_mem])
+ LIBS="$_LIBS"
+fi
+
if test "$with_lz4" = yes ; then
AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
fi
diff --git a/contrib/xml2/xpath.c b/contrib/xml2/xpath.c
index 11216b9b7f9..4ac291c8251 100644
--- a/contrib/xml2/xpath.c
+++ b/contrib/xml2/xpath.c
@@ -54,7 +54,7 @@ static xmlChar *pgxml_texttoxmlchar(text *textstring);
static xpath_workspace *pgxml_xpath(text *document, xmlChar *xpath,
PgXmlErrorContext *xmlerrcxt);
-static void cleanup_workspace(volatile xpath_workspace *workspace);
+static void cleanup_workspace(xpath_workspace *workspace);
/*
@@ -88,8 +88,8 @@ Datum
xml_encode_special_chars(PG_FUNCTION_ARGS)
{
text *tin = PG_GETARG_TEXT_PP(0);
- text *tout;
- volatile xmlChar *tt = NULL;
+ text *volatile tout = NULL;
+ xmlChar *volatile tt = NULL;
PgXmlErrorContext *xmlerrcxt;
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
@@ -111,7 +111,7 @@ xml_encode_special_chars(PG_FUNCTION_ARGS)
PG_CATCH();
{
if (tt != NULL)
- xmlFree((xmlChar *) tt);
+ xmlFree(tt);
pg_xml_done(xmlerrcxt, true);
@@ -120,7 +120,7 @@ xml_encode_special_chars(PG_FUNCTION_ARGS)
PG_END_TRY();
if (tt != NULL)
- xmlFree((xmlChar *) tt);
+ xmlFree(tt);
pg_xml_done(xmlerrcxt, false);
@@ -145,11 +145,10 @@ pgxmlNodeSetToText(xmlNodeSetPtr nodeset,
xmlChar *plainsep)
{
volatile xmlBufferPtr buf = NULL;
- xmlChar *result;
- int i;
+ xmlChar *volatile result = NULL;
PgXmlErrorContext *xmlerrcxt;
- /* spin some error handling */
+ /* spin up some error handling */
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
PG_TRY();
@@ -168,7 +167,7 @@ pgxmlNodeSetToText(xmlNodeSetPtr nodeset,
}
if (nodeset != NULL)
{
- for (i = 0; i < nodeset->nodeNr; i++)
+ for (int i = 0; i < nodeset->nodeNr; i++)
{
if (plainsep != NULL)
{
@@ -257,8 +256,8 @@ xpath_nodeset(PG_FUNCTION_ARGS)
xmlChar *toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2));
xmlChar *septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(3));
xmlChar *xpath;
- text *xpres;
- volatile xpath_workspace *workspace;
+ text *volatile xpres = NULL;
+ xpath_workspace *volatile workspace = NULL;
PgXmlErrorContext *xmlerrcxt;
xpath = pgxml_texttoxmlchar(xpathsupp);
@@ -302,8 +301,8 @@ xpath_list(PG_FUNCTION_ARGS)
text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */
xmlChar *plainsep = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2));
xmlChar *xpath;
- text *xpres;
- volatile xpath_workspace *workspace;
+ text *volatile xpres = NULL;
+ xpath_workspace *volatile workspace = NULL;
PgXmlErrorContext *xmlerrcxt;
xpath = pgxml_texttoxmlchar(xpathsupp);
@@ -344,8 +343,8 @@ xpath_string(PG_FUNCTION_ARGS)
text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */
xmlChar *xpath;
int32 pathsize;
- text *xpres;
- volatile xpath_workspace *workspace;
+ text *volatile xpres = NULL;
+ xpath_workspace *volatile workspace = NULL;
PgXmlErrorContext *xmlerrcxt;
pathsize = VARSIZE_ANY_EXHDR(xpathsupp);
@@ -398,9 +397,9 @@ xpath_number(PG_FUNCTION_ARGS)
text *document = PG_GETARG_TEXT_PP(0);
text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */
xmlChar *xpath;
- float4 fRes = 0.0;
- bool isNull = false;
- volatile xpath_workspace *workspace = NULL;
+ volatile float4 fRes = 0.0;
+ volatile bool isNull = false;
+ xpath_workspace *volatile workspace = NULL;
PgXmlErrorContext *xmlerrcxt;
xpath = pgxml_texttoxmlchar(xpathsupp);
@@ -444,8 +443,8 @@ xpath_bool(PG_FUNCTION_ARGS)
text *document = PG_GETARG_TEXT_PP(0);
text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */
xmlChar *xpath;
- int bRes;
- volatile xpath_workspace *workspace = NULL;
+ volatile int bRes = 0;
+ xpath_workspace *volatile workspace = NULL;
PgXmlErrorContext *xmlerrcxt;
xpath = pgxml_texttoxmlchar(xpathsupp);
@@ -518,7 +517,7 @@ pgxml_xpath(text *document, xmlChar *xpath, PgXmlErrorContext *xmlerrcxt)
/* Clean up after processing the result of pgxml_xpath() */
static void
-cleanup_workspace(volatile xpath_workspace *workspace)
+cleanup_workspace(xpath_workspace *workspace)
{
if (workspace->res)
xmlXPathFreeObject(workspace->res);
@@ -537,9 +536,9 @@ pgxml_result_to_text(xmlXPathObjectPtr res,
xmlChar *septag,
xmlChar *plainsep)
{
- volatile xmlChar *xpresstr = NULL;
+ xmlChar *volatile xpresstr = NULL;
+ text *volatile xpres = NULL;
PgXmlErrorContext *xmlerrcxt;
- text *xpres;
if (res == NULL)
return NULL;
@@ -578,7 +577,7 @@ pgxml_result_to_text(xmlXPathObjectPtr res,
PG_CATCH();
{
if (xpresstr != NULL)
- xmlFree((xmlChar *) xpresstr);
+ xmlFree(xpresstr);
pg_xml_done(xmlerrcxt, true);
@@ -587,7 +586,7 @@ pgxml_result_to_text(xmlXPathObjectPtr res,
PG_END_TRY();
/* Free various storage */
- xmlFree((xmlChar *) xpresstr);
+ xmlFree(xpresstr);
pg_xml_done(xmlerrcxt, false);
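
A recurring change in the xpath.c hunks above is moving the volatile qualifier from the pointee to the pointer, e.g. "volatile xmlChar *tt" becoming "xmlChar *volatile tt". The standalone sketch below (an illustration only, not code from this patch) shows why the qualifier has to sit on the pointer variable itself when setjmp/longjmp-style error recovery such as PG_TRY/PG_CATCH is involved: only a volatile pointer is guaranteed to hold its latest value after the longjmp, and it can be handed to a deallocator without casting the qualifier away.

    #include <setjmp.h>
    #include <stdio.h>
    #include <stdlib.h>

    static jmp_buf env;

    int
    main(void)
    {
        /*
         * "char *volatile p" marks the pointer object itself volatile, so its
         * value is reloaded from memory after longjmp() rather than possibly
         * coming from a stale register.  "volatile char *p" would only mark
         * the pointed-to bytes volatile: the pointer could still be clobbered
         * across the longjmp, and free(p) would need a cast to drop the
         * qualifier.
         */
        char *volatile p = NULL;

        if (setjmp(env) == 0)
        {
            p = malloc(16);
            longjmp(env, 1);        /* simulate the error path (PG_CATCH) */
        }
        else
        {
            free(p);                /* p is reliably up to date here, no cast */
            printf("cleaned up\n");
        }
        return 0;
    }

This is also why the patch can drop the xmlFree((xmlChar *) tt) casts: once the pointed-to type is plain xmlChar (with only the variable itself volatile), no qualifier needs to be cast away when freeing.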
diff --git a/contrib/xml2/xslt_proc.c b/contrib/xml2/xslt_proc.c
index c8e7dd45ed5..53550c7dc24 100644
--- a/contrib/xml2/xslt_proc.c
+++ b/contrib/xml2/xslt_proc.c
@@ -48,7 +48,7 @@ xslt_process(PG_FUNCTION_ARGS)
text *doct = PG_GETARG_TEXT_PP(0);
text *ssheet = PG_GETARG_TEXT_PP(1);
- text *result;
+ text *volatile result = NULL;
text *paramstr;
const char **params;
PgXmlErrorContext *xmlerrcxt;
@@ -58,8 +58,7 @@ xslt_process(PG_FUNCTION_ARGS)
volatile xsltSecurityPrefsPtr xslt_sec_prefs = NULL;
volatile xsltTransformContextPtr xslt_ctxt = NULL;
volatile int resstat = -1;
- volatile xmlChar *resstr = NULL;
- int reslen = 0;
+ xmlChar *volatile resstr = NULL;
if (fcinfo->nargs == 3)
{
@@ -80,6 +79,7 @@ xslt_process(PG_FUNCTION_ARGS)
{
xmlDocPtr ssdoc;
bool xslt_sec_prefs_error;
+ int reslen = 0;
/* Parse document */
doctree = xmlReadMemory((char *) VARDATA_ANY(doct),
@@ -160,7 +160,7 @@ xslt_process(PG_FUNCTION_ARGS)
if (doctree != NULL)
xmlFreeDoc(doctree);
if (resstr != NULL)
- xmlFree((xmlChar *) resstr);
+ xmlFree(resstr);
xsltCleanupGlobals();
pg_xml_done(xmlerrcxt, true);
@@ -177,7 +177,7 @@ xslt_process(PG_FUNCTION_ARGS)
xsltCleanupGlobals();
if (resstr)
- xmlFree((xmlChar *) resstr);
+ xmlFree(resstr);
pg_xml_done(xmlerrcxt, false);
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index 49a7c180a80..0994e089311 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -5121,7 +5121,7 @@ WHERE ...
<literal>+(pg_lsn,numeric)</literal> and
<literal>-(pg_lsn,numeric)</literal> operators, respectively. Note that
the calculated LSN should be in the range of <type>pg_lsn</type> type,
- i.e., between <literal>0/0</literal> and
+ i.e., between <literal>0/00000000</literal> and
<literal>FFFFFFFF/FFFFFFFF</literal>.
</para>
</sect1>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 810b2b50f0d..c28aa71f570 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -28521,7 +28521,7 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres}
Returns information about the progress of the WAL summarizer. If the
WAL summarizer has never run since the instance was started, then
<literal>summarized_tli</literal> and <literal>summarized_lsn</literal>
- will be <literal>0</literal> and <literal>0/0</literal> respectively;
+ will be <literal>0</literal> and <literal>0/00000000</literal> respectively;
otherwise, they will be the TLI and ending LSN of the last WAL summary
file written to disk. If the WAL summarizer is currently running,
<literal>pending_lsn</literal> will be the ending LSN of the last
diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml
index 298c4b38ef9..b2c2cf9eac8 100644
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -2740,26 +2740,6 @@ char *PQport(const PGconn *conn);
</listitem>
</varlistentry>
- <varlistentry id="libpq-PQservice">
- <term><function>PQservice</function><indexterm><primary>PQservice</primary></indexterm></term>
-
- <listitem>
- <para>
- Returns the service of the active connection.
-
-<synopsis>
-char *PQservice(const PGconn *conn);
-</synopsis>
- </para>
-
- <para>
- <xref linkend="libpq-PQservice"/> returns <symbol>NULL</symbol> if the
- <parameter>conn</parameter> argument is <symbol>NULL</symbol>.
- Otherwise, if there was no service provided, it returns an empty string.
- </para>
- </listitem>
- </varlistentry>
-
<varlistentry id="libpq-PQtty">
<term><function>PQtty</function><indexterm><primary>PQtty</primary></indexterm></term>
diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml
index f317ed9c50e..e26f7f59d4a 100644
--- a/doc/src/sgml/logical-replication.sgml
+++ b/doc/src/sgml/logical-replication.sgml
@@ -709,8 +709,8 @@ HINT: To initiate replication, you must manually create the replication slot, e
</para>
<para>
- To confirm that the standby server is indeed ready for failover, follow these
- steps to verify that all necessary logical replication slots have been
+ To confirm that the standby server is indeed ready for failover for a given subscriber, follow these
+ steps to verify that all the logical replication slots required by that subscriber have been
synchronized to the standby server:
</para>
@@ -764,7 +764,7 @@ HINT: To initiate replication, you must manually create the replication slot, e
Check that the logical replication slots identified above exist on
the standby server and are ready for failover.
<programlisting>
-/* standby # */ SELECT slot_name, (synced AND NOT temporary AND NOT conflicting) AS failover_ready
+/* standby # */ SELECT slot_name, (synced AND NOT temporary AND invalidation_reason IS NULL) AS failover_ready
FROM pg_replication_slots
WHERE slot_name IN
('sub1','sub2','sub3', 'pg_16394_sync_16385_7394666715149055164');
@@ -782,10 +782,42 @@ HINT: To initiate replication, you must manually create the replication slot, e
<para>
If all the slots are present on the standby server and the result
(<literal>failover_ready</literal>) of the above SQL query is true, then
- existing subscriptions can continue subscribing to publications now on the
- new primary server.
+ existing subscriptions can continue subscribing to publications on the new
+ primary server.
+ </para>
+
+ <para>
+ The first two steps in the above procedure are meant for a
+ <productname>PostgreSQL</productname> subscriber. It is recommended to run
+ these steps on each subscriber node, that will be served by the designated
+ standby after failover, to obtain the complete list of replication
+ slots. This list can then be verified in Step 3 to ensure failover readiness.
+ Non-<productname>PostgreSQL</productname> subscribers, on the other hand, may
+ use their own methods to identify the replication slots used by their
+ respective subscriptions.
+ </para>
+
+ <para>
+ In some cases, such as during a planned failover, it is necessary to confirm
+ that all subscribers, whether <productname>PostgreSQL</productname> or
+ non-<productname>PostgreSQL</productname>, will be able to continue
+ replication after failover to a given standby server. In such cases, use the
+ following SQL, instead of performing the first two steps above, to identify
+ which replication slots on the primary need to be synced to the standby that
+ is intended for promotion. This query returns the relevant replication slots
+ associated with all the failover-enabled subscriptions.
</para>
+ <para>
+<programlisting>
+/* primary # */ SELECT array_agg(quote_literal(r.slot_name)) AS slots
+ FROM pg_replication_slots r
+ WHERE r.failover AND NOT r.temporary;
+ slots
+-------
+ {'sub1','sub2','sub3', 'pg_16394_sync_16385_7394666715149055164'}
+(1 row)
+</programlisting></para>
</sect1>
<sect1 id="logical-replication-row-filter">
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 12873d17d7f..12929333665 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -741,9 +741,9 @@ test=# SELECT first_tid, nbytes, tids[0:5] AS some_tids
For example:
<screen>
test=# SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
- lsn │ nsn │ rightlink │ flags
-────────────┼────────────┼───────────┼────────
- 0/0B5FE088 │ 0/00000000 │ 1 │ {leaf}
+ lsn | nsn | rightlink | flags
+------------+------------+-----------+--------
+ 0/0B5FE088 | 0/00000000 | 1 | {leaf}
(1 row)
</screen>
</para>
diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml
index a5eb3aa25e0..1fcdbf7f06e 100644
--- a/doc/src/sgml/ref/pgtesttiming.sgml
+++ b/doc/src/sgml/ref/pgtesttiming.sgml
@@ -30,11 +30,23 @@ PostgreSQL documentation
<title>Description</title>
<para>
- <application>pg_test_timing</application> is a tool to measure the timing overhead
- on your system and confirm that the system time never moves backwards.
+ <application>pg_test_timing</application> is a tool to measure the
+ timing overhead on your system and confirm that the system time never
+ moves backwards. It simply reads the system clock over and over again
+ as fast as it can for a specified length of time, and then prints
+ statistics about the observed differences in successive clock readings.
+ </para>
+ <para>
+ Smaller (but not zero) differences are better, since they imply both
+ more-precise clock hardware and less overhead to collect a clock reading.
Systems that are slow to collect timing data can give less accurate
<command>EXPLAIN ANALYZE</command> results.
</para>
+ <para>
+ This tool is also helpful to determine if
+ the <varname>track_io_timing</varname> configuration parameter is likely
+ to produce useful results.
+ </para>
</refsect1>
<refsect1>
@@ -60,6 +72,21 @@ PostgreSQL documentation
</varlistentry>
<varlistentry>
+ <term><option>-c <replaceable class="parameter">cutoff</replaceable></option></term>
+ <term><option>--cutoff=<replaceable class="parameter">cutoff</replaceable></option></term>
+ <listitem>
+ <para>
+ Specifies the cutoff percentage for the list of exact observed
+ timing durations (that is, the changes in the system clock value
+ from one reading to the next). The list will end once the running
+ percentage total reaches or exceeds this value, except that the
+ largest observed duration will always be printed. The default
+ cutoff is 99.99.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><option>-V</option></term>
<term><option>--version</option></term>
<listitem>
@@ -92,205 +119,83 @@ PostgreSQL documentation
<title>Interpreting Results</title>
<para>
- Good results will show most (>90%) individual timing calls take less than
- one microsecond. Average per loop overhead will be even lower, below 100
- nanoseconds. This example from an Intel i7-860 system using a TSC clock
- source shows excellent performance:
-
-<screen><![CDATA[
-Testing timing overhead for 3 seconds.
-Per loop time including overhead: 35.96 ns
-Histogram of timing durations:
- < us % of total count
- 1 96.40465 80435604
- 2 3.59518 2999652
- 4 0.00015 126
- 8 0.00002 13
- 16 0.00000 2
-]]></screen>
+ The first block of output has four columns, with rows showing a
+ shifted-by-one log2(ns) histogram of timing durations (that is, the
+ differences between successive clock readings). This is not the
+ classic log2(n+1) histogram as it counts zeros separately and then
+ switches to log2(ns) starting from value 1.
</para>
-
<para>
- Note that different units are used for the per loop time than the
- histogram. The loop can have resolution within a few nanoseconds (ns),
- while the individual timing calls can only resolve down to one microsecond
- (us).
+ The columns are:
+ <itemizedlist spacing="compact">
+ <listitem>
+ <simpara>nanosecond value that is &gt;= the durations in this
+ bucket</simpara>
+ </listitem>
+ <listitem>
+ <simpara>percentage of durations in this bucket</simpara>
+ </listitem>
+ <listitem>
+ <simpara>running-sum percentage of durations in this and previous
+ buckets</simpara>
+ </listitem>
+ <listitem>
+ <simpara>count of durations in this bucket</simpara>
+ </listitem>
+ </itemizedlist>
</para>
-
- </refsect2>
- <refsect2>
- <title>Measuring Executor Timing Overhead</title>
-
<para>
- When the query executor is running a statement using
- <command>EXPLAIN ANALYZE</command>, individual operations are timed as well
- as showing a summary. The overhead of your system can be checked by
- counting rows with the <application>psql</application> program:
-
-<screen>
-CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
-\timing
-SELECT COUNT(*) FROM t;
-EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
-</screen>
+ The second block of output goes into more detail, showing the exact
+ timing differences observed. For brevity this list is cut off when the
+ running-sum percentage exceeds the user-selectable cutoff value.
+ However, the largest observed difference is always shown.
</para>
-
<para>
- The i7-860 system measured runs the count query in 9.8 ms while
- the <command>EXPLAIN ANALYZE</command> version takes 16.6 ms, each
- processing just over 100,000 rows. That 6.8 ms difference means the timing
- overhead per row is 68 ns, about twice what pg_test_timing estimated it
- would be. Even that relatively small amount of overhead is making the fully
- timed count statement take almost 70% longer. On more substantial queries,
- the timing overhead would be less problematic.
+ The example results below show that 99.99% of timing loops took between
+ 8 and 31 nanoseconds, with the worst case somewhere between 32768 and
+ 65535 nanoseconds. In the second block, we can see that typical loop
+ time is 16 nanoseconds, and the readings appear to have full nanosecond
+ precision.
</para>
- </refsect2>
-
- <refsect2>
- <title>Changing Time Sources</title>
<para>
- On some newer Linux systems, it's possible to change the clock source used
- to collect timing data at any time. A second example shows the slowdown
- possible from switching to the slower acpi_pm time source, on the same
- system used for the fast results above:
-
<screen><![CDATA[
-# cat /sys/devices/system/clocksource/clocksource0/available_clocksource
-tsc hpet acpi_pm
-# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource
-# pg_test_timing
-Per loop time including overhead: 722.92 ns
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 16.40 ns
Histogram of timing durations:
- < us % of total count
- 1 27.84870 1155682
- 2 72.05956 2990371
- 4 0.07810 3241
- 8 0.01357 563
- 16 0.00007 3
+ <= ns % of total running % count
+ 0 0.0000 0.0000 0
+ 1 0.0000 0.0000 0
+ 3 0.0000 0.0000 0
+ 7 0.0000 0.0000 0
+ 15 4.5452 4.5452 8313178
+ 31 95.4527 99.9979 174581501
+ 63 0.0001 99.9981 253
+ 127 0.0001 99.9982 165
+ 255 0.0000 99.9982 35
+ 511 0.0000 99.9982 1
+ 1023 0.0013 99.9994 2300
+ 2047 0.0004 99.9998 690
+ 4095 0.0000 99.9998 9
+ 8191 0.0000 99.9998 8
+ 16383 0.0002 100.0000 337
+ 32767 0.0000 100.0000 2
+ 65535 0.0000 100.0000 1
+
+Observed timing durations up to 99.9900%:
+ ns % of total running % count
+ 15 4.5452 4.5452 8313178
+ 16 58.3785 62.9237 106773354
+ 17 33.6840 96.6078 61607584
+ 18 3.1151 99.7229 5697480
+ 19 0.2638 99.9867 482570
+ 20 0.0093 99.9960 17054
+...
+ 38051 0.0000 100.0000 1
]]></screen>
</para>
- <para>
- In this configuration, the sample <command>EXPLAIN ANALYZE</command> above
- takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple
- of what's measured directly by this utility. That much timing overhead
- means the actual query itself is only taking a tiny fraction of the
- accounted for time, most of it is being consumed in overhead instead. In
- this configuration, any <command>EXPLAIN ANALYZE</command> totals involving
- many timed operations would be inflated significantly by timing overhead.
- </para>
-
- <para>
- FreeBSD also allows changing the time source on the fly, and it logs
- information about the timer selected during boot:
-
-<screen>
-# dmesg | grep "Timecounter"
-Timecounter "ACPI-fast" frequency 3579545 Hz quality 900
-Timecounter "i8254" frequency 1193182 Hz quality 0
-Timecounters tick every 10.000 msec
-Timecounter "TSC" frequency 2531787134 Hz quality 800
-# sysctl kern.timecounter.hardware=TSC
-kern.timecounter.hardware: ACPI-fast -> TSC
-</screen>
- </para>
-
- <para>
- Other systems may only allow setting the time source on boot. On older
- Linux systems the "clock" kernel setting is the only way to make this sort
- of change. And even on some more recent ones, the only option you'll see
- for a clock source is "jiffies". Jiffies are the older Linux software clock
- implementation, which can have good resolution when it's backed by fast
- enough timing hardware, as in this example:
-
-<screen><![CDATA[
-$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource
-jiffies
-$ dmesg | grep time.c
-time.c: Using 3.579545 MHz WALL PM GTOD PIT/TSC timer.
-time.c: Detected 2400.153 MHz processor.
-$ pg_test_timing
-Testing timing overhead for 3 seconds.
-Per timing duration including loop overhead: 97.75 ns
-Histogram of timing durations:
- < us % of total count
- 1 90.23734 27694571
- 2 9.75277 2993204
- 4 0.00981 3010
- 8 0.00007 22
- 16 0.00000 1
- 32 0.00000 1
-]]></screen></para>
-
</refsect2>
-
- <refsect2>
- <title>Clock Hardware and Timing Accuracy</title>
-
- <para>
- Collecting accurate timing information is normally done on computers using
- hardware clocks with various levels of accuracy. With some hardware the
- operating systems can pass the system clock time almost directly to
- programs. A system clock can also be derived from a chip that simply
- provides timing interrupts, periodic ticks at some known time interval. In
- either case, operating system kernels provide a clock source that hides
- these details. But the accuracy of that clock source and how quickly it can
- return results varies based on the underlying hardware.
- </para>
-
- <para>
- Inaccurate time keeping can result in system instability. Test any change
- to the clock source very carefully. Operating system defaults are sometimes
- made to favor reliability over best accuracy. And if you are using a virtual
- machine, look into the recommended time sources compatible with it. Virtual
- hardware faces additional difficulties when emulating timers, and there are
- often per operating system settings suggested by vendors.
- </para>
-
- <para>
- The Time Stamp Counter (TSC) clock source is the most accurate one available
- on current generation CPUs. It's the preferred way to track the system time
- when it's supported by the operating system and the TSC clock is
- reliable. There are several ways that TSC can fail to provide an accurate
- timing source, making it unreliable. Older systems can have a TSC clock that
- varies based on the CPU temperature, making it unusable for timing. Trying
- to use TSC on some older multicore CPUs can give a reported time that's
- inconsistent among multiple cores. This can result in the time going
- backwards, a problem this program checks for. And even the newest systems
- can fail to provide accurate TSC timing with very aggressive power saving
- configurations.
- </para>
-
- <para>
- Newer operating systems may check for the known TSC problems and switch to a
- slower, more stable clock source when they are seen. If your system
- supports TSC time but doesn't default to that, it may be disabled for a good
- reason. And some operating systems may not detect all the possible problems
- correctly, or will allow using TSC even in situations where it's known to be
- inaccurate.
- </para>
-
- <para>
- The High Precision Event Timer (HPET) is the preferred timer on systems
- where it's available and TSC is not accurate. The timer chip itself is
- programmable to allow up to 100 nanosecond resolution, but you may not see
- that much accuracy in your system clock.
- </para>
-
- <para>
- Advanced Configuration and Power Interface (ACPI) provides a Power
- Management (PM) Timer, which Linux refers to as the acpi_pm. The clock
- derived from acpi_pm will at best provide 300 nanosecond resolution.
- </para>
-
- <para>
- Timers used on older PC hardware include the 8254 Programmable Interval
- Timer (PIT), the real-time clock (RTC), the Advanced Programmable Interrupt
- Controller (APIC) timer, and the Cyclone timer. These timers aim for
- millisecond resolution.
- </para>
- </refsect2>
</refsect1>
<refsect1>
@@ -298,6 +203,8 @@ Histogram of timing durations:
<simplelist type="inline">
<member><xref linkend="sql-explain"/></member>
+ <member><ulink url="https://wiki.postgresql.org/wiki/Pg_test_timing">Wiki
+ discussion about timing</ulink></member>
</simplelist>
</refsect1>
</refentry>
diff --git a/meson.build b/meson.build
index a97854a947d..5365aaf95e6 100644
--- a/meson.build
+++ b/meson.build
@@ -948,10 +948,10 @@ if not libcurlopt.disabled()
# libcurl and one of either epoll or kqueue.
oauth_flow_supported = (
libcurl.found()
- and (cc.check_header('sys/event.h', required: false,
- args: test_c_args, include_directories: postgres_inc)
- or cc.check_header('sys/epoll.h', required: false,
- args: test_c_args, include_directories: postgres_inc))
+ and (cc.has_header('sys/event.h',
+ args: test_c_args, include_directories: postgres_inc)
+ or cc.has_header('sys/epoll.h',
+ args: test_c_args, include_directories: postgres_inc))
)
if oauth_flow_supported
@@ -995,6 +995,12 @@ liburingopt = get_option('liburing')
liburing = dependency('liburing', required: liburingopt)
if liburing.found()
cdata.set('USE_LIBURING', 1)
+
+ if cc.has_function('io_uring_queue_init_mem',
+ dependencies: liburing, args: test_c_args)
+ cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1)
+ endif
+
endif
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 3d44815ed5a..1f04a2c182c 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2247,7 +2247,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
* Determines and returns the cost of an Append node.
*/
void
-cost_append(AppendPath *apath)
+cost_append(AppendPath *apath, PlannerInfo *root)
{
ListCell *l;
@@ -2309,26 +2309,52 @@ cost_append(AppendPath *apath)
foreach(l, apath->subpaths)
{
Path *subpath = (Path *) lfirst(l);
- Path sort_path; /* dummy for result of cost_sort */
+ int presorted_keys;
+ Path sort_path; /* dummy for result of
+ * cost_sort/cost_incremental_sort */
- if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
+ if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys,
+ &presorted_keys))
{
/*
* We'll need to insert a Sort node, so include costs for
- * that. We can use the parent's LIMIT if any, since we
+ * that. We choose to use incremental sort if it is
+ * enabled and there are presorted keys; otherwise we use
+ * full sort.
+ *
+ * We can use the parent's LIMIT if any, since we
* certainly won't pull more than that many tuples from
* any child.
*/
- cost_sort(&sort_path,
- NULL, /* doesn't currently need root */
- pathkeys,
- subpath->disabled_nodes,
- subpath->total_cost,
- subpath->rows,
- subpath->pathtarget->width,
- 0.0,
- work_mem,
- apath->limit_tuples);
+ if (enable_incremental_sort && presorted_keys > 0)
+ {
+ cost_incremental_sort(&sort_path,
+ root,
+ pathkeys,
+ presorted_keys,
+ subpath->disabled_nodes,
+ subpath->startup_cost,
+ subpath->total_cost,
+ subpath->rows,
+ subpath->pathtarget->width,
+ 0.0,
+ work_mem,
+ apath->limit_tuples);
+ }
+ else
+ {
+ cost_sort(&sort_path,
+ root,
+ pathkeys,
+ subpath->disabled_nodes,
+ subpath->total_cost,
+ subpath->rows,
+ subpath->pathtarget->width,
+ 0.0,
+ work_mem,
+ apath->limit_tuples);
+ }
+
subpath = &sort_path;
}
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 0b61aef962c..8a9f1d7a943 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -1318,6 +1318,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags)
Oid *sortOperators;
Oid *collations;
bool *nullsFirst;
+ int presorted_keys;
/*
* Compute sort column info, and adjust subplan's tlist as needed.
@@ -1353,14 +1354,38 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags)
numsortkeys * sizeof(bool)) == 0);
/* Now, insert a Sort node if subplan isn't sufficiently ordered */
- if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
+ if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys,
+ &presorted_keys))
{
- Sort *sort = make_sort(subplan, numsortkeys,
+ Plan *sort_plan;
+
+ /*
+ * We choose to use incremental sort if it is enabled and
+ * there are presorted keys; otherwise we use full sort.
+ */
+ if (enable_incremental_sort && presorted_keys > 0)
+ {
+ sort_plan = (Plan *)
+ make_incrementalsort(subplan, numsortkeys, presorted_keys,
sortColIdx, sortOperators,
collations, nullsFirst);
- label_sort_with_costsize(root, sort, best_path->limit_tuples);
- subplan = (Plan *) sort;
+ label_incrementalsort_with_costsize(root,
+ (IncrementalSort *) sort_plan,
+ pathkeys,
+ best_path->limit_tuples);
+ }
+ else
+ {
+ sort_plan = (Plan *) make_sort(subplan, numsortkeys,
+ sortColIdx, sortOperators,
+ collations, nullsFirst);
+
+ label_sort_with_costsize(root, (Sort *) sort_plan,
+ best_path->limit_tuples);
+ }
+
+ subplan = sort_plan;
}
}
@@ -1491,6 +1516,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path,
Oid *sortOperators;
Oid *collations;
bool *nullsFirst;
+ int presorted_keys;
/* Build the child plan */
/* Must insist that all children return the same tlist */
@@ -1525,14 +1551,38 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path,
numsortkeys * sizeof(bool)) == 0);
/* Now, insert a Sort node if subplan isn't sufficiently ordered */
- if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
+ if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys,
+ &presorted_keys))
{
- Sort *sort = make_sort(subplan, numsortkeys,
+ Plan *sort_plan;
+
+ /*
+ * We choose to use incremental sort if it is enabled and there
+ * are presorted keys; otherwise we use full sort.
+ */
+ if (enable_incremental_sort && presorted_keys > 0)
+ {
+ sort_plan = (Plan *)
+ make_incrementalsort(subplan, numsortkeys, presorted_keys,
sortColIdx, sortOperators,
collations, nullsFirst);
- label_sort_with_costsize(root, sort, best_path->limit_tuples);
- subplan = (Plan *) sort;
+ label_incrementalsort_with_costsize(root,
+ (IncrementalSort *) sort_plan,
+ pathkeys,
+ best_path->limit_tuples);
+ }
+ else
+ {
+ sort_plan = (Plan *) make_sort(subplan, numsortkeys,
+ sortColIdx, sortOperators,
+ collations, nullsFirst);
+
+ label_sort_with_costsize(root, (Sort *) sort_plan,
+ best_path->limit_tuples);
+ }
+
+ subplan = sort_plan;
}
subplans = lappend(subplans, subplan);
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index e0192d4a491..9cc602788ea 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1404,12 +1404,12 @@ create_append_path(PlannerInfo *root,
pathnode->path.total_cost = child->total_cost;
}
else
- cost_append(pathnode);
+ cost_append(pathnode, root);
/* Must do this last, else cost_append complains */
pathnode->path.pathkeys = child->pathkeys;
}
else
- cost_append(pathnode);
+ cost_append(pathnode, root);
/* If the caller provided a row estimate, override the computed value. */
if (rows >= 0)
@@ -1515,6 +1515,9 @@ create_merge_append_path(PlannerInfo *root,
foreach(l, subpaths)
{
Path *subpath = (Path *) lfirst(l);
+ int presorted_keys;
+ Path sort_path; /* dummy for result of
+ * cost_sort/cost_incremental_sort */
/* All child paths should be unparameterized */
Assert(bms_is_empty(PATH_REQ_OUTER(subpath)));
@@ -1523,32 +1526,52 @@ create_merge_append_path(PlannerInfo *root,
pathnode->path.parallel_safe = pathnode->path.parallel_safe &&
subpath->parallel_safe;
- if (pathkeys_contained_in(pathkeys, subpath->pathkeys))
+ if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys,
+ &presorted_keys))
{
- /* Subpath is adequately ordered, we won't need to sort it */
- input_disabled_nodes += subpath->disabled_nodes;
- input_startup_cost += subpath->startup_cost;
- input_total_cost += subpath->total_cost;
- }
- else
- {
- /* We'll need to insert a Sort node, so include cost for that */
- Path sort_path; /* dummy for result of cost_sort */
+ /*
+ * We'll need to insert a Sort node, so include costs for that. We
+ * choose to use incremental sort if it is enabled and there are
+ * presorted keys; otherwise we use full sort.
+ *
+ * We can use the parent's LIMIT if any, since we certainly won't
+ * pull more than that many tuples from any child.
+ */
+ if (enable_incremental_sort && presorted_keys > 0)
+ {
+ cost_incremental_sort(&sort_path,
+ root,
+ pathkeys,
+ presorted_keys,
+ subpath->disabled_nodes,
+ subpath->startup_cost,
+ subpath->total_cost,
+ subpath->rows,
+ subpath->pathtarget->width,
+ 0.0,
+ work_mem,
+ pathnode->limit_tuples);
+ }
+ else
+ {
+ cost_sort(&sort_path,
+ root,
+ pathkeys,
+ subpath->disabled_nodes,
+ subpath->total_cost,
+ subpath->rows,
+ subpath->pathtarget->width,
+ 0.0,
+ work_mem,
+ pathnode->limit_tuples);
+ }
- cost_sort(&sort_path,
- root,
- pathkeys,
- subpath->disabled_nodes,
- subpath->total_cost,
- subpath->rows,
- subpath->pathtarget->width,
- 0.0,
- work_mem,
- pathnode->limit_tuples);
- input_disabled_nodes += sort_path.disabled_nodes;
- input_startup_cost += sort_path.startup_cost;
- input_total_cost += sort_path.total_cost;
+ subpath = &sort_path;
}
+
+ input_disabled_nodes += subpath->disabled_nodes;
+ input_startup_cost += subpath->startup_cost;
+ input_total_cost += subpath->total_cost;
}
/*
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index b78048328e1..0a8c054162f 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -29,6 +29,9 @@
#ifdef IOMETHOD_IO_URING_ENABLED
+#include <sys/mman.h>
+#include <unistd.h>
+
#include <liburing.h>
#include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
struct io_uring io_uring_ring;
} PgAioUringContext;
+/*
+ * Information about the capabilities that io_uring has.
+ *
+ * Depending on liburing and kernel version different features are
+ * supported. At least for the kernel a kernel version check does not suffice
+ * as various vendors do backport features to older kernels :(.
+ */
+typedef struct PgAioUringCaps
+{
+ bool checked;
+ /* -1 if io_uring_queue_init_mem() is unsupported */
+ int mem_init_size;
+} PgAioUringCaps;
+
+
/* PgAioUringContexts for all backends */
static PgAioUringContext *pgaio_uring_contexts;
/* the current backend's context */
static PgAioUringContext *pgaio_my_uring_context;
+static PgAioUringCaps pgaio_uring_caps =
+{
+ .checked = false,
+ .mem_init_size = -1,
+};
static uint32
pgaio_uring_procs(void)
@@ -111,16 +134,145 @@ pgaio_uring_procs(void)
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
}
-static Size
+/*
+ * Initializes pgaio_uring_caps, unless that's already done.
+ */
+static void
+pgaio_uring_check_capabilities(void)
+{
+ if (pgaio_uring_caps.checked)
+ return;
+
+ /*
+ * By default io_uring creates a shared memory mapping for each io_uring
+ * instance, leading to a large number of memory mappings. Unfortunately a
+ * large number of memory mappings slows things down, backend exit is
+ * particularly affected. To address that, newer kernels (6.5) support
+ * using user-provided memory for the memory, by putting the relevant
+ * memory into shared memory we don't need any additional mappings.
+ *
+ * To know whether this is supported, we unfortunately need to probe the
+ * kernel by trying to create a ring with userspace-provided memory. This
+ * also has a secondary benefit: We can determine precisely how much
+ * memory we need for each io_uring instance.
+ */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+ {
+ struct io_uring test_ring;
+ size_t ring_size;
+ void *ring_ptr;
+ struct io_uring_params p = {0};
+ int ret;
+
+ /*
+ * Liburing does not yet provide an API to query how much memory a
+ * ring will need. So we over-estimate it here. As the memory is freed
+ * just below that's small temporary waste of memory.
+ *
+ * 1MB is more than enough for rings within io_max_concurrency's
+ * range.
+ */
+ ring_size = 1024 * 1024;
+
+ /*
+ * Hard to believe a system exists where 1MB would not be a multiple
+ * of the page size. But it's cheap to ensure...
+ */
+ ring_size -= ring_size % sysconf(_SC_PAGESIZE);
+
+ ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (ring_ptr == MAP_FAILED)
+ elog(ERROR,
+ "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
+ ring_size);
+
+ ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
+ if (ret > 0)
+ {
+ pgaio_uring_caps.mem_init_size = ret;
+
+ elog(DEBUG1,
+ "can use combined memory mapping for io_uring, each ring needs %d bytes",
+ ret);
+
+ /* clean up the created ring, it was just for a test */
+ io_uring_queue_exit(&test_ring);
+ }
+ else
+ {
+ /*
+ * There are different reasons for ring creation to fail, but it's
+ * ok to treat that just as io_uring_queue_init_mem() not being
+ * supported. We'll report a more detailed error in
+ * pgaio_uring_shmem_init().
+ */
+ errno = -ret;
+ elog(DEBUG1,
+ "cannot use combined memory mapping for io_uring, ring creation failed: %m");
+
+ }
+
+ if (munmap(ring_ptr, ring_size) != 0)
+ elog(ERROR, "munmap() failed: %m");
+ }
+#else
+ {
+ elog(DEBUG1,
+ "can't use combined memory mapping for io_uring, kernel or liburing too old");
+ }
+#endif
+
+ pgaio_uring_caps.checked = true;
+}
+
+/*
+ * Memory for all PgAioUringContext instances
+ */
+static size_t
pgaio_uring_context_shmem_size(void)
{
return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
}
+/*
+ * Memory for the combined memory used by io_uring instances. Returns 0 if
+ * that is not supported by kernel/liburing.
+ */
+static size_t
+pgaio_uring_ring_shmem_size(void)
+{
+ size_t sz = 0;
+
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ /*
+ * Memory for rings needs to be allocated to the page boundary,
+ * reserve space. Luckily it does not need to be aligned to hugepage
+ * boundaries, even if huge pages are used.
+ */
+ sz = add_size(sz, sysconf(_SC_PAGESIZE));
+ sz = add_size(sz, mul_size(pgaio_uring_procs(),
+ pgaio_uring_caps.mem_init_size));
+ }
+
+ return sz;
+}
+
static size_t
pgaio_uring_shmem_size(void)
{
- return pgaio_uring_context_shmem_size();
+ size_t sz;
+
+ /*
+ * Kernel and liburing support for various features influences how much
+ * shmem we need, perform the necessary checks.
+ */
+ pgaio_uring_check_capabilities();
+
+ sz = pgaio_uring_context_shmem_size();
+ sz = add_size(sz, pgaio_uring_ring_shmem_size());
+
+ return sz;
}
static void
@@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time)
{
int TotalProcs = pgaio_uring_procs();
bool found;
+ char *shmem;
+ size_t ring_mem_remain = 0;
+ char *ring_mem_next = 0;
- pgaio_uring_contexts = (PgAioUringContext *)
- ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found);
-
+ /*
+ * We allocate memory for all PgAioUringContext instances and, if
+ * supported, the memory required for each of the io_uring instances, in
+ * one ShmemInitStruct().
+ */
+ shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
if (found)
return;
+ pgaio_uring_contexts = (PgAioUringContext *) shmem;
+ shmem += pgaio_uring_context_shmem_size();
+
+ /* if supported, handle memory alignment / sizing for io_uring memory */
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ ring_mem_remain = pgaio_uring_ring_shmem_size();
+ ring_mem_next = (char *) shmem;
+
+ /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
+ ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);
+
+ /* account for alignment */
+ ring_mem_remain -= ring_mem_next - shmem;
+ shmem += ring_mem_next - shmem;
+
+ shmem += ring_mem_remain;
+ }
+
for (int contextno = 0; contextno < TotalProcs; contextno++)
{
PgAioUringContext *context = &pgaio_uring_contexts[contextno];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
* be worth using that - also need to evaluate if that causes
* noticeable additional contention?
*/
- ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+
+ /*
+ * If supported (c.f. pgaio_uring_check_capabilities()), create ring
+ * with its data in shared memory. Otherwise fall back io_uring
+ * creating a memory mapping for each ring.
+ */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ struct io_uring_params p = {0};
+
+ ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
+
+ ring_mem_remain -= ret;
+ ring_mem_next += ret;
+ }
+ else
+#endif
+ {
+ ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+ }
+
if (ret < 0)
{
char *hint = NULL;
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 2bd39b6ac4b..f7b731825fc 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -532,7 +532,7 @@ xmltext(PG_FUNCTION_ARGS)
volatile xmlChar *xmlbuf = NULL;
PgXmlErrorContext *xmlerrcxt;
- /* Otherwise, we gotta spin up some error handling. */
+ /* First we gotta spin up some error handling. */
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
PG_TRY();
@@ -685,7 +685,7 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
volatile xmlBufferPtr buf = NULL;
volatile xmlSaveCtxtPtr ctxt = NULL;
ErrorSaveContext escontext = {T_ErrorSaveContext};
- PgXmlErrorContext *xmlerrcxt;
+ PgXmlErrorContext *volatile xmlerrcxt = NULL;
#endif
if (xmloption_arg != XMLOPTION_DOCUMENT && !indent)
@@ -726,13 +726,18 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
return (text *) data;
}
- /* Otherwise, we gotta spin up some error handling. */
- xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
-
+ /*
+ * Otherwise, we gotta spin up some error handling. Unlike most other
+ * routines in this module, we already have a libxml "doc" structure to
+ * free, so we need to call pg_xml_init() inside the PG_TRY and be
+ * prepared for it to fail (typically due to palloc OOM).
+ */
PG_TRY();
{
size_t decl_len = 0;
+ xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
+
/* The serialized data will go into this buffer. */
buf = xmlBufferCreate();
@@ -863,10 +868,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
xmlSaveClose(ctxt);
if (buf)
xmlBufferFree(buf);
- if (doc)
- xmlFreeDoc(doc);
+ xmlFreeDoc(doc);
- pg_xml_done(xmlerrcxt, true);
+ if (xmlerrcxt)
+ pg_xml_done(xmlerrcxt, true);
PG_RE_THROW();
}
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index ce7aad4b25a..64d080335eb 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -9,19 +9,30 @@
#include <limits.h>
#include "getopt_long.h"
+#include "port/pg_bitutils.h"
#include "portability/instr_time.h"
static const char *progname;
static unsigned int test_duration = 3;
+static double max_rprct = 99.99;
+
+/* record duration in powers of 2 nanoseconds */
+static long long int histogram[32];
+
+/* record counts of first 1024 durations directly */
+#define NUM_DIRECT 1024
+static long long int direct_histogram[NUM_DIRECT];
+
+/* separately record highest observed duration */
+static int32 largest_diff;
+static long long int largest_diff_count;
+
static void handle_args(int argc, char *argv[]);
static uint64 test_timing(unsigned int duration);
static void output(uint64 loop_count);
-/* record duration in powers of 2 microseconds */
-static long long int histogram[32];
-
int
main(int argc, char *argv[])
{
@@ -44,6 +55,7 @@ handle_args(int argc, char *argv[])
{
static struct option long_options[] = {
{"duration", required_argument, NULL, 'd'},
+ {"cutoff", required_argument, NULL, 'c'},
{NULL, 0, NULL, 0}
};
@@ -56,7 +68,7 @@ handle_args(int argc, char *argv[])
{
if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
{
- printf(_("Usage: %s [-d DURATION]\n"), progname);
+ printf(_("Usage: %s [-d DURATION] [-c CUTOFF]\n"), progname);
exit(0);
}
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
@@ -66,7 +78,7 @@ handle_args(int argc, char *argv[])
}
}
- while ((option = getopt_long(argc, argv, "d:",
+ while ((option = getopt_long(argc, argv, "d:c:",
long_options, &optindex)) != -1)
{
switch (option)
@@ -93,6 +105,26 @@ handle_args(int argc, char *argv[])
}
break;
+ case 'c':
+ errno = 0;
+ max_rprct = strtod(optarg, &endptr);
+
+ if (endptr == optarg || *endptr != '\0' || errno != 0)
+ {
+ fprintf(stderr, _("%s: invalid argument for option %s\n"),
+ progname, "--cutoff");
+ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+ exit(1);
+ }
+
+ if (max_rprct < 0 || max_rprct > 100)
+ {
+ fprintf(stderr, _("%s: %s must be in range %u..%u\n"),
+ progname, "--cutoff", 0, 100);
+ exit(1);
+ }
+ break;
+
default:
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
progname);
@@ -111,7 +143,6 @@ handle_args(int argc, char *argv[])
exit(1);
}
-
printf(ngettext("Testing timing overhead for %u second.\n",
"Testing timing overhead for %u seconds.\n",
test_duration),
@@ -130,19 +161,19 @@ test_timing(unsigned int duration)
end_time,
temp;
- total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
+ total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
INSTR_TIME_SET_CURRENT(start_time);
- cur = INSTR_TIME_GET_MICROSEC(start_time);
+ cur = INSTR_TIME_GET_NANOSEC(start_time);
while (time_elapsed < total_time)
{
int32 diff,
- bits = 0;
+ bits;
prev = cur;
INSTR_TIME_SET_CURRENT(temp);
- cur = INSTR_TIME_GET_MICROSEC(temp);
+ cur = INSTR_TIME_GET_NANOSEC(temp);
diff = cur - prev;
/* Did time go backwards? */
@@ -154,18 +185,30 @@ test_timing(unsigned int duration)
}
/* What is the highest bit in the time diff? */
- while (diff)
- {
- diff >>= 1;
- bits++;
- }
+ if (diff > 0)
+ bits = pg_leftmost_one_pos32(diff) + 1;
+ else
+ bits = 0;
/* Update appropriate duration bucket */
histogram[bits]++;
+ /* Update direct histogram of time diffs */
+ if (diff < NUM_DIRECT)
+ direct_histogram[diff]++;
+
+ /* Also track the largest observed duration, even if >= NUM_DIRECT */
+ if (diff > largest_diff)
+ {
+ largest_diff = diff;
+ largest_diff_count = 1;
+ }
+ else if (diff == largest_diff)
+ largest_diff_count++;
+
loop_count++;
INSTR_TIME_SUBTRACT(temp, start_time);
- time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
+ time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
}
INSTR_TIME_SET_CURRENT(end_time);
@@ -181,28 +224,95 @@ test_timing(unsigned int duration)
static void
output(uint64 loop_count)
{
- int64 max_bit = 31,
- i;
- char *header1 = _("< us");
- char *header2 = /* xgettext:no-c-format */ _("% of total");
- char *header3 = _("count");
+ int max_bit = 31;
+ const char *header1 = _("<= ns");
+ const char *header1b = _("ns");
+ const char *header2 = /* xgettext:no-c-format */ _("% of total");
+ const char *header3 = /* xgettext:no-c-format */ _("running %");
+ const char *header4 = _("count");
int len1 = strlen(header1);
int len2 = strlen(header2);
int len3 = strlen(header3);
+ int len4 = strlen(header4);
+ double rprct;
+ bool stopped = false;
/* find highest bit value */
while (max_bit > 0 && histogram[max_bit] == 0)
max_bit--;
+ /* set minimum column widths */
+ len1 = Max(8, len1);
+ len2 = Max(10, len2);
+ len3 = Max(10, len3);
+ len4 = Max(10, len4);
+
printf(_("Histogram of timing durations:\n"));
- printf("%*s %*s %*s\n",
- Max(6, len1), header1,
- Max(10, len2), header2,
- Max(10, len3), header3);
-
- for (i = 0; i <= max_bit; i++)
- printf("%*ld %*.5f %*lld\n",
- Max(6, len1), 1l << i,
- Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
- Max(10, len3), histogram[i]);
+ printf("%*s %*s %*s %*s\n",
+ len1, header1,
+ len2, header2,
+ len3, header3,
+ len4, header4);
+
+ rprct = 0;
+ for (int i = 0; i <= max_bit; i++)
+ {
+ double prct = (double) histogram[i] * 100 / loop_count;
+
+ rprct += prct;
+ printf("%*ld %*.4f %*.4f %*lld\n",
+ len1, (1L << i) - 1,
+ len2, prct,
+ len3, rprct,
+ len4, histogram[i]);
+ }
+
+ printf(_("\nObserved timing durations up to %.4f%%:\n"), max_rprct);
+ printf("%*s %*s %*s %*s\n",
+ len1, header1b,
+ len2, header2,
+ len3, header3,
+ len4, header4);
+
+ rprct = 0;
+ for (int i = 0; i < NUM_DIRECT; i++)
+ {
+ if (direct_histogram[i])
+ {
+ double prct = (double) direct_histogram[i] * 100 / loop_count;
+ bool print_it = !stopped;
+
+ rprct += prct;
+
+ /* if largest diff is < NUM_DIRECT, be sure we print it */
+ if (i == largest_diff)
+ {
+ if (stopped)
+ printf("...\n");
+ print_it = true;
+ }
+
+ if (print_it)
+ printf("%*d %*.4f %*.4f %*lld\n",
+ len1, i,
+ len2, prct,
+ len3, rprct,
+ len4, direct_histogram[i]);
+ if (rprct >= max_rprct)
+ stopped = true;
+ }
+ }
+
+ /* print largest diff when it's outside the array range */
+ if (largest_diff >= NUM_DIRECT)
+ {
+ double prct = (double) largest_diff_count * 100 / loop_count;
+
+ printf("...\n");
+ printf("%*d %*.4f %*.4f %*lld\n",
+ len1, largest_diff,
+ len2, prct,
+ len3, 100.0,
+ len4, largest_diff_count);
+ }
}
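
The pgtesttiming.sgml hunk earlier in this patch describes the first output block as a shifted-by-one log2(ns) histogram, and the code above implements that rule as pg_leftmost_one_pos32(diff) + 1, with zero diffs counted separately in bucket 0. The standalone sketch below (an illustration only, not part of the patch) reproduces the same bucketing with a plain shift loop and prints the "<= ns" labels the tool derives as (1 << bucket) - 1.

    #include <stdio.h>

    /* Bucket index: 0 for a zero duration, otherwise floor(log2(diff)) + 1. */
    static int
    bucket_index(unsigned int diff_ns)
    {
        int     bits = 0;

        while (diff_ns)
        {
            diff_ns >>= 1;
            bits++;
        }
        return bits;
    }

    int
    main(void)
    {
        unsigned int samples[] = {0, 1, 2, 3, 15, 16, 17, 31, 32, 40000};

        for (int i = 0; i < (int) (sizeof(samples) / sizeof(samples[0])); i++)
        {
            int     b = bucket_index(samples[i]);

            /* pg_test_timing labels this bucket with the bound (1 << b) - 1 */
            printf("%6u ns -> bucket %2d (<= %ld ns)\n",
                   samples[i], b, (1L << b) - 1);
        }
        return 0;
    }

For instance, a 17 ns reading lands in bucket 5, labelled "<= 31", which is why nearly all readings in the documentation's sample output fall on the 15 and 31 rows.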
diff --git a/src/bin/pg_test_timing/t/001_basic.pl b/src/bin/pg_test_timing/t/001_basic.pl
index 6554cd981af..9912acc052a 100644
--- a/src/bin/pg_test_timing/t/001_basic.pl
+++ b/src/bin/pg_test_timing/t/001_basic.pl
@@ -25,5 +25,22 @@ command_fails_like(
[ 'pg_test_timing', '--duration' => '0' ],
qr/\Qpg_test_timing: --duration must be in range 1..4294967295\E/,
'pg_test_timing: --duration must be in range');
+command_fails_like(
+ [ 'pg_test_timing', '--cutoff' => '101' ],
+ qr/\Qpg_test_timing: --cutoff must be in range 0..100\E/,
+ 'pg_test_timing: --cutoff must be in range');
+
+#########################################
+# We obviously can't check for specific output, but we can
+# do a simple run and make sure it produces something.
+
+command_like(
+ [ 'pg_test_timing', '--duration' => '1' ],
+ qr/
+\QTesting timing overhead for 1 second.\E.*
+\QHistogram of timing durations:\E.*
+\QObserved timing durations up to 99.9900%:\E
+/sx,
+ 'pg_test_timing: sanity check');
done_testing();
diff --git a/src/bin/pg_walsummary/t/002_blocks.pl b/src/bin/pg_walsummary/t/002_blocks.pl
index 270332780a4..0f98c7df82e 100644
--- a/src/bin/pg_walsummary/t/002_blocks.pl
+++ b/src/bin/pg_walsummary/t/002_blocks.pl
@@ -47,11 +47,12 @@ EOM
ok($result, "WAL summarization caught up after insert");
# The WAL summarizer should have generated some IO statistics.
-my $stats_reads = $node1->safe_psql(
+$node1->poll_query_until(
'postgres',
- qq{SELECT sum(reads) > 0 FROM pg_stat_io
- WHERE backend_type = 'walsummarizer' AND object = 'wal'});
-is($stats_reads, 't', "WAL summarizer generates statistics for WAL reads");
+ q{SELECT sum(reads) > 0 FROM pg_stat_io
+ WHERE backend_type = 'walsummarizer' AND object = 'wal'})
+ or die
+ "Timed out while waiting for WAL summarizer to generate statistics for WAL reads";
# Find the highest LSN that is summarized on disk.
my $summarized_lsn = $node1->safe_psql('postgres', <<EOM);
diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c
index 9fcd2db8326..0a55901b14e 100644
--- a/src/bin/psql/command.c
+++ b/src/bin/psql/command.c
@@ -4480,6 +4480,7 @@ SyncVariables(void)
{
char vbuf[32];
const char *server_version;
+ char *service_name;
/* get stuff from connection */
pset.encoding = PQclientEncoding(pset.db);
@@ -4489,12 +4490,16 @@ SyncVariables(void)
setFmtEncoding(pset.encoding);
SetVariable(pset.vars, "DBNAME", PQdb(pset.db));
- SetVariable(pset.vars, "SERVICE", PQservice(pset.db));
SetVariable(pset.vars, "USER", PQuser(pset.db));
SetVariable(pset.vars, "HOST", PQhost(pset.db));
SetVariable(pset.vars, "PORT", PQport(pset.db));
SetVariable(pset.vars, "ENCODING", pg_encoding_to_char(pset.encoding));
+ service_name = get_conninfo_value("service");
+ SetVariable(pset.vars, "SERVICE", service_name);
+ if (service_name)
+ pg_free(service_name);
+
/* this bit should match connection_warnings(): */
/* Try to get full text form of version, might include "devel" etc */
server_version = PQparameterStatus(pset.db, "server_version");
diff --git a/src/bin/psql/common.c b/src/bin/psql/common.c
index d2c0a49c46c..cd329ade12b 100644
--- a/src/bin/psql/common.c
+++ b/src/bin/psql/common.c
@@ -2531,6 +2531,41 @@ session_username(void)
return PQuser(pset.db);
}
+/*
+ * Return the value of option for keyword in the current connection.
+ *
+ * The caller is responsible for freeing the result value allocated.
+ */
+char *
+get_conninfo_value(const char *keyword)
+{
+ PQconninfoOption *opts;
+ PQconninfoOption *serviceopt = NULL;
+ char *res = NULL;
+
+ if (pset.db == NULL)
+ return NULL;
+
+ opts = PQconninfo(pset.db);
+ if (opts == NULL)
+ return NULL;
+
+ for (PQconninfoOption *opt = opts; opt->keyword != NULL; ++opt)
+ {
+ if (strcmp(opt->keyword, keyword) == 0)
+ {
+ serviceopt = opt;
+ break;
+ }
+ }
+
+ /* Take a copy of the value, as it is freed by PQconninfoFree(). */
+ if (serviceopt && serviceopt->val != NULL)
+ res = pg_strdup(serviceopt->val);
+ PQconninfoFree(opts);
+
+ return res;
+}
/* expand_tilde
*
diff --git a/src/bin/psql/common.h b/src/bin/psql/common.h
index 7f1a23de1e8..64762ab9817 100644
--- a/src/bin/psql/common.h
+++ b/src/bin/psql/common.h
@@ -39,6 +39,7 @@ extern bool SendQuery(const char *query);
extern bool is_superuser(void);
extern bool standard_strings(void);
extern const char *session_username(void);
+extern char *get_conninfo_value(const char *keyword);
extern void expand_tilde(char **filename);
extern void clean_extended_state(void);
diff --git a/src/bin/psql/prompt.c b/src/bin/psql/prompt.c
index 3aa7d2d06c8..b08d7328fbf 100644
--- a/src/bin/psql/prompt.c
+++ b/src/bin/psql/prompt.c
@@ -169,8 +169,12 @@ get_prompt(promptStatus_t status, ConditionalStack cstack)
break;
/* service name */
case 's':
- if (pset.db && PQservice(pset.db))
- strlcpy(buf, PQservice(pset.db), sizeof(buf));
+ {
+ const char *service_name = GetVariable(pset.vars, "SERVICE");
+
+ if (service_name)
+ strlcpy(buf, service_name, sizeof(buf));
+ }
break;
/* backend pid */
case 'p':
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index d397fe27dc1..b523bcda8f3 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -118,7 +118,7 @@ extern void cost_incremental_sort(Path *path,
Cost input_startup_cost, Cost input_total_cost,
double input_tuples, int width, Cost comparison_cost, int sort_mem,
double limit_tuples);
-extern void cost_append(AppendPath *apath);
+extern void cost_append(AppendPath *apath, PlannerInfo *root);
extern void cost_merge_append(Path *path, PlannerInfo *root,
List *pathkeys, int n_streams,
int input_disabled_nodes,
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 726a7c1be1f..c4dc5d72bdb 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -229,6 +229,9 @@
/* Define to 1 if you have the global variable 'int timezone'. */
#undef HAVE_INT_TIMEZONE
+/* Define to 1 if you have the `io_uring_queue_init_mem' function. */
+#undef HAVE_IO_URING_QUEUE_INIT_MEM
+
/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
diff --git a/src/interfaces/libpq/exports.txt b/src/interfaces/libpq/exports.txt
index 0625cf39e9a..dbbae642d76 100644
--- a/src/interfaces/libpq/exports.txt
+++ b/src/interfaces/libpq/exports.txt
@@ -205,9 +205,8 @@ PQcancelFinish 202
PQsocketPoll 203
PQsetChunkedRowsMode 204
PQgetCurrentTimeUSec 205
-PQservice 206
-PQsetAuthDataHook 207
-PQgetAuthDataHook 208
-PQdefaultAuthDataHook 209
-PQfullProtocolVersion 210
-appendPQExpBufferVA 211
+PQsetAuthDataHook 206
+PQgetAuthDataHook 207
+PQdefaultAuthDataHook 208
+PQfullProtocolVersion 209
+appendPQExpBufferVA 210
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 51a9c416584..09eb79812ac 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -7462,14 +7462,6 @@ PQdb(const PGconn *conn)
}
char *
-PQservice(const PGconn *conn)
-{
- if (!conn)
- return NULL;
- return conn->pgservice;
-}
-
-char *
PQuser(const PGconn *conn)
{
if (!conn)
diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h
index 7d3a9df6fd5..af8004f952a 100644
--- a/src/interfaces/libpq/libpq-fe.h
+++ b/src/interfaces/libpq/libpq-fe.h
@@ -400,7 +400,6 @@ extern int PQrequestCancel(PGconn *conn);
/* Accessor functions for PGconn objects */
extern char *PQdb(const PGconn *conn);
-extern char *PQservice(const PGconn *conn);
extern char *PQuser(const PGconn *conn);
extern char *PQpass(const PGconn *conn);
extern char *PQhost(const PGconn *conn);
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
index b00219643b9..5a1dd9fc022 100644
--- a/src/test/regress/expected/incremental_sort.out
+++ b/src/test/regress/expected/incremental_sort.out
@@ -1722,3 +1722,43 @@ order by t1.four, t1.two limit 1;
-> Seq Scan on tenk1 t2
(12 rows)
+--
+-- Test incremental sort for Append/MergeAppend
+--
+create table prt_tbl (a int, b int) partition by range (a);
+create table prt_tbl_1 partition of prt_tbl for values from (0) to (100);
+create table prt_tbl_2 partition of prt_tbl for values from (100) to (200);
+insert into prt_tbl select i%200, i from generate_series(1,1000)i;
+create index on prt_tbl_1(a);
+create index on prt_tbl_2(a, b);
+analyze prt_tbl;
+set enable_seqscan to off;
+set enable_bitmapscan to off;
+-- Ensure we get an incremental sort for the subpath of Append
+explain (costs off) select * from prt_tbl order by a, b;
+ QUERY PLAN
+------------------------------------------------------------
+ Append
+ -> Incremental Sort
+ Sort Key: prt_tbl_1.a, prt_tbl_1.b
+ Presorted Key: prt_tbl_1.a
+ -> Index Scan using prt_tbl_1_a_idx on prt_tbl_1
+ -> Index Only Scan using prt_tbl_2_a_b_idx on prt_tbl_2
+(6 rows)
+
+-- Ensure we get an incremental sort for the subpath of MergeAppend
+explain (costs off) select * from prt_tbl_1 union all select * from prt_tbl_2 order by a, b;
+ QUERY PLAN
+------------------------------------------------------------
+ Merge Append
+ Sort Key: prt_tbl_1.a, prt_tbl_1.b
+ -> Incremental Sort
+ Sort Key: prt_tbl_1.a, prt_tbl_1.b
+ Presorted Key: prt_tbl_1.a
+ -> Index Scan using prt_tbl_1_a_idx on prt_tbl_1
+ -> Index Only Scan using prt_tbl_2_a_b_idx on prt_tbl_2
+(7 rows)
+
+reset enable_bitmapscan;
+reset enable_seqscan;
+drop table prt_tbl;
diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
index 78dead65325..5b5055babdc 100644
--- a/src/test/regress/expected/inherit.out
+++ b/src/test/regress/expected/inherit.out
@@ -1898,10 +1898,11 @@ ORDER BY thousand, tenthous;
Merge Append
Sort Key: tenk1.thousand, tenk1.tenthous
-> Index Only Scan using tenk1_thous_tenthous on tenk1
- -> Sort
+ -> Incremental Sort
Sort Key: tenk1_1.thousand, tenk1_1.thousand
+ Presorted Key: tenk1_1.thousand
-> Index Only Scan using tenk1_thous_tenthous on tenk1 tenk1_1
-(6 rows)
+(7 rows)
explain (costs off)
SELECT thousand, tenthous, thousand+tenthous AS x FROM tenk1
@@ -1982,10 +1983,11 @@ ORDER BY x, y;
Merge Append
Sort Key: a.thousand, a.tenthous
-> Index Only Scan using tenk1_thous_tenthous on tenk1 a
- -> Sort
+ -> Incremental Sort
Sort Key: b.unique2, b.unique2
+ Presorted Key: b.unique2
-> Index Only Scan using tenk1_unique2 on tenk1 b
-(6 rows)
+(7 rows)
-- exercise rescan code path via a repeatedly-evaluated subquery
explain (costs off)
diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql
index f1f8fae5654..bbe658a7588 100644
--- a/src/test/regress/sql/incremental_sort.sql
+++ b/src/test/regress/sql/incremental_sort.sql
@@ -298,3 +298,27 @@ explain (costs off)
select * from
(select * from tenk1 order by four) t1 join tenk1 t2 on t1.four = t2.four and t1.two = t2.two
order by t1.four, t1.two limit 1;
+
+--
+-- Test incremental sort for Append/MergeAppend
+--
+create table prt_tbl (a int, b int) partition by range (a);
+create table prt_tbl_1 partition of prt_tbl for values from (0) to (100);
+create table prt_tbl_2 partition of prt_tbl for values from (100) to (200);
+insert into prt_tbl select i%200, i from generate_series(1,1000)i;
+create index on prt_tbl_1(a);
+create index on prt_tbl_2(a, b);
+analyze prt_tbl;
+
+set enable_seqscan to off;
+set enable_bitmapscan to off;
+
+-- Ensure we get an incremental sort for the subpath of Append
+explain (costs off) select * from prt_tbl order by a, b;
+
+-- Ensure we get an incremental sort for the subpath of MergeAppend
+explain (costs off) select * from prt_tbl_1 union all select * from prt_tbl_2 order by a, b;
+
+reset enable_bitmapscan;
+reset enable_seqscan;
+drop table prt_tbl;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 114bdafafdf..83192038571 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2181,6 +2181,7 @@ PgAioReturn
PgAioTargetData
PgAioTargetID
PgAioTargetInfo
+PgAioUringCaps
PgAioUringContext
PgAioWaitRef
PgArchData