Diffstat (limited to 'src')
21 files changed, 532 insertions, 318 deletions
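The transam hunks below apply one conversion across clog, commit_ts, multixact and subtrans: bootstrap and redo paths now call the new SimpleLruZeroAndWritePage(), while the extend paths zero the page under the bank lock and WAL-log it with the new XLogSimpleInsertInt64(). A minimal caller-side sketch of that pattern for a hypothetical SLRU module follows; MySlruCtl, MySlruExtend, MySlruBootstrapOrRedo, RM_MY_ID and MY_ZEROPAGE are placeholder names, not identifiers from the patch.

#include "postgres.h"

#include "access/slru.h"
#include "access/xloginsert.h"
#include "storage/lwlock.h"

static SlruCtlData MySlruCtlData;
#define MySlruCtl (&MySlruCtlData)

#define MY_ZEROPAGE 0x00		/* placeholder WAL info code */
/* RM_MY_ID is a placeholder for the module's resource manager ID */

/* Extend-time path: zero the new page and make a WAL entry about it */
static void
MySlruExtend(int64 pageno)
{
	LWLock	   *lock = SimpleLruGetBankLock(MySlruCtl, pageno);

	LWLockAcquire(lock, LW_EXCLUSIVE);
	SimpleLruZeroPage(MySlruCtl, pageno);
	XLogSimpleInsertInt64(RM_MY_ID, MY_ZEROPAGE, pageno);
	LWLockRelease(lock);
}

/* Bootstrap and redo paths: zero the page and flush it to disk */
static void
MySlruBootstrapOrRedo(int64 pageno)
{
	/* SimpleLruZeroAndWritePage acquires and releases the bank lock itself */
	SimpleLruZeroAndWritePage(MySlruCtl, pageno);
}

In the diff, ExtendCLOG(), ExtendCommitTs(), ExtendMultiXactOffset() and ExtendMultiXactMember() follow the first shape, while BootStrapCLOG(), BootStrapMultiXact(), BootStrapSUBTRANS() and the redo routines follow the second.
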
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 48f10bec91e..e80fbe109cf 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -110,9 +110,7 @@ static SlruCtlData XactCtlData; #define XactCtl (&XactCtlData) -static int ZeroCLOGPage(int64 pageno, bool writeXlog); static bool CLOGPagePrecedes(int64 page1, int64 page2); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, @@ -832,41 +830,8 @@ check_transaction_buffers(int *newval, void **extra, GucSource source) void BootStrapCLOG(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(XactCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the commit log */ - slotno = ZeroCLOGPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of CLOG to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroCLOGPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(XactCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(XactCtl, 0); } /* @@ -974,8 +939,9 @@ ExtendCLOG(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCLOGPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(XactCtl, pageno); + XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -1068,17 +1034,6 @@ CLOGPagePrecedes(int64 page1, int64 page2) /* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); -} - -/* * Write a TRUNCATE xlog record * * We must flush the xlog record to disk before returning --- see notes @@ -1114,19 +1069,9 @@ clog_redo(XLogReaderState *record) if (info == CLOG_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(XactCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(XactCtl, pageno); } else if (info == CLOG_TRUNCATE) { diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 225ff7ca9f2..370b38e048b 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -114,11 +114,9 @@ static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, RepOriginId nodeid, int slotno); static void error_commit_ts_disabled(void); -static int ZeroCommitTsPage(int64 pageno, bool writeXlog); static bool CommitTsPagePrecedes(int64 page1, int64 page2); static void 
ActivateCommitTs(void); static void DeactivateCommitTs(void); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid); /* @@ -603,28 +601,6 @@ BootStrapCommitTs(void) } /* - * Initialize (or reinitialize) a page of CommitTs to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroCommitTsPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(CommitTsCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; -} - -/* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized TransamVariables->nextXid. */ @@ -754,16 +730,7 @@ ActivateCommitTs(void) /* Create the current segment file, if necessary */ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) - { - LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - int slotno; - - LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(lock); - } + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); /* Change the activation status in shared memory. */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); @@ -874,8 +841,12 @@ ExtendCommitTs(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCommitTsPage(pageno, !InRecovery); + /* Zero the page ... */ + SimpleLruZeroPage(CommitTsCtl, pageno); + + /* and make a WAL entry about that, unless we're in REDO */ + if (!InRecovery) + XLogSimpleInsertInt64(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -990,17 +961,6 @@ CommitTsPagePrecedes(int64 page1, int64 page2) /* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); -} - -/* * Write a TRUNCATE xlog record */ static void @@ -1030,19 +990,9 @@ commit_ts_redo(XLogReaderState *record) if (info == COMMIT_TS_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); } else if (info == COMMIT_TS_TRUNCATE) { diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 7a7afe3edc6..3cb09c3d598 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -401,8 +401,6 @@ static void mXactCachePut(MultiXactId multi, int nmembers, static char *mxstatus_to_string(MultiXactStatus status); /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog); -static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, @@ 
-413,7 +411,6 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); -static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, @@ -2033,70 +2030,9 @@ check_multixact_member_buffers(int *newval, void **extra, GucSource source) void BootStrapMultiXact(void) { - int slotno; - LWLock *lock; - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the offsets log */ - slotno = ZeroMultiXactOffsetPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the members log */ - slotno = ZeroMultiXactMemberPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); - - return slotno; -} - -/* - * Ditto, for MultiXactMember - */ -static int -ZeroMultiXactMemberPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); - - return slotno; + /* Zero the initial pages and flush them to disk */ + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } /* @@ -2134,7 +2070,7 @@ MaybeExtendOffsetSlru(void) * with creating a new segment file even if the page we're writing is * not the first in it, so this is enough. 
*/ - slotno = ZeroMultiXactOffsetPage(pageno, false); + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); SimpleLruWritePage(MultiXactOffsetCtl, slotno); } @@ -2568,8 +2504,10 @@ ExtendMultiXactOffset(MultiXactId multi) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactOffsetPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE, + pageno); LWLockRelease(lock); } @@ -2611,8 +2549,10 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactMemberPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactMemberCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, + XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno); LWLockRelease(lock); } @@ -3348,18 +3288,6 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) } /* - * Write an xlog record reflecting the zeroing of either a MEMBERs or - * OFFSETs page (info shows which) - */ -static void -WriteMZeroPageXlogRec(int64 pageno, uint8 info) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_MULTIXACT_ID, info); -} - -/* * Write a TRUNCATE xlog record * * We must flush the xlog record to disk before returning --- see notes in @@ -3401,36 +3329,16 @@ multixact_redo(XLogReaderState *record) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactMemberPage(pageno, false); - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno); } else if (info == XLOG_MULTIXACT_CREATE_ID) { diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index fe56286d9a9..10ec259f382 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -434,6 +434,31 @@ SimpleLruZeroLSNs(SlruCtl ctl, int slotno) } /* + * This is a convenience wrapper for the common case of zeroing a page and + * immediately flushing it to disk. + * + * Control lock is acquired and released here. + */ +void +SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno) +{ + int slotno; + LWLock *lock; + + lock = SimpleLruGetBankLock(ctl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(ctl, pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(ctl, slotno); + Assert(!ctl->shared->page_dirty[slotno]); + + LWLockRelease(lock); +} + +/* * Wait for any active I/O on a page slot to finish. 
(This does not * guarantee that new I/O hasn't been started before we return, though. * In fact the slot might not even contain the same page anymore.) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 15153618fad..09aace9e09f 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -74,7 +74,6 @@ static SlruCtlData SubTransCtlData; #define SubTransCtl (&SubTransCtlData) -static int ZeroSUBTRANSPage(int64 pageno); static bool SubTransPagePrecedes(int64 page1, int64 page2); @@ -269,33 +268,8 @@ check_subtrans_buffers(int *newval, void **extra, GucSource source) void BootStrapSUBTRANS(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(SubTransCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); - - /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno); - Assert(!SubTransCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of SUBTRANS to zeroes. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroSUBTRANSPage(int64 pageno) -{ - return SimpleLruZeroPage(SubTransCtl, pageno); + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(SubTransCtl, 0); } /* @@ -335,7 +309,7 @@ StartupSUBTRANS(TransactionId oldestActiveXID) prevlock = lock; } - (void) ZeroSUBTRANSPage(startPage); + (void) SimpleLruZeroPage(SubTransCtl, startPage); if (startPage == endPage) break; @@ -395,7 +369,7 @@ ExtendSUBTRANS(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page */ - ZeroSUBTRANSPage(pageno); + SimpleLruZeroPage(SubTransCtl, pageno); LWLockRelease(lock); } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ee9d0b028e..c7571429e8e 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -530,6 +530,18 @@ XLogInsert(RmgrId rmid, uint8 info) } /* + * Simple wrapper to XLogInsert to insert a WAL record with elementary + * contents (only an int64 is supported as value currently). + */ +XLogRecPtr +XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value) +{ + XLogBeginInsert(); + XLogRegisterData(&value, sizeof(value)); + return XLogInsert(rmid, info); +} + +/* * Assemble a WAL record from the registered data and buffers into an * XLogRecData chain, ready for insertion with XLogInsertRecord(). * diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 3d44815ed5a..1f04a2c182c 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2247,7 +2247,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers) * Determines and returns the cost of an Append node. 
*/ void -cost_append(AppendPath *apath) +cost_append(AppendPath *apath, PlannerInfo *root) { ListCell *l; @@ -2309,26 +2309,52 @@ cost_append(AppendPath *apath) foreach(l, apath->subpaths) { Path *subpath = (Path *) lfirst(l); - Path sort_path; /* dummy for result of cost_sort */ + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { /* * We'll need to insert a Sort node, so include costs for - * that. We can use the parent's LIMIT if any, since we + * that. We choose to use incremental sort if it is + * enabled and there are presorted keys; otherwise we use + * full sort. + * + * We can use the parent's LIMIT if any, since we * certainly won't pull more than that many tuples from * any child. */ - cost_sort(&sort_path, - NULL, /* doesn't currently need root */ - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - apath->limit_tuples); + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + subpath = &sort_path; } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 0b61aef962c..8a9f1d7a943 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -1318,6 +1318,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* * Compute sort column info, and adjust subplan's tlist as needed. @@ -1353,14 +1354,38 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and + * there are presorted keys; otherwise we use full sort. 
+ */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } } @@ -1491,6 +1516,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* Build the child plan */ /* Must insist that all children return the same tlist */ @@ -1525,14 +1551,38 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and there + * are presorted keys; otherwise we use full sort. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } subplans = lappend(subplans, subplan); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e0192d4a491..9cc602788ea 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,12 +1404,12 @@ create_append_path(PlannerInfo *root, pathnode->path.total_cost = child->total_cost; } else - cost_append(pathnode); + cost_append(pathnode, root); /* Must do this last, else cost_append complains */ pathnode->path.pathkeys = child->pathkeys; } else - cost_append(pathnode); + cost_append(pathnode, root); /* If the caller provided a row estimate, override the computed value. 
*/ if (rows >= 0) @@ -1515,6 +1515,9 @@ create_merge_append_path(PlannerInfo *root, foreach(l, subpaths) { Path *subpath = (Path *) lfirst(l); + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ /* All child paths should be unparameterized */ Assert(bms_is_empty(PATH_REQ_OUTER(subpath))); @@ -1523,32 +1526,52 @@ create_merge_append_path(PlannerInfo *root, pathnode->path.parallel_safe = pathnode->path.parallel_safe && subpath->parallel_safe; - if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - /* Subpath is adequately ordered, we won't need to sort it */ - input_disabled_nodes += subpath->disabled_nodes; - input_startup_cost += subpath->startup_cost; - input_total_cost += subpath->total_cost; - } - else - { - /* We'll need to insert a Sort node, so include cost for that */ - Path sort_path; /* dummy for result of cost_sort */ + /* + * We'll need to insert a Sort node, so include costs for that. We + * choose to use incremental sort if it is enabled and there are + * presorted keys; otherwise we use full sort. + * + * We can use the parent's LIMIT if any, since we certainly won't + * pull more than that many tuples from any child. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } - cost_sort(&sort_path, - root, - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - pathnode->limit_tuples); - input_disabled_nodes += sort_path.disabled_nodes; - input_startup_cost += sort_path.startup_cost; - input_total_cost += sort_path.total_cost; + subpath = &sort_path; } + + input_disabled_nodes += subpath->disabled_nodes; + input_startup_cost += subpath->startup_cost; + input_total_cost += subpath->total_cost; } /* diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index b78048328e1..0a8c054162f 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -29,6 +29,9 @@ #ifdef IOMETHOD_IO_URING_ENABLED +#include <sys/mman.h> +#include <unistd.h> + #include <liburing.h> #include "miscadmin.h" @@ -94,12 +97,32 @@ PgAioUringContext struct io_uring io_uring_ring; } PgAioUringContext; +/* + * Information about the capabilities that io_uring has. + * + * Depending on liburing and kernel version different features are + * supported. At least for the kernel a kernel version check does not suffice + * as various vendors do backport features to older kernels :(. 
+ */ +typedef struct PgAioUringCaps +{ + bool checked; + /* -1 if io_uring_queue_init_mem() is unsupported */ + int mem_init_size; +} PgAioUringCaps; + + /* PgAioUringContexts for all backends */ static PgAioUringContext *pgaio_uring_contexts; /* the current backend's context */ static PgAioUringContext *pgaio_my_uring_context; +static PgAioUringCaps pgaio_uring_caps = +{ + .checked = false, + .mem_init_size = -1, +}; static uint32 pgaio_uring_procs(void) @@ -111,16 +134,145 @@ pgaio_uring_procs(void) return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; } -static Size +/* + * Initializes pgaio_uring_caps, unless that's already done. + */ +static void +pgaio_uring_check_capabilities(void) +{ + if (pgaio_uring_caps.checked) + return; + + /* + * By default io_uring creates a shared memory mapping for each io_uring + * instance, leading to a large number of memory mappings. Unfortunately a + * large number of memory mappings slows things down, backend exit is + * particularly affected. To address that, newer kernels (6.5) support + * using user-provided memory for the memory, by putting the relevant + * memory into shared memory we don't need any additional mappings. + * + * To know whether this is supported, we unfortunately need to probe the + * kernel by trying to create a ring with userspace-provided memory. This + * also has a secondary benefit: We can determine precisely how much + * memory we need for each io_uring instance. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + { + struct io_uring test_ring; + size_t ring_size; + void *ring_ptr; + struct io_uring_params p = {0}; + int ret; + + /* + * Liburing does not yet provide an API to query how much memory a + * ring will need. So we over-estimate it here. As the memory is freed + * just below that's small temporary waste of memory. + * + * 1MB is more than enough for rings within io_max_concurrency's + * range. + */ + ring_size = 1024 * 1024; + + /* + * Hard to believe a system exists where 1MB would not be a multiple + * of the page size. But it's cheap to ensure... + */ + ring_size -= ring_size % sysconf(_SC_PAGESIZE); + + ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ring_ptr == MAP_FAILED) + elog(ERROR, + "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m", + ring_size); + + ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size); + if (ret > 0) + { + pgaio_uring_caps.mem_init_size = ret; + + elog(DEBUG1, + "can use combined memory mapping for io_uring, each ring needs %d bytes", + ret); + + /* clean up the created ring, it was just for a test */ + io_uring_queue_exit(&test_ring); + } + else + { + /* + * There are different reasons for ring creation to fail, but it's + * ok to treat that just as io_uring_queue_init_mem() not being + * supported. We'll report a more detailed error in + * pgaio_uring_shmem_init(). 
+ */ + errno = -ret; + elog(DEBUG1, + "cannot use combined memory mapping for io_uring, ring creation failed: %m"); + + } + + if (munmap(ring_ptr, ring_size) != 0) + elog(ERROR, "munmap() failed: %m"); + } +#else + { + elog(DEBUG1, + "can't use combined memory mapping for io_uring, kernel or liburing too old"); + } +#endif + + pgaio_uring_caps.checked = true; +} + +/* + * Memory for all PgAioUringContext instances + */ +static size_t pgaio_uring_context_shmem_size(void) { return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext)); } +/* + * Memory for the combined memory used by io_uring instances. Returns 0 if + * that is not supported by kernel/liburing. + */ +static size_t +pgaio_uring_ring_shmem_size(void) +{ + size_t sz = 0; + + if (pgaio_uring_caps.mem_init_size > 0) + { + /* + * Memory for rings needs to be allocated to the page boundary, + * reserve space. Luckily it does not need to be aligned to hugepage + * boundaries, even if huge pages are used. + */ + sz = add_size(sz, sysconf(_SC_PAGESIZE)); + sz = add_size(sz, mul_size(pgaio_uring_procs(), + pgaio_uring_caps.mem_init_size)); + } + + return sz; +} + static size_t pgaio_uring_shmem_size(void) { - return pgaio_uring_context_shmem_size(); + size_t sz; + + /* + * Kernel and liburing support for various features influences how much + * shmem we need, perform the necessary checks. + */ + pgaio_uring_check_capabilities(); + + sz = pgaio_uring_context_shmem_size(); + sz = add_size(sz, pgaio_uring_ring_shmem_size()); + + return sz; } static void @@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time) { int TotalProcs = pgaio_uring_procs(); bool found; + char *shmem; + size_t ring_mem_remain = 0; + char *ring_mem_next = 0; - pgaio_uring_contexts = (PgAioUringContext *) - ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found); - + /* + * We allocate memory for all PgAioUringContext instances and, if + * supported, the memory required for each of the io_uring instances, in + * one ShmemInitStruct(). + */ + shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found); if (found) return; + pgaio_uring_contexts = (PgAioUringContext *) shmem; + shmem += pgaio_uring_context_shmem_size(); + + /* if supported, handle memory alignment / sizing for io_uring memory */ + if (pgaio_uring_caps.mem_init_size > 0) + { + ring_mem_remain = pgaio_uring_ring_shmem_size(); + ring_mem_next = (char *) shmem; + + /* align to page boundary, see also pgaio_uring_ring_shmem_size() */ + ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next); + + /* account for alignment */ + ring_mem_remain -= ring_mem_next - shmem; + shmem += ring_mem_next - shmem; + + shmem += ring_mem_remain; + } + for (int contextno = 0; contextno < TotalProcs; contextno++) { PgAioUringContext *context = &pgaio_uring_contexts[contextno]; @@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time) * be worth using that - also need to evaluate if that causes * noticeable additional contention? */ - ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + + /* + * If supported (c.f. pgaio_uring_check_capabilities()), create ring + * with its data in shared memory. Otherwise fall back io_uring + * creating a memory mapping for each ring. 
+ */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + if (pgaio_uring_caps.mem_init_size > 0) + { + struct io_uring_params p = {0}; + + ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain); + + ring_mem_remain -= ret; + ring_mem_next += ret; + } + else +#endif + { + ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + } + if (ret < 0) { char *hint = NULL; diff --git a/src/bin/pg_walsummary/t/002_blocks.pl b/src/bin/pg_walsummary/t/002_blocks.pl index 270332780a4..0f98c7df82e 100644 --- a/src/bin/pg_walsummary/t/002_blocks.pl +++ b/src/bin/pg_walsummary/t/002_blocks.pl @@ -47,11 +47,12 @@ EOM ok($result, "WAL summarization caught up after insert"); # The WAL summarizer should have generated some IO statistics. -my $stats_reads = $node1->safe_psql( +$node1->poll_query_until( 'postgres', - qq{SELECT sum(reads) > 0 FROM pg_stat_io - WHERE backend_type = 'walsummarizer' AND object = 'wal'}); -is($stats_reads, 't', "WAL summarizer generates statistics for WAL reads"); + q{SELECT sum(reads) > 0 FROM pg_stat_io + WHERE backend_type = 'walsummarizer' AND object = 'wal'}) + or die + "Timed out while waiting for WAL summarizer to generate statistics for WAL reads"; # Find the highest LSN that is summarized on disk. my $summarized_lsn = $node1->safe_psql('postgres', <<EOM); diff --git a/src/include/access/slru.h b/src/include/access/slru.h index e142800aab2..20dbd1e0070 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -187,6 +187,7 @@ extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names); extern int SimpleLruZeroPage(SlruCtl ctl, int64 pageno); +extern void SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno); extern int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid); extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index cf057f033a2..d6a71415d4f 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -44,6 +44,7 @@ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); +extern XLogRecPtr XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value); extern void XLogEnsureRecordSpace(int max_block_id, int ndatas); extern void XLogRegisterData(const void *data, uint32 len); extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags); diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index d397fe27dc1..b523bcda8f3 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -118,7 +118,7 @@ extern void cost_incremental_sort(Path *path, Cost input_startup_cost, Cost input_total_cost, double input_tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples); -extern void cost_append(AppendPath *apath); +extern void cost_append(AppendPath *apath, PlannerInfo *root); extern void cost_merge_append(Path *path, PlannerInfo *root, List *pathkeys, int n_streams, int input_disabled_nodes, diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 726a7c1be1f..c4dc5d72bdb 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -229,6 +229,9 @@ /* Define to 1 if you have the global variable 'int timezone'. 
*/ #undef HAVE_INT_TIMEZONE +/* Define to 1 if you have the `io_uring_queue_init_mem' function. */ +#undef HAVE_IO_URING_QUEUE_INIT_MEM + /* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */ #undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index bb99781c56e..b9acc790dc6 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -5703,7 +5703,7 @@ exec_eval_expr(PLpgSQL_execstate *estate, /* * Else do it the hard way via exec_run_select */ - rc = exec_run_select(estate, expr, 2, NULL); + rc = exec_run_select(estate, expr, 0, NULL); if (rc != SPI_OK_SELECT) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -5757,6 +5757,10 @@ exec_eval_expr(PLpgSQL_execstate *estate, /* ---------- * exec_run_select Execute a select query + * + * Note: passing maxtuples different from 0 ("return all tuples") is + * deprecated because it will prevent parallel execution of the query. + * However, we retain the parameter in case we need it someday. * ---------- */ static int diff --git a/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm b/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm index 1725fe2f948..7224c286e1d 100644 --- a/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm +++ b/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm @@ -251,6 +251,32 @@ sub adjust_database_contents 'drop operator if exists public.=> (bigint, NONE)'); } + # Version 19 changed the output format of pg_lsn. To avoid output + # differences, set all pg_lsn columns to NULL if the old version is + # older than 19. + if ($old_version < 19) + { + if ($old_version >= '9.5') + { + _add_st($result, 'regression', + "update brintest set lsncol = NULL"); + } + + if ($old_version >= 12) + { + _add_st($result, 'regression', + "update tab_core_types set pg_lsn = NULL"); + } + + if ($old_version >= 14) + { + _add_st($result, 'regression', + "update brintest_multi set lsncol = NULL"); + _add_st($result, 'regression', + "update brintest_bloom set lsncol = NULL"); + } + } + return $result; } diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index b00219643b9..5a1dd9fc022 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1722,3 +1722,43 @@ order by t1.four, t1.two limit 1; -> Seq Scan on tenk1 t2 (12 rows) +-- +-- Test incremental sort for Append/MergeAppend +-- +create table prt_tbl (a int, b int) partition by range (a); +create table prt_tbl_1 partition of prt_tbl for values from (0) to (100); +create table prt_tbl_2 partition of prt_tbl for values from (100) to (200); +insert into prt_tbl select i%200, i from generate_series(1,1000)i; +create index on prt_tbl_1(a); +create index on prt_tbl_2(a, b); +analyze prt_tbl; +set enable_seqscan to off; +set enable_bitmapscan to off; +-- Ensure we get an incremental sort for the subpath of Append +explain (costs off) select * from prt_tbl order by a, b; + QUERY PLAN +------------------------------------------------------------ + Append + -> Incremental Sort + Sort Key: prt_tbl_1.a, prt_tbl_1.b + Presorted Key: prt_tbl_1.a + -> Index Scan using prt_tbl_1_a_idx on prt_tbl_1 + -> Index Only Scan using prt_tbl_2_a_b_idx on prt_tbl_2 +(6 rows) + +-- Ensure we get an incremental sort for the subpath of MergeAppend +explain (costs off) select * from prt_tbl_1 union all select * from prt_tbl_2 order by a, b; + QUERY PLAN +------------------------------------------------------------ + Merge 
Append + Sort Key: prt_tbl_1.a, prt_tbl_1.b + -> Incremental Sort + Sort Key: prt_tbl_1.a, prt_tbl_1.b + Presorted Key: prt_tbl_1.a + -> Index Scan using prt_tbl_1_a_idx on prt_tbl_1 + -> Index Only Scan using prt_tbl_2_a_b_idx on prt_tbl_2 +(7 rows) + +reset enable_bitmapscan; +reset enable_seqscan; +drop table prt_tbl; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 78dead65325..5b5055babdc 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -1898,10 +1898,11 @@ ORDER BY thousand, tenthous; Merge Append Sort Key: tenk1.thousand, tenk1.tenthous -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Sort + -> Incremental Sort Sort Key: tenk1_1.thousand, tenk1_1.thousand + Presorted Key: tenk1_1.thousand -> Index Only Scan using tenk1_thous_tenthous on tenk1 tenk1_1 -(6 rows) +(7 rows) explain (costs off) SELECT thousand, tenthous, thousand+tenthous AS x FROM tenk1 @@ -1982,10 +1983,11 @@ ORDER BY x, y; Merge Append Sort Key: a.thousand, a.tenthous -> Index Only Scan using tenk1_thous_tenthous on tenk1 a - -> Sort + -> Incremental Sort Sort Key: b.unique2, b.unique2 + Presorted Key: b.unique2 -> Index Only Scan using tenk1_unique2 on tenk1 b -(6 rows) +(7 rows) -- exercise rescan code path via a repeatedly-evaluated subquery explain (costs off) diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql index f1f8fae5654..bbe658a7588 100644 --- a/src/test/regress/sql/incremental_sort.sql +++ b/src/test/regress/sql/incremental_sort.sql @@ -298,3 +298,27 @@ explain (costs off) select * from (select * from tenk1 order by four) t1 join tenk1 t2 on t1.four = t2.four and t1.two = t2.two order by t1.four, t1.two limit 1; + +-- +-- Test incremental sort for Append/MergeAppend +-- +create table prt_tbl (a int, b int) partition by range (a); +create table prt_tbl_1 partition of prt_tbl for values from (0) to (100); +create table prt_tbl_2 partition of prt_tbl for values from (100) to (200); +insert into prt_tbl select i%200, i from generate_series(1,1000)i; +create index on prt_tbl_1(a); +create index on prt_tbl_2(a, b); +analyze prt_tbl; + +set enable_seqscan to off; +set enable_bitmapscan to off; + +-- Ensure we get an incremental sort for the subpath of Append +explain (costs off) select * from prt_tbl order by a, b; + +-- Ensure we get an incremental sort for the subpath of MergeAppend +explain (costs off) select * from prt_tbl_1 union all select * from prt_tbl_2 order by a, b; + +reset enable_bitmapscan; +reset enable_seqscan; +drop table prt_tbl; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 114bdafafdf..83192038571 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2181,6 +2181,7 @@ PgAioReturn PgAioTargetData PgAioTargetID PgAioTargetInfo +PgAioUringCaps PgAioUringContext PgAioWaitRef PgArchData |
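
The method_io_uring.c changes above probe at startup whether the kernel and liburing can place each ring's data in caller-provided memory, so that all rings can share one mapping in Postgres shared memory. The same probe can be run outside the server; below is a standalone sketch under the assumption that a liburing with io_uring_queue_init_mem() (2.5 or newer) and a kernel supporting IORING_SETUP_NO_MMAP (6.5 or newer) are installed. QUEUE_DEPTH merely stands in for io_max_concurrency and is not a name from the patch.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#include <liburing.h>

#define QUEUE_DEPTH 64			/* stand-in for io_max_concurrency */

int
main(void)
{
	struct io_uring ring;
	struct io_uring_params p = {0};
	size_t		ring_size = 1024 * 1024;	/* generous over-estimate */
	void	   *ring_ptr;
	int			ret;

	/* keep the size a multiple of the page size, as the patch does */
	ring_size -= ring_size % sysconf(_SC_PAGESIZE);

	ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
					MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (ring_ptr == MAP_FAILED)
	{
		perror("mmap");
		return 1;
	}

	/* try to create a ring backed by the memory we just provided */
	ret = io_uring_queue_init_mem(QUEUE_DEPTH, &ring, &p, ring_ptr, ring_size);
	if (ret > 0)
	{
		printf("supported: one ring needs %d bytes\n", ret);
		io_uring_queue_exit(&ring);
	}
	else
		printf("not supported: %s\n", strerror(-ret));

	munmap(ring_ptr, ring_size);
	return 0;
}

As in pgaio_uring_check_capabilities() above, a positive return value is the number of bytes of the provided region that the ring consumed; the server records it in pgaio_uring_caps.mem_init_size and uses it to size a single combined shared-memory allocation for all backends' rings.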