diff options
Diffstat (limited to 'src/backend/access/transam')
-rw-r--r-- | src/backend/access/transam/Makefile | 5 | ||||
-rw-r--r-- | src/backend/access/transam/README | 2 | ||||
-rw-r--r-- | src/backend/access/transam/commit_ts.c | 902 | ||||
-rw-r--r-- | src/backend/access/transam/rmgr.c | 1 | ||||
-rw-r--r-- | src/backend/access/transam/slru.c | 2 | ||||
-rw-r--r-- | src/backend/access/transam/varsup.c | 4 | ||||
-rw-r--r-- | src/backend/access/transam/xact.c | 27 | ||||
-rw-r--r-- | src/backend/access/transam/xlog.c | 43 |
8 files changed, 973 insertions, 13 deletions
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 82a6c7695f..9d4d5dbc97 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -12,8 +12,9 @@ subdir = src/backend/access/transam top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = clog.o transam.o varsup.o xact.o rmgr.o slru.o subtrans.o multixact.o \ - timeline.o twophase.o twophase_rmgr.o xlog.o xlogarchive.o xlogfuncs.o \ +OBJS = clog.o commit_ts.o multixact.o rmgr.o slru.o subtrans.o \ + timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \ + xact.o xlog.o xlogarchive.o xlogfuncs.o \ xloginsert.o xlogreader.o xlogutils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index b619de5ad3..bc68b470e0 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -840,7 +840,7 @@ parent transaction to complete. Not all transactional behaviour is emulated, for example we do not insert a transaction entry into the lock table, nor do we maintain the transaction -stack in memory. Clog and multixact entries are made normally. +stack in memory. Clog, multixact and commit_ts entries are made normally. Subtrans is maintained during recovery but the details of the transaction tree are ignored and all subtransactions reference the top-level TransactionId directly. Since commit is atomic this provides correct lock wait behaviour diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c new file mode 100644 index 0000000000..ca074dafd3 --- /dev/null +++ b/src/backend/access/transam/commit_ts.c @@ -0,0 +1,902 @@ +/*------------------------------------------------------------------------- + * + * commit_ts.c + * PostgreSQL commit timestamp manager + * + * This module is a pg_clog-like system that stores the commit timestamp + * for each transaction. 
+ * + * XLOG interactions: this module generates an XLOG record whenever a new + * CommitTs page is initialized to zeroes. Also, one XLOG record is + * generated for setting of values when the caller requests it; this allows + * us to support values coming from places other than transaction commit. + * Other writes of CommitTS come from recording of transaction commit in + * xact.c, which generates its own XLOG records for these events and will + * re-perform the status update on redo; so we need make no additional XLOG + * entry here. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/commit_ts.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/slru.h" +#include "access/transam.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/builtins.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +/* + * Defines for CommitTs page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CommitTs page numbering also wraps around at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE, and CommitTs segment numbering at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCommitTs (see CommitTsPagePrecedes). + */ + +/* + * We need 8+4 bytes per xact. Note that enlarging this struct might mean + * the largest possible file name is more than 5 chars long; see + * SlruScanDirectory. 
+ */
+typedef struct CommitTimestampEntry
+{
+	TimestampTz		time;
+	CommitTsNodeId	nodeid;
+} CommitTimestampEntry;
+
+#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \
+									sizeof(CommitTsNodeId))
+
+#define COMMIT_TS_XACTS_PER_PAGE \
+	(BLCKSZ / SizeOfCommitTimestampEntry)
+
+/*
+ * Map a TransactionId to the page holding its entry, and to the index of
+ * that entry within the page.
+ */
+#define TransactionIdToCTsPage(xid)	\
+	((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
+#define TransactionIdToCTsEntry(xid)	\
+	((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
+
+/*
+ * Link to shared-memory data structures for CommitTs control
+ */
+static SlruCtlData CommitTsCtlData;
+
+#define CommitTsCtl (&CommitTsCtlData)
+
+/*
+ * We keep a cache of the last value set in shared memory.  This is protected
+ * by CommitTsLock.
+ */
+typedef struct CommitTimestampShared
+{
+	TransactionId	xidLastCommit;
+	CommitTimestampEntry dataLastCommit;
+} CommitTimestampShared;
+
+CommitTimestampShared	*commitTsShared;
+
+
+/* GUC variable */
+bool	track_commit_timestamp;
+
+/* Per-backend default node id, see CommitTsSetDefaultNodeId */
+static CommitTsNodeId default_node_id = InvalidCommitTsNodeId;
+
+static void SetXidCommitTsInPage(TransactionId xid, int nsubxids,
+					 TransactionId *subxids, TimestampTz ts,
+					 CommitTsNodeId nodeid, int pageno);
+static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
+						  CommitTsNodeId nodeid, int slotno);
+static int	ZeroCommitTsPage(int pageno, bool writeXlog);
+static bool CommitTsPagePrecedes(int page1, int page2);
+static void WriteZeroPageXlogRec(int pageno);
+static void WriteTruncateXlogRec(int pageno);
+static void WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids,
+						 TransactionId *subxids, TimestampTz timestamp,
+						 CommitTsNodeId nodeid);
+
+
+/*
+ * CommitTsSetDefaultNodeId
+ *
+ * Set default nodeid for current backend.
+ */
+void
+CommitTsSetDefaultNodeId(CommitTsNodeId nodeid)
+{
+	default_node_id = nodeid;
+}
+
+/*
+ * CommitTsGetDefaultNodeId
+ *
+ * Get default nodeid for current backend.
+ */
+CommitTsNodeId
+CommitTsGetDefaultNodeId(void)
+{
+	return default_node_id;
+}
+
+/*
+ * TransactionTreeSetCommitTsData
+ *
+ * Record the final commit timestamp of transaction entries in the commit log
+ * for a transaction and its subtransaction tree, as efficiently as possible.
+ *
+ * xid is the top level transaction id.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid.  In various cases nsubxids may be zero.
+ * The reason why tracking just the parent xid commit timestamp is not enough
+ * is that the subtrans SLRU does not stay valid across crashes (it's not
+ * permanent) so we need to keep the information about them here.  If the
+ * subtrans implementation changes in the future, we might want to revisit the
+ * decision of storing timestamp info for each subxid.
+ *
+ * The do_xlog parameter tells us whether to include a XLog record of this
+ * or not.  Normal path through RecordTransactionCommit() will be related
+ * to a transaction commit XLog record, and so should pass "false" here.
+ * Other callers probably want to pass true, so that the given values persist
+ * in case of crashes.
+ *
+ * This is a no-op when track_commit_timestamp is off.
+ */
+void
+TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
+							   TransactionId *subxids, TimestampTz timestamp,
+							   CommitTsNodeId nodeid, bool do_xlog)
+{
+	int			i;
+	TransactionId headxid;
+	TransactionId newestXact;
+
+	if (!track_commit_timestamp)
+		return;
+
+	/*
+	 * Comply with the WAL-before-data rule: if caller specified it wants
+	 * this value to be recorded in WAL, do so before touching the data.
+	 */
+	if (do_xlog)
+		WriteSetTimestampXlogRec(xid, nsubxids, subxids, timestamp, nodeid);
+
+	/*
+	 * Figure out the latest Xid in this batch: either the last subxid if
+	 * there's any, otherwise the parent xid.
+	 */
+	if (nsubxids > 0)
+		newestXact = subxids[nsubxids - 1];
+	else
+		newestXact = xid;
+
+	/*
+	 * We split the xids to set the timestamp to in groups belonging to the
+	 * same SLRU page; the first element in each such set is its head.  The
+	 * first group has the main XID as the head; subsequent sets use the
+	 * first subxid not on the previous page as head.  This way, we only have
+	 * to lock/modify each SLRU page once.
+	 */
+	for (i = 0, headxid = xid;;)
+	{
+		int			pageno = TransactionIdToCTsPage(headxid);
+		int			j;
+
+		for (j = i; j < nsubxids; j++)
+		{
+			if (TransactionIdToCTsPage(subxids[j]) != pageno)
+				break;
+		}
+		/* subxids[i..j-1] are on the same page as the head */
+
+		SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
+							 pageno);
+
+		/*
+		 * If we consumed every subxid, we're done.  Note that the test must
+		 * be on j itself: when j == nsubxids - 1, subxids[j] lives on a
+		 * different page and has NOT been written yet, so it still needs one
+		 * more iteration as the head of its own group.
+		 */
+		if (j >= nsubxids)
+			break;
+
+		/*
+		 * Set the new head and skip over it, as well as over the subxids
+		 * we just wrote.
+		 */
+		headxid = subxids[j];
+		i = j + 1;
+	}
+
+	/* update the cached value in shared memory */
+	LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+	commitTsShared->xidLastCommit = xid;
+	commitTsShared->dataLastCommit.time = timestamp;
+	commitTsShared->dataLastCommit.nodeid = nodeid;
+
+	/* and move forwards our endpoint, if needed */
+	if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTs, newestXact))
+		ShmemVariableCache->newestCommitTs = newestXact;
+	LWLockRelease(CommitTsLock);
+}
+
+/*
+ * Record the commit timestamp of transaction entries in the commit log for all
+ * entries on a single page.  Atomic only on this page.
+ */ +static void +SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + CommitTsNodeId nodeid, int pageno) +{ + int slotno; + int i; + + LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid); + + TransactionIdSetCommitTs(xid, ts, nodeid, slotno); + for (i = 0; i < nsubxids; i++) + TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno); + + CommitTsCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(CommitTsControlLock); +} + +/* + * Sets the commit timestamp of a single transaction. + * + * Must be called with CommitTsControlLock held + */ +static void +TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + CommitTsNodeId nodeid, int slotno) +{ + int entryno = TransactionIdToCTsEntry(xid); + CommitTimestampEntry entry; + + Assert(TransactionIdIsNormal(xid)); + + entry.time = ts; + entry.nodeid = nodeid; + + memcpy(CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + &entry, SizeOfCommitTimestampEntry); +} + +/* + * Interrogate the commit timestamp of a transaction. + * + * Return value indicates whether commit timestamp record was found for + * given xid. 
+ */
+bool
+TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
+							 CommitTsNodeId *nodeid)
+{
+	int			pageno = TransactionIdToCTsPage(xid);
+	int			entryno = TransactionIdToCTsEntry(xid);
+	int			slotno;
+	CommitTimestampEntry entry;
+	TransactionId oldestCommitTs;
+	TransactionId newestCommitTs;
+
+	/*
+	 * Both output pointers are optional: either of ts and nodeid may be
+	 * NULL if the caller is not interested in that value.  The return value
+	 * is therefore always computed from a local copy of the timestamp, never
+	 * by dereferencing ts.
+	 */
+
+	/* Error if module not enabled */
+	if (!track_commit_timestamp)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not get commit timestamp data"),
+				 errhint("Make sure the configuration parameter \"%s\" is set.",
+						 "track_commit_timestamp")));
+
+	/* error if the given Xid doesn't normally commit */
+	if (!TransactionIdIsNormal(xid))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("cannot retrieve commit timestamp for transaction %u", xid)));
+
+	/*
+	 * Return empty if the requested value is outside our valid range.
+	 */
+	LWLockAcquire(CommitTsLock, LW_SHARED);
+	oldestCommitTs = ShmemVariableCache->oldestCommitTs;
+	newestCommitTs = ShmemVariableCache->newestCommitTs;
+	/* neither is invalid, or both are */
+	Assert(TransactionIdIsValid(oldestCommitTs) == TransactionIdIsValid(newestCommitTs));
+	LWLockRelease(CommitTsLock);
+
+	if (!TransactionIdIsValid(oldestCommitTs) ||
+		TransactionIdPrecedes(xid, oldestCommitTs) ||
+		TransactionIdPrecedes(newestCommitTs, xid))
+	{
+		if (ts)
+			*ts = 0;
+		if (nodeid)
+			*nodeid = InvalidCommitTsNodeId;
+		return false;
+	}
+
+	/*
+	 * Use an unlocked atomic read on our cached value in shared memory; if
+	 * it's a hit, acquire a lock and read the data, after verifying that it's
+	 * still what we initially read.  Otherwise, fall through to read from
+	 * SLRU.
+	 */
+	if (commitTsShared->xidLastCommit == xid)
+	{
+		LWLockAcquire(CommitTsLock, LW_SHARED);
+		if (commitTsShared->xidLastCommit == xid)
+		{
+			/* copy out under the lock, so we can test it after release */
+			TimestampTz cachedTime = commitTsShared->dataLastCommit.time;
+
+			if (ts)
+				*ts = cachedTime;
+			if (nodeid)
+				*nodeid = commitTsShared->dataLastCommit.nodeid;
+
+			LWLockRelease(CommitTsLock);
+			return cachedTime != 0;
+		}
+		LWLockRelease(CommitTsLock);
+	}
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+	slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
+	memcpy(&entry,
+		   CommitTsCtl->shared->page_buffer[slotno] +
+		   SizeOfCommitTimestampEntry * entryno,
+		   SizeOfCommitTimestampEntry);
+
+	if (ts)
+		*ts = entry.time;
+	if (nodeid)
+		*nodeid = entry.nodeid;
+
+	LWLockRelease(CommitTsControlLock);
+
+	/* a zero timestamp means no commit timestamp was recorded for xid */
+	return entry.time != 0;
+}
+
+/*
+ * Return the Xid of the latest committed transaction.  (As far as this module
+ * is concerned, anyway; it's up to the caller to ensure the value is useful
+ * for its purposes.)
+ *
+ * ts and nodeid are filled with the corresponding data; they can be passed
+ * as NULL if not wanted.
+ */ +TransactionId +GetLatestCommitTsData(TimestampTz *ts, CommitTsNodeId *nodeid) +{ + TransactionId xid; + + /* Error if module not enabled */ + if (!track_commit_timestamp) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not get commit timestamp data"), + errhint("Make sure the configuration parameter \"%s\" is set.", + "track_commit_timestamp"))); + + LWLockAcquire(CommitTsLock, LW_SHARED); + xid = commitTsShared->xidLastCommit; + if (ts) + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + LWLockRelease(CommitTsLock); + + return xid; +} + +/* + * SQL-callable wrapper to obtain commit time of a transaction + */ +Datum +pg_xact_commit_timestamp(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_UINT32(0); + TimestampTz ts; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, NULL); + + if (!found) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); +} + + +Datum +pg_last_committed_xact(PG_FUNCTION_ARGS) +{ + TransactionId xid; + TimestampTz ts; + Datum values[2]; + bool nulls[2]; + TupleDesc tupdesc; + HeapTuple htup; + + /* and construct a tuple with our data */ + xid = GetLatestCommitTsData(&ts, NULL); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(2, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "timestamp", + TIMESTAMPTZOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!TransactionIdIsNormal(xid)) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TransactionIdGetDatum(xid); + nulls[0] = false; + + values[1] = TimestampTzGetDatum(ts); + nulls[1] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + + +/* + * Number of shared CommitTS buffers. 
+ * + * We use a very similar logic as for the number of CLOG buffers; see comments + * in CLOGShmemBuffers. + */ +Size +CommitTsShmemBuffers(void) +{ + return Min(16, Max(4, NBuffers / 1024)); +} + +/* + * Shared memory sizing for CommitTs + */ +Size +CommitTsShmemSize(void) +{ + return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) + + sizeof(CommitTimestampShared); +} + +/* + * Initialize CommitTs at system startup (postmaster start or standalone + * backend) + */ +void +CommitTsShmemInit(void) +{ + bool found; + + CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; + SimpleLruInit(CommitTsCtl, "CommitTs Ctl", CommitTsShmemBuffers(), 0, + CommitTsControlLock, "pg_commit_ts"); + + commitTsShared = ShmemInitStruct("CommitTs shared", + sizeof(CommitTimestampShared), + &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidCommitTsNodeId; + } + else + Assert(found); +} + +/* + * This function must be called ONCE on system install. + * + * (The CommitTs directory is assumed to have been created by initdb, and + * CommitTsShmemInit must have been called already.) + */ +void +BootStrapCommitTs(void) +{ + /* + * Nothing to do here at present, unlike most other SLRU modules; segments + * are created when the server is started with this module enabled. + * See StartupCommitTs. + */ +} + +/* + * Initialize (or reinitialize) a page of CommitTs to zeroes. + * If writeXlog is TRUE, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
+ */ +static int +ZeroCommitTsPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(CommitTsCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCommitTs(void) +{ + TransactionId xid = ShmemVariableCache->nextXid; + int pageno = TransactionIdToCTsPage(xid); + + LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE); + + /* + * Initialize our idea of the latest page number. + */ + CommitTsCtl->shared->latest_page_number = pageno; + + LWLockRelease(CommitTsControlLock); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * when commit timestamp is enabled. Must be called after recovery has + * finished. + * + * This is in charge of creating the currently active segment, if it's not + * already there. The reason for this is that the server might have been + * running with this module disabled for a while and thus might have skipped + * the normal creation point. + */ +void +CompleteCommitTsInitialization(void) +{ + TransactionId xid = ShmemVariableCache->nextXid; + int pageno = TransactionIdToCTsPage(xid); + + /* + * Re-Initialize our idea of the latest page number. + */ + LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE); + CommitTsCtl->shared->latest_page_number = pageno; + LWLockRelease(CommitTsControlLock); + + /* + * If this module is not currently enabled, make sure we don't hand back + * possibly-invalid data; also remove segments of old data. 
+ */ + if (!track_commit_timestamp) + { + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + ShmemVariableCache->oldestCommitTs = InvalidTransactionId; + ShmemVariableCache->newestCommitTs = InvalidTransactionId; + LWLockRelease(CommitTsLock); + + TruncateCommitTs(ReadNewTransactionId()); + + return; + } + + /* + * If CommitTs is enabled, but it wasn't in the previous server run, we + * need to set the oldest and newest values to the next Xid; that way, we + * will not try to read data that might not have been set. + * + * XXX does this have a problem if a server is started with commitTs + * enabled, then started with commitTs disabled, then restarted with it + * enabled again? It doesn't look like it does, because there should be a + * checkpoint that sets the value to InvalidTransactionId at end of + * recovery; and so any chance of injecting new transactions without + * CommitTs values would occur after the oldestCommitTs has been set to + * Invalid temporarily. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTs == InvalidTransactionId) + { + ShmemVariableCache->oldestCommitTs = + ShmemVariableCache->newestCommitTs = ReadNewTransactionId(); + } + LWLockRelease(CommitTsLock); + + /* Finally, create the current segment file, if necessary */ + if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) + { + int slotno; + + LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE); + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + LWLockRelease(CommitTsControlLock); + } +} + +/* + * This must be called ONCE during postmaster or standalone-backend shutdown + */ +void +ShutdownCommitTs(void) +{ + /* Flush dirty CommitTs pages to disk */ + SimpleLruFlush(CommitTsCtl, false); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCommitTs(void) +{ + /* Flush dirty CommitTs pages to disk */ + 
SimpleLruFlush(CommitTsCtl, true); +} + +/* + * Make sure that CommitTs has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty CommitTs or xlog page to make room + * in shared memory. + * + * NB: the current implementation relies on track_commit_timestamp being + * PGC_POSTMASTER. + */ +void +ExtendCommitTs(TransactionId newestXact) +{ + int pageno; + + /* nothing to do if module not enabled */ + if (!track_commit_timestamp) + return; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToCTsEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToCTsPage(newestXact); + + LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCommitTsPage(pageno, !InRecovery); + + LWLockRelease(CommitTsControlLock); +} + +/* + * Remove all CommitTs segments before the one holding the passed + * transaction ID. + * + * Note that we don't need to flush XLOG here. + */ +void +TruncateCommitTs(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. + */ + cutoffPage = TransactionIdToCTsPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, + &cutoffPage)) + return; /* nothing to remove */ + + /* Write XLOG record */ + WriteTruncateXlogRec(cutoffPage); + + /* Now we can remove the old CommitTs segment(s) */ + SimpleLruTruncate(CommitTsCtl, cutoffPage); +} + +/* + * Set the limit values between which commit TS can be consulted. 
+ */
+void
+SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact)
+{
+	/*
+	 * Be careful not to overwrite values that are either further into the
+	 * "future" or signal a disabled committs.
+	 *
+	 * NOTE(review): oldestCommitTs is only ever moved forwards here, but
+	 * newestCommitTs is replaced when newestXact *precedes* the stored value,
+	 * i.e. it is moved backwards.  That looks at odds with the sentence
+	 * above -- confirm against the callers whether this is intended.
+	 */
+	LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+	if (ShmemVariableCache->oldestCommitTs != InvalidTransactionId)
+	{
+		if (TransactionIdPrecedes(ShmemVariableCache->oldestCommitTs, oldestXact))
+			ShmemVariableCache->oldestCommitTs = oldestXact;
+		if (TransactionIdPrecedes(newestXact, ShmemVariableCache->newestCommitTs))
+			ShmemVariableCache->newestCommitTs = newestXact;
+	}
+	else
+	{
+		/* oldest is invalid, so newest must be invalid as well */
+		Assert(ShmemVariableCache->newestCommitTs == InvalidTransactionId);
+	}
+	LWLockRelease(CommitTsLock);
+}
+
+/*
+ * Move forwards the oldest commitTS value that can be consulted
+ *
+ * Does nothing if oldestCommitTs is currently InvalidTransactionId, or if
+ * oldestXact would not advance it.
+ */
+void
+AdvanceOldestCommitTs(TransactionId oldestXact)
+{
+	LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+	if (ShmemVariableCache->oldestCommitTs != InvalidTransactionId &&
+		TransactionIdPrecedes(ShmemVariableCache->oldestCommitTs, oldestXact))
+		ShmemVariableCache->oldestCommitTs = oldestXact;
+	LWLockRelease(CommitTsLock);
+}
+
+
+/*
+ * Decide which of two CommitTs page numbers is "older" for truncation
+ * purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */ +static bool +CommitTsPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId; + xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId; + + return TransactionIdPrecedes(xid1, xid2); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + */ +static void +WriteTruncateXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); +} + +/* + * Write a SETTS xlog record + */ +static void +WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids, + TransactionId *subxids, TimestampTz timestamp, + CommitTsNodeId nodeid) +{ + xl_commit_ts_set record; + + record.timestamp = timestamp; + record.nodeid = nodeid; + record.mainxid = mainxid; + + XLogBeginInsert(); + XLogRegisterData((char *) &record, + offsetof(xl_commit_ts_set, mainxid) + + sizeof(TransactionId)); + XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId)); + XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_SETTS); +} + +/* + * CommitTS resource manager's routines + */ +void +commit_ts_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in commit_ts records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == COMMIT_TS_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE); + + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + + LWLockRelease(CommitTsControlLock); + } + 
else if (info == COMMIT_TS_TRUNCATE) + { + int pageno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + /* + * During XLOG replay, latest_page_number isn't set up yet; insert a + * suitable value to bypass the sanity test in SimpleLruTruncate. + */ + CommitTsCtl->shared->latest_page_number = pageno; + + SimpleLruTruncate(CommitTsCtl, pageno); + } + else if (info == COMMIT_TS_SETTS) + { + xl_commit_ts_set *setts = (xl_commit_ts_set *) XLogRecGetData(record); + int nsubxids; + TransactionId *subxids; + + nsubxids = ((XLogRecGetDataLen(record) - SizeOfCommitTsSet) / + sizeof(TransactionId)); + if (nsubxids > 0) + { + subxids = palloc(sizeof(TransactionId) * nsubxids); + memcpy(subxids, + XLogRecGetData(record) + SizeOfCommitTsSet, + sizeof(TransactionId) * nsubxids); + } + else + subxids = NULL; + + TransactionTreeSetCommitTsData(setts->mainxid, nsubxids, subxids, + setts->timestamp, setts->nodeid, false); + if (subxids) + pfree(subxids); + } + else + elog(PANIC, "commit_ts_redo: unknown op code %u", info); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index befd60f2d3..dcf423bdd7 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -8,6 +8,7 @@ #include "postgres.h" #include "access/clog.h" +#include "access/commit_ts.h" #include "access/gin.h" #include "access/gist_private.h" #include "access/hash.h" diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 1f9a100da8..15596c7c7f 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1297,7 +1297,7 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) len = strlen(clde->d_name); - if ((len == 4 || len == 5) && + if ((len == 4 || len == 5 || len == 6) && strspn(clde->d_name, "0123456789ABCDEF") == len) { segno = (int) strtol(clde->d_name, NULL, 16); diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 
d51cca406c..c541156668 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -14,6 +14,7 @@ #include "postgres.h" #include "access/clog.h" +#include "access/commit_ts.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" @@ -158,9 +159,10 @@ GetNewTransactionId(bool isSubXact) * XID before we zero the page. Fortunately, a page of the commit log * holds 32K or more transactions, so we don't have to do this very often. * - * Extend pg_subtrans too. + * Extend pg_subtrans and pg_commit_ts too. */ ExtendCLOG(xid); + ExtendCommitTs(xid); ExtendSUBTRANS(xid); /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 763e9deb6f..8b2f7140cf 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -20,6 +20,7 @@ #include <time.h> #include <unistd.h> +#include "access/commit_ts.h" #include "access/multixact.h" #include "access/subtrans.h" #include "access/transam.h" @@ -1135,6 +1136,21 @@ RecordTransactionCommit(void) } /* + * We only need to log the commit timestamp separately if the node + * identifier is a valid value; the commit record above already contains + * the timestamp info otherwise, and will be used to load it. + */ + if (markXidCommitted) + { + CommitTsNodeId node_id; + + node_id = CommitTsGetDefaultNodeId(); + TransactionTreeSetCommitTsData(xid, nchildren, children, + xactStopTimestamp, + node_id, node_id != InvalidCommitTsNodeId); + } + + /* * Check if we want to commit asynchronously. We can allow the XLOG flush * to happen asynchronously if synchronous_commit=off, or if the current * transaction has not performed any WAL-logged operation. 
The latter @@ -4644,6 +4660,7 @@ xactGetCommittedChildren(TransactionId **ptr) */ static void xact_redo_commit_internal(TransactionId xid, XLogRecPtr lsn, + TimestampTz commit_time, TransactionId *sub_xids, int nsubxacts, SharedInvalidationMessage *inval_msgs, int nmsgs, RelFileNode *xnodes, int nrels, @@ -4671,6 +4688,10 @@ xact_redo_commit_internal(TransactionId xid, XLogRecPtr lsn, LWLockRelease(XidGenLock); } + /* Set the transaction commit timestamp and metadata */ + TransactionTreeSetCommitTsData(xid, nsubxacts, sub_xids, + commit_time, InvalidCommitTsNodeId, false); + if (standbyState == STANDBY_DISABLED) { /* @@ -4790,7 +4811,8 @@ xact_redo_commit(xl_xact_commit *xlrec, /* invalidation messages array follows subxids */ inval_msgs = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]); - xact_redo_commit_internal(xid, lsn, subxacts, xlrec->nsubxacts, + xact_redo_commit_internal(xid, lsn, xlrec->xact_time, + subxacts, xlrec->nsubxacts, inval_msgs, xlrec->nmsgs, xlrec->xnodes, xlrec->nrels, xlrec->dbId, @@ -4805,7 +4827,8 @@ static void xact_redo_commit_compact(xl_xact_commit_compact *xlrec, TransactionId xid, XLogRecPtr lsn) { - xact_redo_commit_internal(xid, lsn, xlrec->subxacts, xlrec->nsubxacts, + xact_redo_commit_internal(xid, lsn, xlrec->xact_time, + xlrec->subxacts, xlrec->nsubxacts, NULL, 0, /* inval msgs */ NULL, 0, /* relfilenodes */ InvalidOid, /* dbId */ diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a2ad5ebfe8..da28de90db 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -22,6 +22,7 @@ #include <unistd.h> #include "access/clog.h" +#include "access/commit_ts.h" #include "access/multixact.h" #include "access/rewriteheap.h" #include "access/subtrans.h" @@ -4518,6 +4519,8 @@ BootStrapXLOG(void) checkPoint.oldestXidDB = TemplateDbOid; checkPoint.oldestMulti = FirstMultiXactId; checkPoint.oldestMultiDB = TemplateDbOid; + checkPoint.oldestCommitTs = 
InvalidTransactionId; + checkPoint.newestCommitTs = InvalidTransactionId; checkPoint.time = (pg_time_t) time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; @@ -4527,6 +4530,7 @@ BootStrapXLOG(void) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); /* Set up the XLOG page header */ page->xlp_magic = XLOG_PAGE_MAGIC; @@ -4606,6 +4610,7 @@ BootStrapXLOG(void) ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = bootstrap_data_checksum_version; /* some additional ControlFile fields are set in WriteControlFile() */ @@ -4614,6 +4619,7 @@ BootStrapXLOG(void) /* Bootstrap the commit log, too */ BootStrapCLOG(); + BootStrapCommitTs(); BootStrapSUBTRANS(); BootStrapMultiXact(); @@ -5920,6 +5926,10 @@ StartupXLOG(void) ereport(DEBUG1, (errmsg("oldest MultiXactId: %u, in database %u", checkPoint.oldestMulti, checkPoint.oldestMultiDB))); + ereport(DEBUG1, + (errmsg("commit timestamp Xid oldest/newest: %u/%u", + checkPoint.oldestCommitTs, + checkPoint.newestCommitTs))); if (!TransactionIdIsNormal(checkPoint.nextXid)) ereport(PANIC, (errmsg("invalid next transaction ID"))); @@ -5931,6 +5941,8 @@ StartupXLOG(void) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + SetCommitTsLimit(checkPoint.oldestCommitTs, + checkPoint.newestCommitTs); MultiXactSetSafeTruncate(checkPoint.oldestMulti); XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; XLogCtl->ckptXid = checkPoint.nextXid; @@ -6153,11 +6165,12 @@ 
StartupXLOG(void) ProcArrayInitRecovery(ShmemVariableCache->nextXid); /* - * Startup commit log and subtrans only. MultiXact has already - * been started up and other SLRUs are not maintained during - * recovery and need not be started yet. + * Startup commit log, commit timestamp and subtrans only. + * MultiXact has already been started up and other SLRUs are not + * maintained during recovery and need not be started yet. */ StartupCLOG(); + StartupCommitTs(); StartupSUBTRANS(oldestActiveXID); /* @@ -6827,12 +6840,13 @@ StartupXLOG(void) LWLockRelease(ProcArrayLock); /* - * Start up the commit log and subtrans, if not already done for hot - * standby. + * Start up the commit log, commit timestamp and subtrans, if not already + * done for hot standby. */ if (standbyState == STANDBY_DISABLED) { StartupCLOG(); + StartupCommitTs(); StartupSUBTRANS(oldestActiveXID); } @@ -6868,6 +6882,12 @@ StartupXLOG(void) XLogReportParameters(); /* + * Local WAL inserts enabled, so it's time to finish initialization + * of commit timestamp. + */ + CompleteCommitTsInitialization(); + + /* * All done. Allow backends to write WAL. 
(Although the bool flag is * probably atomic in itself, we use the info_lck here to ensure that * there are no race conditions concerning visibility of other recent @@ -7433,6 +7453,7 @@ ShutdownXLOG(int code, Datum arg) CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } ShutdownCLOG(); + ShutdownCommitTs(); ShutdownSUBTRANS(); ShutdownMultiXact(); @@ -7769,6 +7790,11 @@ CreateCheckPoint(int flags) checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB; LWLockRelease(XidGenLock); + LWLockAcquire(CommitTsLock, LW_SHARED); + checkPoint.oldestCommitTs = ShmemVariableCache->oldestCommitTs; + checkPoint.newestCommitTs = ShmemVariableCache->newestCommitTs; + LWLockRelease(CommitTsLock); + /* Increase XID epoch if we've wrapped around since last checkpoint */ checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid) @@ -8046,6 +8072,7 @@ static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointCLOG(); + CheckPointCommitTs(); CheckPointSUBTRANS(); CheckPointMultiXact(); CheckPointPredicate(); @@ -8474,7 +8501,8 @@ XLogReportParameters(void) MaxConnections != ControlFile->MaxConnections || max_worker_processes != ControlFile->max_worker_processes || max_prepared_xacts != ControlFile->max_prepared_xacts || - max_locks_per_xact != ControlFile->max_locks_per_xact) + max_locks_per_xact != ControlFile->max_locks_per_xact || + track_commit_timestamp != ControlFile->track_commit_timestamp) { /* * The change in number of backend slots doesn't need to be WAL-logged @@ -8494,6 +8522,7 @@ XLogReportParameters(void) xlrec.max_locks_per_xact = max_locks_per_xact; xlrec.wal_level = wal_level; xlrec.wal_log_hints = wal_log_hints; + xlrec.track_commit_timestamp = track_commit_timestamp; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); @@ -8508,6 +8537,7 @@ XLogReportParameters(void) ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = 
wal_level; ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; UpdateControlFile(); } } @@ -8884,6 +8914,7 @@ xlog_redo(XLogReaderState *record) ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact; ControlFile->wal_level = xlrec.wal_level; ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; /* * Update minRecoveryPoint to ensure that if recovery is aborted, we |