author     Bernd Helmle    2011-05-31 16:31:49 +0000
committer  Bernd Helmle    2011-05-31 16:31:49 +0000
commit     554cb4427a60fcaf0f6d007f5072a08d985fe779 (patch)
tree       c39487bbadc5fb298d82c014e423254597655a39
parent     b9b93512ec3c880d9081b05791e19244798ffe94 (diff)
parent     13c00ae8c73ee9635c11059925814b351dc3593c (diff)
Merge branch 'master' of ../bernd_pg into notnull_constraint
25 files changed, 524 insertions, 347 deletions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e367c29bd5..39819695d1 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3527,8 +3527,8 @@ local0.* /var/log/postgresql <para> Causes each attempted connection to the server to be logged, as well as successful completion of client authentication. - This parameter can only be set in the <filename>postgresql.conf</> - file or on the server command line. The default is off. + This parameter cannot be changed after session start. + The default is off. </para> <note> @@ -3553,8 +3553,7 @@ local0.* /var/log/postgresql <varname>log_connections</varname> but at session termination, and includes the duration of the session. This is off by default. - This parameter can only be set in the <filename>postgresql.conf</> - file or on the server command line. + This parameter cannot be changed after session start. </para> </listitem> </varlistentry> diff --git a/doc/src/sgml/install-windows.sgml b/doc/src/sgml/install-windows.sgml index 3c9d90ef33..cb8bca9c63 100644 --- a/doc/src/sgml/install-windows.sgml +++ b/doc/src/sgml/install-windows.sgml @@ -150,9 +150,10 @@ $ENV{PATH}=$ENV{PATH} . ';c:\some\where\bison\bin'; <varlistentry> <term><productname>Microsoft Platform SDK</productname></term> <listitem><para> - It is recommended that you upgrade to the latest available version - of the <productname>Microsoft Platform SDK</productname>, available - for download from <ulink url="http://www.microsoft.com/downloads/"></>. + It is recommended that you upgrade to the latest supported version + of the <productname>Microsoft Platform SDK</productname> (currently + version 7.0), available for download from + <ulink url="http://www.microsoft.com/downloads/"></>. </para> <para> You must always include the diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index 8a7b833f0f..47dce43b19 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -169,13 +169,26 @@ PostgreSQL documentation </varlistentry> <varlistentry> + <term><option>-z</option></term> + <term><option>--gzip</option></term> + <listitem> + <para> + Enables gzip compression of tar file output, with the default + compression level. Compression is only available when using + the tar format. + </para> + </listitem> + </varlistentry> + + <varlistentry> <term><option>-Z <replaceable class="parameter">level</replaceable></option></term> <term><option>--compress=<replaceable class="parameter">level</replaceable></option></term> <listitem> <para> - Enables gzip compression of tar file output. Compression is only - available when generating tar files, and is not available when sending - output to standard output. + Enables gzip compression of tar file output, and specifies the + compression level (1 through 9, 9 being best + compression). Compression is only available when using the tar + format. 
</para> </listitem> </varlistentry> @@ -394,11 +407,11 @@ PostgreSQL documentation </para> <para> - To create a backup of the local server with one maximum compressed + To create a backup of the local server with one compressed tar file for each tablespace, and store it in the directory <filename>backup</filename>, showing a progress report while running: <screen> -<prompt>$</prompt> <userinput>pg_basebackup -D backup -Ft -Z9 -P</userinput> +<prompt>$</prompt> <userinput>pg_basebackup -D backup -Ft -z -P</userinput> </screen> </para> diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 346d6b964d..01a492e496 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1529,7 +1529,6 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, OffsetNumber offnum; bool at_chain_start; bool valid; - bool match_found; if (all_dead) *all_dead = true; @@ -1539,7 +1538,6 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(tid); at_chain_start = true; - match_found = false; /* Scan through possible multiple members of HOT-chain */ for (;;) @@ -1597,10 +1595,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, PredicateLockTuple(relation, &heapTuple); if (all_dead) *all_dead = false; - if (IsolationIsSerializable()) - match_found = true; - else - return true; + return true; } /* @@ -1629,7 +1624,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, break; /* end of chain */ } - return match_found; + return false; } /* @@ -2855,12 +2850,6 @@ l2: END_CRIT_SECTION(); - /* - * Any existing SIREAD locks on the old tuple must be linked to the new - * tuple for conflict detection purposes. - */ - PredicateLockTupleRowVersionLink(relation, &oldtup, heaptup); - if (newbuf != buffer) LockBuffer(newbuf, BUFFER_LOCK_UNLOCK); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 08de8b4f88..27c37d6173 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -612,8 +612,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) * any more members. Otherwise, check for continuation of the * HOT-chain, and set state for next time. */ - if (IsMVCCSnapshot(scan->xs_snapshot) - && !IsolationIsSerializable()) + if (IsMVCCSnapshot(scan->xs_snapshot)) scan->xs_next_hot = InvalidOffsetNumber; else if (HeapTupleIsHotUpdated(heapTuple)) { diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 0568a1bcf8..fa84989fc6 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -84,8 +84,7 @@ static MemoryContext anl_context = NULL; static BufferAccessStrategy vac_strategy; -static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, - bool update_reltuples, bool inh); +static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh); static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize); static bool BlockSampler_HasMore(BlockSampler bs); @@ -115,18 +114,9 @@ static bool std_typanalyze(VacAttrStats *stats); /* * analyze_rel() -- analyze one relation - * - * If update_reltuples is true, we update reltuples and relpages columns - * in pg_class. Caller should pass false if we're part of VACUUM ANALYZE, - * and the VACUUM didn't skip any pages. 
We only have an approximate count, - * so we don't want to overwrite the accurate values already inserted by the - * VACUUM in that case. VACUUM always scans all indexes, however, so the - * pg_class entries for indexes are never updated if we're part of VACUUM - * ANALYZE. */ void -analyze_rel(Oid relid, VacuumStmt *vacstmt, - BufferAccessStrategy bstrategy, bool update_reltuples) +analyze_rel(Oid relid, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy) { Relation onerel; @@ -238,13 +228,13 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt, /* * Do the normal non-recursive ANALYZE. */ - do_analyze_rel(onerel, vacstmt, update_reltuples, false); + do_analyze_rel(onerel, vacstmt, false); /* * If there are child tables, do recursive ANALYZE. */ if (onerel->rd_rel->relhassubclass) - do_analyze_rel(onerel, vacstmt, false, true); + do_analyze_rel(onerel, vacstmt, true); /* * Close source relation now, but keep lock so that no one deletes it @@ -267,8 +257,7 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt, * do_analyze_rel() -- analyze one relation, recursively or not */ static void -do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, - bool update_reltuples, bool inh) +do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh) { int attr_cnt, tcnt, @@ -437,9 +426,9 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, } /* - * Quit if no analyzable columns and no pg_class update needed. + * Quit if no analyzable columns. */ - if (attr_cnt <= 0 && !analyzableindex && !update_reltuples) + if (attr_cnt <= 0 && !analyzableindex) goto cleanup; /* @@ -549,10 +538,10 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, } /* - * Update pages/tuples stats in pg_class, but not if we're inside a VACUUM - * that got a more precise number. + * Update pages/tuples stats in pg_class ... but not if we're doing + * inherited stats. */ - if (update_reltuples) + if (!inh) vac_update_relstats(onerel, RelationGetNumberOfBlocks(onerel), totalrows, hasindex, InvalidTransactionId); @@ -562,7 +551,7 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, * VACUUM ANALYZE, don't overwrite the accurate count already inserted by * VACUUM. */ - if (!(vacstmt->options & VACOPT_VACUUM)) + if (!inh && !(vacstmt->options & VACOPT_VACUUM)) { for (ind = 0; ind < nindexes; ind++) { @@ -577,13 +566,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, } /* - * Report ANALYZE to the stats collector, too; likewise, tell it to adopt - * these numbers only if we're not inside a VACUUM that got a better - * number. However, a call with inh = true shouldn't reset the stats. + * Report ANALYZE to the stats collector, too. However, if doing + * inherited stats we shouldn't report, because the stats collector only + * tracks per-table stats. */ if (!inh) - pgstat_report_analyze(onerel, update_reltuples, - totalrows, totaldeadrows); + pgstat_report_analyze(onerel, totalrows, totaldeadrows); /* We skip to here if there were no analyzable columns */ cleanup: @@ -1243,18 +1231,19 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows, qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); /* - * Estimate total numbers of rows in relation. + * Estimate total numbers of rows in relation. For live rows, use + * vac_estimate_reltuples; for dead rows, we have no source of old + * information, so we have to assume the density is the same in unseen + * pages as in the pages we scanned. 
*/ + *totalrows = vac_estimate_reltuples(onerel, true, + totalblocks, + bs.m, + liverows); if (bs.m > 0) - { - *totalrows = floor((liverows * totalblocks) / bs.m + 0.5); - *totaldeadrows = floor((deadrows * totalblocks) / bs.m + 0.5); - } + *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5); else - { - *totalrows = 0.0; *totaldeadrows = 0.0; - } /* * Emit some interesting relation info diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 9606569617..224c34f6e7 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -20,6 +20,8 @@ */ #include "postgres.h" +#include <math.h> + #include "access/clog.h" #include "access/genam.h" #include "access/heapam.h" @@ -62,7 +64,7 @@ static BufferAccessStrategy vac_strategy; static List *get_rel_oids(Oid relid, const RangeVar *vacrel); static void vac_truncate_clog(TransactionId frozenXID); static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, - bool for_wraparound, bool *scanned_all); + bool for_wraparound); /* @@ -219,12 +221,10 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast, foreach(cur, relations) { Oid relid = lfirst_oid(cur); - bool scanned_all = false; if (vacstmt->options & VACOPT_VACUUM) { - if (!vacuum_rel(relid, vacstmt, do_toast, for_wraparound, - &scanned_all)) + if (!vacuum_rel(relid, vacstmt, do_toast, for_wraparound)) continue; } @@ -241,7 +241,7 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast, PushActiveSnapshot(GetTransactionSnapshot()); } - analyze_rel(relid, vacstmt, vac_strategy, !scanned_all); + analyze_rel(relid, vacstmt, vac_strategy); if (use_own_xacts) { @@ -454,6 +454,79 @@ vacuum_set_xid_limits(int freeze_min_age, /* + * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples + * + * If we scanned the whole relation then we should just use the count of + * live tuples seen; but if we did not, we should not trust the count + * unreservedly, especially not in VACUUM, which may have scanned a quite + * nonrandom subset of the table. When we have only partial information, + * we take the old value of pg_class.reltuples as a measurement of the + * tuple density in the unscanned pages. + * + * This routine is shared by VACUUM and ANALYZE. + */ +double +vac_estimate_reltuples(Relation relation, bool is_analyze, + BlockNumber total_pages, + BlockNumber scanned_pages, + double scanned_tuples) +{ + BlockNumber old_rel_pages = relation->rd_rel->relpages; + double old_rel_tuples = relation->rd_rel->reltuples; + double old_density; + double new_density; + double multiplier; + double updated_density; + + /* If we did scan the whole table, just use the count as-is */ + if (scanned_pages >= total_pages) + return scanned_tuples; + + /* + * If scanned_pages is zero but total_pages isn't, keep the existing + * value of reltuples. + */ + if (scanned_pages == 0) + return old_rel_tuples; + + /* + * If old value of relpages is zero, old density is indeterminate; we + * can't do much except scale up scanned_tuples to match total_pages. + */ + if (old_rel_pages == 0) + return floor((scanned_tuples / scanned_pages) * total_pages + 0.5); + + /* + * Okay, we've covered the corner cases. The normal calculation is to + * convert the old measurement to a density (tuples per page), then + * update the density using an exponential-moving-average approach, + * and finally compute reltuples as updated_density * total_pages. + * + * For ANALYZE, the moving average multiplier is just the fraction of + * the table's pages we scanned. 
This is equivalent to assuming + * that the tuple density in the unscanned pages didn't change. Of + * course, it probably did, if the new density measurement is different. + * But over repeated cycles, the value of reltuples will converge towards + * the correct value, if repeated measurements show the same new density. + * + * For VACUUM, the situation is a bit different: we have looked at a + * nonrandom sample of pages, but we know for certain that the pages we + * didn't look at are precisely the ones that haven't changed lately. + * Thus, there is a reasonable argument for doing exactly the same thing + * as for the ANALYZE case, that is use the old density measurement as + * the value for the unscanned pages. + * + * This logic could probably use further refinement. + */ + old_density = old_rel_tuples / old_rel_pages; + new_density = scanned_tuples / scanned_pages; + multiplier = (double) scanned_pages / (double) total_pages; + updated_density = old_density + (new_density - old_density) * multiplier; + return floor(updated_density * total_pages + 0.5); +} + + +/* * vac_update_relstats() -- update statistics for one relation * * Update the whole-relation statistics that are kept in its pg_class @@ -480,7 +553,7 @@ vacuum_set_xid_limits(int freeze_min_age, * somebody vacuuming pg_class might think they could delete a tuple * marked with xmin = our xid. * - * This routine is shared by VACUUM and stand-alone ANALYZE. + * This routine is shared by VACUUM and ANALYZE. */ void vac_update_relstats(Relation relation, @@ -758,14 +831,10 @@ vac_truncate_clog(TransactionId frozenXID) * many small transactions. Otherwise, two-phase locking would require * us to lock the entire database during one pass of the vacuum cleaner. * - * We'll return true in *scanned_all if the vacuum scanned all heap - * pages, and updated pg_class. - * * At entry and exit, we are not inside a transaction. */ static bool -vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, - bool *scanned_all) +vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound) { LOCKMODE lmode; Relation onerel; @@ -775,9 +844,6 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, int save_sec_context; int save_nestlevel; - if (scanned_all) - *scanned_all = false; - /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); @@ -971,7 +1037,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, vacstmt->freeze_min_age, vacstmt->freeze_table_age); } else - lazy_vacuum_rel(onerel, vacstmt, vac_strategy, scanned_all); + lazy_vacuum_rel(onerel, vacstmt, vac_strategy); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -997,7 +1063,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, * totally unimportant for toast relations. */ if (toast_relid != InvalidOid) - vacuum_rel(toast_relid, vacstmt, false, for_wraparound, NULL); + vacuum_rel(toast_relid, vacstmt, false, for_wraparound); /* * Now release the session-level lock on the master table. diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 9393fa0727..ce5fa18066 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -77,17 +77,18 @@ * Before we consider skipping a page that's marked as clean in * visibility map, we must've seen at least this many clean pages. 
*/ -#define SKIP_PAGES_THRESHOLD 32 +#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) typedef struct LVRelStats { /* hasindex = true means two-pass strategy; false means one-pass */ bool hasindex; - bool scanned_all; /* have we scanned all pages (this far)? */ /* Overall statistics about rel */ - BlockNumber rel_pages; + BlockNumber rel_pages; /* total number of pages */ + BlockNumber scanned_pages; /* number of pages we examined */ + double scanned_tuples; /* counts only tuples on scanned pages */ double old_rel_tuples; /* previous value of pg_class.reltuples */ - double rel_tuples; /* counts only tuples on scanned pages */ + double new_rel_tuples; /* new estimated total # of tuples */ BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ @@ -143,7 +144,7 @@ static int vac_cmp_itemptr(const void *left, const void *right); */ void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, - BufferAccessStrategy bstrategy, bool *scanned_all) + BufferAccessStrategy bstrategy) { LVRelStats *vacrelstats; Relation *Irel; @@ -175,7 +176,6 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); - vacrelstats->scanned_all = true; /* will be cleared if we skip a page */ vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; @@ -205,24 +205,20 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, FreeSpaceMapVacuum(onerel); /* - * Update statistics in pg_class. But only if we didn't skip any pages; - * the tuple count only includes tuples from the pages we've visited, and - * we haven't frozen tuples in unvisited pages either. The page count is - * accurate in any case, but because we use the reltuples / relpages ratio - * in the planner, it's better to not update relpages either if we can't - * update reltuples. + * Update statistics in pg_class. But don't change relfrozenxid if we + * skipped any pages. */ - if (vacrelstats->scanned_all) - vac_update_relstats(onerel, - vacrelstats->rel_pages, vacrelstats->rel_tuples, - vacrelstats->hasindex, - FreezeLimit); + vac_update_relstats(onerel, + vacrelstats->rel_pages, vacrelstats->new_rel_tuples, + vacrelstats->hasindex, + (vacrelstats->scanned_pages < vacrelstats->rel_pages) ? 
+ InvalidTransactionId : + FreezeLimit); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, - vacrelstats->scanned_all, - vacrelstats->rel_tuples); + vacrelstats->new_rel_tuples); /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) @@ -239,13 +235,12 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, - vacrelstats->pages_removed, vacrelstats->rel_pages, - vacrelstats->tuples_deleted, vacrelstats->rel_tuples, + vacrelstats->pages_removed, + vacrelstats->rel_pages, + vacrelstats->tuples_deleted, + vacrelstats->new_rel_tuples, pg_rusage_show(&ru0)))); } - - if (scanned_all) - *scanned_all = vacrelstats->scanned_all; } /* @@ -301,7 +296,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, HeapTupleData tuple; char *relname; BlockNumber empty_pages, - scanned_pages, vacuumed_pages; double num_tuples, tups_vacuumed, @@ -311,7 +305,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; - BlockNumber all_visible_streak; + BlockNumber next_not_all_visible_block; + bool skipping_all_visible_blocks; pg_rusage_init(&ru0); @@ -321,7 +316,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, get_namespace_name(RelationGetNamespace(onerel)), relname))); - empty_pages = vacuumed_pages = scanned_pages = 0; + empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) @@ -329,12 +324,47 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; + vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); - all_visible_streak = 0; + /* + * We want to skip pages that don't require vacuuming according to the + * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD + * consecutive pages. Since we're reading sequentially, the OS should be + * doing readahead for us, so there's no gain in skipping a page now and + * then; that's likely to disable readahead and so be counterproductive. + * Also, skipping even a single page means that we can't update + * relfrozenxid, so we only want to do it if we can skip a goodly number + * of pages. + * + * Before entering the main loop, establish the invariant that + * next_not_all_visible_block is the next block number >= blkno that's + * not all-visible according to the visibility map, or nblocks if there's + * no such block. Also, we set up the skipping_all_visible_blocks flag, + * which is needed because we need hysteresis in the decision: once we've + * started skipping blocks, we may as well skip everything up to the next + * not-all-visible block. + * + * Note: if scan_all is true, we won't actually skip any pages; but we + * maintain next_not_all_visible_block anyway, so as to set up the + * all_visible_according_to_vm flag correctly for each page. 
+ */ + for (next_not_all_visible_block = 0; + next_not_all_visible_block < nblocks; + next_not_all_visible_block++) + { + if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) + break; + vacuum_delay_point(); + } + if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) + skipping_all_visible_blocks = true; + else + skipping_all_visible_blocks = false; + for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; @@ -347,41 +377,45 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; - bool all_visible_according_to_vm = false; + bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; - /* - * Skip pages that don't require vacuuming according to the visibility - * map. But only if we've seen a streak of at least - * SKIP_PAGES_THRESHOLD pages marked as clean. Since we're reading - * sequentially, the OS should be doing readahead for us and there's - * no gain in skipping a page now and then. You need a longer run of - * consecutive skipped pages before it's worthwhile. Also, skipping - * even a single page means that we can't update relfrozenxid or - * reltuples, so we only want to do it if there's a good chance to - * skip a goodly number of pages. - */ - if (!scan_all) + if (blkno == next_not_all_visible_block) { - all_visible_according_to_vm = - visibilitymap_test(onerel, blkno, &vmbuffer); - if (all_visible_according_to_vm) + /* Time to advance next_not_all_visible_block */ + for (next_not_all_visible_block++; + next_not_all_visible_block < nblocks; + next_not_all_visible_block++) { - all_visible_streak++; - if (all_visible_streak >= SKIP_PAGES_THRESHOLD) - { - vacrelstats->scanned_all = false; - continue; - } + if (!visibilitymap_test(onerel, next_not_all_visible_block, + &vmbuffer)) + break; + vacuum_delay_point(); } + + /* + * We know we can't skip the current block. But set up + * skipping_all_visible_blocks to do the right thing at the + * following blocks. + */ + if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) + skipping_all_visible_blocks = true; else - all_visible_streak = 0; + skipping_all_visible_blocks = false; + all_visible_according_to_vm = false; + } + else + { + /* Current block is all-visible */ + if (skipping_all_visible_blocks && !scan_all) + continue; + all_visible_according_to_vm = true; } vacuum_delay_point(); - scanned_pages++; + vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple @@ -764,9 +798,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, } /* save stats for use later */ - vacrelstats->rel_tuples = num_tuples; + vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; + /* now we can compute the new value for pg_class.reltuples */ + vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, + nblocks, + vacrelstats->scanned_pages, + num_tuples); + /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? 
*/ if (vacrelstats->num_dead_tuples > 0) @@ -805,7 +845,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), - tups_vacuumed, num_tuples, scanned_pages, nblocks), + tups_vacuumed, num_tuples, + vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" @@ -977,10 +1018,9 @@ lazy_cleanup_index(Relation indrel, ivinfo.index = indrel; ivinfo.analyze_only = false; - ivinfo.estimated_count = !vacrelstats->scanned_all; + ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages); ivinfo.message_level = elevel; - /* use rel_tuples only if we scanned all pages, else fall back */ - ivinfo.num_heap_tuples = vacrelstats->scanned_all ? vacrelstats->rel_tuples : vacrelstats->old_rel_tuples; + ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; ivinfo.strategy = vac_strategy; stats = index_vacuum_cleanup(&ivinfo, stats); @@ -1041,8 +1081,13 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) new_rel_pages = RelationGetNumberOfBlocks(onerel); if (new_rel_pages != old_rel_pages) { - /* might as well use the latest news when we update pg_class stats */ - vacrelstats->rel_pages = new_rel_pages; + /* + * Note: we intentionally don't update vacrelstats->rel_pages with + * the new rel size here. If we did, it would amount to assuming that + * the new pages are empty, which is unlikely. Leaving the numbers + * alone amounts to assuming that the new pages have the same tuple + * density as existing ones, which is less unlikely. + */ UnlockRelation(onerel, AccessExclusiveLock); return; } @@ -1076,7 +1121,11 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) */ UnlockRelation(onerel, AccessExclusiveLock); - /* update statistics */ + /* + * Update statistics. Here, it *is* correct to adjust rel_pages without + * also touching reltuples, since the tuple count wasn't changed by the + * truncation. 
+ */ vacrelstats->rel_pages = new_rel_pages; vacrelstats->pages_removed = old_rel_pages - new_rel_pages; diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 618f007827..e6ab659f4b 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -1788,7 +1788,7 @@ auth_peer(hbaPort *port) char ident_user[IDENT_USERNAME_MAX + 1]; #if defined(HAVE_GETPEEREID) - /* OpenBSD style: */ + /* OpenBSD (also Mac OS X) style: use getpeereid() */ uid_t uid; gid_t gid; struct passwd *pass; @@ -1843,7 +1843,7 @@ auth_peer(hbaPort *port) strlcpy(ident_user, pass->pw_name, IDENT_USERNAME_MAX + 1); #elif defined(HAVE_GETPEERUCRED) - /* Solaris > 10 */ + /* Solaris > 10: use getpeerucred() */ uid_t uid; struct passwd *pass; ucred_t *ucred; @@ -1878,9 +1878,7 @@ auth_peer(hbaPort *port) strlcpy(ident_user, pass->pw_name, IDENT_USERNAME_MAX + 1); #elif defined(HAVE_STRUCT_CMSGCRED) || defined(HAVE_STRUCT_FCRED) || (defined(HAVE_STRUCT_SOCKCRED) && defined(LOCAL_CREDS)) - struct msghdr msg; - -/* Credentials structure */ + /* Assorted BSDen: use a credentials control message */ #if defined(HAVE_STRUCT_CMSGCRED) typedef struct cmsgcred Cred; @@ -1894,36 +1892,35 @@ auth_peer(hbaPort *port) #define cruid sc_uid #endif - Cred *cred; - - /* Compute size without padding */ - char cmsgmem[ALIGN(sizeof(struct cmsghdr)) + ALIGN(sizeof(Cred))]; /* for NetBSD */ - - /* Point to start of first structure */ - struct cmsghdr *cmsg = (struct cmsghdr *) cmsgmem; + struct msghdr msg; + struct cmsghdr *cmsg; + union + { + struct cmsghdr hdr; + unsigned char buf[CMSG_SPACE(sizeof(Cred))]; + } cmsgbuf; struct iovec iov; char buf; + Cred *cred; struct passwd *pw; - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = (char *) cmsg; - msg.msg_controllen = sizeof(cmsgmem); - memset(cmsg, 0, sizeof(cmsgmem)); - /* - * The one character which is received here is not meaningful; its - * purposes is only to make sure that recvmsg() blocks long enough for the - * other side to send its credentials. + * The one character that is received here is not meaningful; its purpose + * is only to make sure that recvmsg() blocks long enough for the other + * side to send its credentials. 
*/ iov.iov_base = &buf; iov.iov_len = 1; - if (recvmsg(port->sock, &msg, 0) < 0 || - cmsg->cmsg_len < sizeof(cmsgmem) || - cmsg->cmsg_type != SCM_CREDS) + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = &cmsgbuf.buf; + msg.msg_controllen = sizeof(cmsgbuf.buf); + memset(&cmsgbuf, 0, sizeof(cmsgbuf)); + + if (recvmsg(port->sock, &msg, 0) < 0) { ereport(LOG, (errcode_for_socket_access(), @@ -1931,6 +1928,19 @@ auth_peer(hbaPort *port) return STATUS_ERROR; } + cmsg = CMSG_FIRSTHDR(&msg); + if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC) || + cmsg == NULL || + cmsg->cmsg_len < CMSG_LEN(sizeof(Cred)) || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_CREDS) + { + ereport(LOG, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not get peer credentials: incorrect control message"))); + return STATUS_ERROR; + } + cred = (Cred *) CMSG_DATA(cmsg); pw = getpwuid(cred->cruid); diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index c17863fce5..f3a3b6e2cc 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -824,7 +824,16 @@ parse_hba_line(List *line, int line_num, HbaLine *parsedline) token = lfirst(line_item); if (strcmp(token, "local") == 0) { +#ifdef HAVE_UNIX_SOCKETS parsedline->conntype = ctLocal; +#else + ereport(LOG, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("local connections are not supported by this build"), + errcontext("line %d of configuration file \"%s\"", + line_num, HbaFileName))); + return false; +#endif } else if (strcmp(token, "host") == 0 || strcmp(token, "hostssl") == 0 diff --git a/src/backend/libpq/pg_hba.conf.sample b/src/backend/libpq/pg_hba.conf.sample index f913b9aabb..0a50905eff 100644 --- a/src/backend/libpq/pg_hba.conf.sample +++ b/src/backend/libpq/pg_hba.conf.sample @@ -84,6 +84,6 @@ host all all 127.0.0.1/32 @authmethod@ host all all ::1/128 @authmethod@ # Allow replication connections from localhost, by a user with the # replication privilege. -#local replication @default_username@ @authmethod@ +@remove-line-for-nolocal@#local replication @default_username@ @authmethodlocal@ #host replication @default_username@ 127.0.0.1/32 @authmethod@ #host replication @default_username@ ::1/128 @authmethod@ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5ed6e8337c..1d80c311d8 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1246,8 +1246,7 @@ pgstat_report_autovac(Oid dboid) * --------- */ void -pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts, - PgStat_Counter tuples) +pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter tuples) { PgStat_MsgVacuum msg; @@ -1257,7 +1256,6 @@ pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts, pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM); msg.m_databaseid = shared ? InvalidOid : MyDatabaseId; msg.m_tableoid = tableoid; - msg.m_adopt_counts = adopt_counts; msg.m_autovacuum = IsAutoVacuumWorkerProcess(); msg.m_vacuumtime = GetCurrentTimestamp(); msg.m_tuples = tuples; @@ -1271,7 +1269,7 @@ pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts, * -------- */ void -pgstat_report_analyze(Relation rel, bool adopt_counts, +pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples) { PgStat_MsgAnalyze msg; @@ -1308,7 +1306,6 @@ pgstat_report_analyze(Relation rel, bool adopt_counts, pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); msg.m_databaseid = rel->rd_rel->relisshared ? 
InvalidOid : MyDatabaseId; msg.m_tableoid = RelationGetRelid(rel); - msg.m_adopt_counts = adopt_counts; msg.m_autovacuum = IsAutoVacuumWorkerProcess(); msg.m_analyzetime = GetCurrentTimestamp(); msg.m_live_tuples = livetuples; @@ -4197,8 +4194,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - if (msg->m_adopt_counts) - tabentry->n_live_tuples = msg->m_tuples; + tabentry->n_live_tuples = msg->m_tuples; /* Resetting dead_tuples to 0 is an approximation ... */ tabentry->n_dead_tuples = 0; @@ -4233,11 +4229,8 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - if (msg->m_adopt_counts) - { - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; - } + tabentry->n_live_tuples = msg->m_live_tuples; + tabentry->n_dead_tuples = msg->m_dead_tuples; /* * We reset changes_since_analyze to zero, forgetting any changes that diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI index a2bb63e3f8..5617182a44 100644 --- a/src/backend/storage/lmgr/README-SSI +++ b/src/backend/storage/lmgr/README-SSI @@ -402,6 +402,54 @@ is based on the top level xid. When looking at an xid that comes from a tuple's xmin or xmax, for example, we always call SubTransGetTopmostTransaction() before doing much else with it. + * PostgreSQL does not use "update in place" with a rollback log +for its MVCC implementation. Where possible it uses "HOT" updates on +the same page (if there is room and no indexed value is changed). +For non-HOT updates the old tuple is expired in place and a new tuple +is inserted at a new location. Because of this difference, a tuple +lock in PostgreSQL doesn't automatically lock any other versions of a +row. We don't try to copy or expand a tuple lock to any other +versions of the row, based on the following proof that any additional +serialization failures we would get from that would be false +positives: + + o If transaction T1 reads a row (thus acquiring a predicate +lock on it) and a second transaction T2 updates that row, must a +third transaction T3 which updates the new version of the row have a +rw-conflict in from T1 to prevent anomalies? In other words, does it +matter whether this edge T1 -> T3 is there? + + o If T1 has a conflict in, it certainly doesn't. Adding the +edge T1 -> T3 would create a dangerous structure, but we already had +one from the edge T1 -> T2, so we would have aborted something +anyway. + + o Now let's consider the case where T1 doesn't have a +conflict in. If that's the case, for this edge T1 -> T3 to make a +difference, T3 must have a rw-conflict out that induces a cycle in +the dependency graph, i.e. a conflict out to some transaction +preceding T1 in the serial order. (A conflict out to T1 would work +too, but that would mean T1 has a conflict in and we would have +rolled back.) + + o So now we're trying to figure out if there can be an +rw-conflict edge T3 -> T0, where T0 is some transaction that precedes +T1. For T0 to precede T1, there has to be has to be some edge, or +sequence of edges, from T0 to T1. At least the last edge has to be a +wr-dependency or ww-dependency rather than a rw-conflict, because T1 +doesn't have a rw-conflict in. 
And that gives us enough information +about the order of transactions to see that T3 can't have a +rw-dependency to T0: + - T0 committed before T1 started (the wr/ww-dependency implies this) + - T1 started before T2 committed (the T1->T2 rw-conflict implies this) + - T2 committed before T3 started (otherwise, T3 would be aborted + because of an update conflict) + + o That means T0 committed before T3 started, and therefore +there can't be a rw-conflict from T3 to T0. + + o In both cases, we didn't need the T1 -> T3 edge. + * Predicate locking in PostgreSQL will start at the tuple level when possible, with automatic conversion of multiple fine-grained locks to coarser granularity as need to avoid resource exhaustion. diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index dc53a7ab5b..5670f31f58 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -155,9 +155,6 @@ * BlockNumber newblkno); * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, * BlockNumber newblkno); - * PredicateLockTupleRowVersionLink(const Relation relation, - * const HeapTuple oldTuple, - * const HeapTuple newTuple) * ReleasePredicateLocks(bool isCommit) * * conflict detection (may also trigger rollback) @@ -2252,90 +2249,6 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple) PredicateLockAcquire(&tag); } -/* - * If the old tuple has any predicate locks, copy them to the new target. - * - * This is called at an UPDATE, where any predicate locks held on the old - * tuple need to be copied to the new tuple, because logically they both - * represent the same row. A lock taken before the update must conflict - * with anyone locking the same row after the update. - */ -void -PredicateLockTupleRowVersionLink(const Relation relation, - const HeapTuple oldTuple, - const HeapTuple newTuple) -{ - PREDICATELOCKTARGETTAG oldtupletag; - PREDICATELOCKTARGETTAG oldpagetag; - PREDICATELOCKTARGETTAG newtupletag; - BlockNumber oldblk, - newblk; - OffsetNumber oldoff, - newoff; - TransactionId oldxmin, - newxmin; - - /* - * Bail out quickly if there are no serializable transactions - * running. - * - * It's safe to do this check without taking any additional - * locks. Even if a serializable transaction starts concurrently, - * we know it can't take any SIREAD locks on the modified tuple - * because the caller is holding the associated buffer page lock. - * Memory reordering isn't an issue; the memory barrier in the - * LWLock acquisition guarantees that this read occurs while the - * buffer page lock is held. - */ - if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) - return; - - oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self)); - oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self)); - oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data); - - newblk = ItemPointerGetBlockNumber(&(newTuple->t_self)); - newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self)); - newxmin = HeapTupleHeaderGetXmin(newTuple->t_data); - - SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag, - relation->rd_node.dbNode, - relation->rd_id, - oldblk, - oldoff, - oldxmin); - - SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag, - relation->rd_node.dbNode, - relation->rd_id, - oldblk); - - SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag, - relation->rd_node.dbNode, - relation->rd_id, - newblk, - newoff, - newxmin); - - /* - * A page-level lock on the page containing the old tuple counts too. 
- * Anyone holding a lock on the page is logically holding a lock on the - * old tuple, so we need to acquire a lock on his behalf on the new tuple - * too. However, if the new tuple is on the same page as the old one, the - * old page-level lock already covers the new tuple. - * - * A relation-level lock always covers both tuple versions, so we don't - * need to worry about those here. - */ - LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); - - TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false); - if (newblk != oldblk) - TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false); - - LWLockRelease(SerializablePredicateLockListLock); -} - /* * DeleteLockTarget @@ -2650,9 +2563,15 @@ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, /* * Bail out quickly if there are no serializable transactions - * running. As with PredicateLockTupleRowVersionLink, it's safe to - * check this without taking locks because the caller is holding - * the buffer page lock. + * running. + * + * It's safe to do this check without taking any additional + * locks. Even if a serializable transaction starts concurrently, + * we know it can't take any SIREAD locks on the page being split + * because the caller is holding the associated buffer page lock. + * Memory reordering isn't an issue; the memory barrier in the + * LWLock acquisition guarantees that this read occurs while the + * buffer page lock is held. */ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) return; @@ -3890,8 +3809,21 @@ FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) } /* - * Check whether we should roll back one of these transactions - * instead of flagging a new rw-conflict. + * We are about to add a RW-edge to the dependency graph - check that we don't + * introduce a dangerous structure by doing so, and abort one of the + * transactions if so. + * + * A serialization failure can only occur if there is a dangerous structure + * in the dependency graph: + * + * Tin ------> Tpivot ------> Tout + * rw rw + * + * Furthermore, Tout must commit first. + * + * One more optimization is that if Tin is declared READ ONLY (or commits + * without writing), we can only have a problem if Tout committed before Tin + * acquired its snapshot. */ static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, @@ -3905,100 +3837,134 @@ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, failure = false; /* - * Check for already-committed writer with rw-conflict out flagged. This - * means that the reader must immediately fail. + * Check for already-committed writer with rw-conflict out flagged + * (conflict-flag on W means that T2 committed before W): + * + * R ------> W ------> T2 + * rw rw + * + * That is a dangerous structure, so we must abort. (Since the writer + * has already committed, we must be the reader) */ if (SxactIsCommitted(writer) && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer))) failure = true; /* - * Check whether the reader has become a pivot with a committed writer. If - * so, we must roll back unless every in-conflict either committed before - * the writer committed or is READ ONLY and overlaps the writer. 
+ * Check whether the writer has become a pivot with an out-conflict + * committed transaction (T2), and T2 committed first: + * + * R ------> W ------> T2 + * rw rw + * + * Because T2 must've committed first, there is no anomaly if: + * - the reader committed before T2 + * - the writer committed before T2 + * - the reader is a READ ONLY transaction and the reader was not + * concurrent with T2 (= reader acquired its snapshot after T2 committed) */ - if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader)) + if (!failure) { - if (SxactHasSummaryConflictIn(reader)) + if (SxactHasSummaryConflictOut(writer)) { failure = true; conflict = NULL; } else conflict = (RWConflict) - SHMQueueNext(&reader->inConflicts, - &reader->inConflicts, - offsetof(RWConflictData, inLink)); + SHMQueueNext(&writer->outConflicts, + &writer->outConflicts, + offsetof(RWConflictData, outLink)); while (conflict) { - if (!SxactIsRolledBack(conflict->sxactOut) - && (!SxactIsCommitted(conflict->sxactOut) - || conflict->sxactOut->commitSeqNo >= writer->commitSeqNo) - && (!SxactIsReadOnly(conflict->sxactOut) - || conflict->sxactOut->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo)) + SERIALIZABLEXACT *t2 = conflict->sxactIn; + + if (SxactIsCommitted(t2) + && (!SxactIsCommitted(reader) + || t2->commitSeqNo <= reader->commitSeqNo) + && (!SxactIsCommitted(writer) + || t2->commitSeqNo <= writer->commitSeqNo) + && (!SxactIsReadOnly(reader) + || t2->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)) { failure = true; break; } conflict = (RWConflict) - SHMQueueNext(&reader->inConflicts, - &conflict->inLink, - offsetof(RWConflictData, inLink)); + SHMQueueNext(&writer->outConflicts, + &conflict->outLink, + offsetof(RWConflictData, outLink)); } } /* - * Check whether the writer has become a pivot with an out-conflict - * committed transaction, while neither reader nor writer is committed. If - * the reader is a READ ONLY transaction, there is only a serialization - * failure if an out-conflict transaction causing the pivot committed - * before the reader acquired its snapshot. (That is, the reader must not - * have been concurrent with the out-conflict transaction.) 
+ * Check whether the reader has become a pivot with a committed writer: + * + * T0 ------> R ------> W + * rw rw + * + * Because W must've committed first for an anomaly to occur, there is no + * anomaly if: + * - T0 committed before the writer + * - T0 is READ ONLY, and overlaps the writer */ - if (!failure && !SxactIsCommitted(writer)) + if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader)) { - if (SxactHasSummaryConflictOut(reader)) + if (SxactHasSummaryConflictIn(reader)) { failure = true; conflict = NULL; } else conflict = (RWConflict) - SHMQueueNext(&writer->outConflicts, - &writer->outConflicts, - offsetof(RWConflictData, outLink)); + SHMQueueNext(&reader->inConflicts, + &reader->inConflicts, + offsetof(RWConflictData, inLink)); while (conflict) { - if ((reader == conflict->sxactIn && SxactIsCommitted(reader)) - || (SxactIsCommitted(conflict->sxactIn) - && !SxactIsCommitted(reader) - && (!SxactIsReadOnly(reader) - || conflict->sxactIn->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))) + SERIALIZABLEXACT *t0 = conflict->sxactOut; + + if (!SxactIsRolledBack(t0) + && (!SxactIsCommitted(t0) + || t0->commitSeqNo >= writer->commitSeqNo) + && (!SxactIsReadOnly(t0) + || t0->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo)) { failure = true; break; } conflict = (RWConflict) - SHMQueueNext(&writer->outConflicts, - &conflict->outLink, - offsetof(RWConflictData, outLink)); + SHMQueueNext(&reader->inConflicts, + &conflict->inLink, + offsetof(RWConflictData, inLink)); } } if (failure) { + /* + * We have to kill a transaction to avoid a possible anomaly from + * occurring. If the writer is us, we can just ereport() to cause + * a transaction abort. Otherwise we flag the writer for termination, + * causing it to abort when it tries to commit. However, if the writer + * is a prepared transaction, already prepared, we can't abort it + * anymore, so we have to kill the reader instead. 
+ */ if (MySerializableXact == writer) { LWLockRelease(SerializableXactHashLock); ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to read/write dependencies among transactions"), - errdetail("Cancelled on identification as pivot, during write."), + errdetail("Cancelled on identification as a pivot, during write."), errhint("The transaction might succeed if retried."))); } else if (SxactIsPrepared(writer)) { LWLockRelease(SerializableXactHashLock); + + /* if we're not the writer, we have to be the reader */ + Assert(MySerializableXact == reader); ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to read/write dependencies among transactions"), diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index ee82d4616c..702b9e3e9f 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1067,13 +1067,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp, if (xmlStrncmp(p, (xmlChar *) "'yes'", 5) == 0 || xmlStrncmp(p, (xmlChar *) "\"yes\"", 5) == 0) { - *standalone = 1; + if (standalone) + *standalone = 1; p += 5; } else if (xmlStrncmp(p, (xmlChar *) "'no'", 4) == 0 || xmlStrncmp(p, (xmlChar *) "\"no\"", 4) == 0) { - *standalone = 0; + if (standalone) + *standalone = 0; p += 4; } else @@ -1218,8 +1220,8 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, { int res_code; size_t count; - xmlChar *version = NULL; - int standalone = -1; + xmlChar *version; + int standalone; res_code = parse_xml_decl(utf8string, &count, &version, NULL, &standalone); diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 1f31fe0694..17cff8dd5b 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -126,7 +126,8 @@ usage(void) printf(_(" -D, --pgdata=DIRECTORY receive base backup into directory\n")); printf(_(" -F, --format=p|t output format (plain, tar)\n")); printf(_(" -x, --xlog include required WAL files in backup\n")); - printf(_(" -Z, --compress=0-9 compress tar output\n")); + printf(_(" -z, --gzip compress tar output\n")); + printf(_(" -Z, --compress=0-9 compress tar output with given compression level\n")); printf(_("\nGeneral options:\n")); printf(_(" -c, --checkpoint=fast|spread\n" " set fast or spread checkpointing\n")); @@ -261,7 +262,22 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) * Base tablespaces */ if (strcmp(basedir, "-") == 0) - tarfile = stdout; + { +#ifdef HAVE_LIBZ + if (compresslevel > 0) + { + ztarfile = gzdopen(dup(fileno(stdout)), "wb"); + if (gzsetparams(ztarfile, compresslevel, Z_DEFAULT_STRATEGY) != Z_OK) + { + fprintf(stderr, _("%s: could not set compression level %i: %s\n"), + progname, compresslevel, get_gz_error(ztarfile)); + disconnect_and_exit(1); + } + } + else +#endif + tarfile = stdout; + } else { #ifdef HAVE_LIBZ @@ -384,7 +400,14 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) } } - if (strcmp(basedir, "-") != 0) + if (strcmp(basedir, "-") == 0) + { +#ifdef HAVE_LIBZ + if (ztarfile) + gzclose(ztarfile); +#endif + } + else { #ifdef HAVE_LIBZ if (ztarfile != NULL) @@ -882,7 +905,7 @@ BaseBackup(void) } if (PQntuples(res) != 1) { - fprintf(stderr, _("%s: no end point returned from server\n"), + fprintf(stderr, _("%s: no WAL end position returned from server\n"), progname); disconnect_and_exit(1); } @@ -919,6 +942,7 @@ main(int argc, char **argv) {"format", required_argument, NULL, 'F'}, {"checkpoint", 
required_argument, NULL, 'c'}, {"xlog", no_argument, NULL, 'x'}, + {"gzip", no_argument, NULL, 'z'}, {"compress", required_argument, NULL, 'Z'}, {"label", required_argument, NULL, 'l'}, {"host", required_argument, NULL, 'h'}, @@ -978,6 +1002,13 @@ main(int argc, char **argv) case 'l': label = xstrdup(optarg); break; + case 'z': +#ifdef HAVE_LIBZ + compresslevel = Z_DEFAULT_COMPRESSION; +#else + compresslevel = 1; /* will be rejected below */ +#endif + break; case 'Z': compresslevel = atoi(optarg); if (compresslevel <= 0 || compresslevel > 9) @@ -1076,14 +1107,6 @@ main(int argc, char **argv) progname); exit(1); } -#else - if (compresslevel > 0 && strcmp(basedir, "-") == 0) - { - fprintf(stderr, - _("%s: compression is not supported on standard output\n"), - progname); - exit(1); - } #endif /* diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c index 66d2419725..8540a75d11 100644 --- a/src/bin/pg_dump/compress_io.c +++ b/src/bin/pg_dump/compress_io.c @@ -535,6 +535,7 @@ cfopen_write(const char *path, const char *mode, int compression) free(fname); #else die_horribly(NULL, modulename, "not built with zlib support\n"); + fp = NULL; /* keep compiler quiet */ #endif } return fp; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index f925be1ffa..3781e599c4 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -11180,6 +11180,14 @@ dumpForeignDataWrapper(Archive *fout, FdwInfo *fdwinfo) if (!fdwinfo->dobj.dump || dataOnly) return; + /* + * FDWs that belong to an extension are dumped based on their "dump" field. + * Otherwise omit them if we are only dumping some specific object. + */ + if (!fdwinfo->dobj.ext_member) + if (!include_everything) + return; + q = createPQExpBuffer(); delq = createPQExpBuffer(); labelq = createPQExpBuffer(); @@ -11255,7 +11263,7 @@ dumpForeignServer(Archive *fout, ForeignServerInfo *srvinfo) char *fdwname; /* Skip if not to be dumped */ - if (!srvinfo->dobj.dump || dataOnly) + if (!srvinfo->dobj.dump || dataOnly || !include_everything) return; q = createPQExpBuffer(); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 79c9f5d90f..cfbe0c4392 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -142,6 +142,10 @@ extern void vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast, extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel); extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode); +extern double vac_estimate_reltuples(Relation relation, bool is_analyze, + BlockNumber total_pages, + BlockNumber scanned_pages, + double scanned_tuples); extern void vac_update_relstats(Relation relation, BlockNumber num_pages, double num_tuples, @@ -157,10 +161,10 @@ extern void vacuum_delay_point(void); /* in commands/vacuumlazy.c */ extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, - BufferAccessStrategy bstrategy, bool *scanned_all); + BufferAccessStrategy bstrategy); /* in commands/analyze.c */ extern void analyze_rel(Oid relid, VacuumStmt *vacstmt, - BufferAccessStrategy bstrategy, bool update_reltuples); + BufferAccessStrategy bstrategy); #endif /* VACUUM_H */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index f04be95b45..5446fa0440 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -322,7 +322,6 @@ typedef struct PgStat_MsgVacuum PgStat_MsgHdr m_hdr; Oid m_databaseid; Oid m_tableoid; - bool m_adopt_counts; bool m_autovacuum; TimestampTz m_vacuumtime; 
PgStat_Counter m_tuples; @@ -339,7 +338,6 @@ typedef struct PgStat_MsgAnalyze PgStat_MsgHdr m_hdr; Oid m_databaseid; Oid m_tableoid; - bool m_adopt_counts; bool m_autovacuum; TimestampTz m_analyzetime; PgStat_Counter m_live_tuples; @@ -706,9 +704,9 @@ extern void pgstat_reset_shared_counters(const char *); extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type type); extern void pgstat_report_autovac(Oid dboid); -extern void pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts, +extern void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter tuples); -extern void pgstat_report_analyze(Relation rel, bool adopt_counts, +extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples); extern void pgstat_report_recovery_conflict(int reason); diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index 9a26ecf2d3..77ae8f904d 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -47,7 +47,6 @@ extern void RegisterPredicateLockingXid(const TransactionId xid); extern void PredicateLockRelation(const Relation relation); extern void PredicateLockPage(const Relation relation, const BlockNumber blkno); extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple); -extern void PredicateLockTupleRowVersionLink(const Relation relation, const HeapTuple oldTuple, const HeapTuple newTuple); extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno); extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno); extern void ReleasePredicateLocks(const bool isCommit); diff --git a/src/interfaces/libpq/fe-auth.c b/src/interfaces/libpq/fe-auth.c index 6f1a163a10..094926b4e6 100644 --- a/src/interfaces/libpq/fe-auth.c +++ b/src/interfaces/libpq/fe-auth.c @@ -693,11 +693,12 @@ pg_local_sendauth(PGconn *conn) struct msghdr msg; #ifdef HAVE_STRUCT_CMSGCRED - /* Prevent padding */ - char cmsgmem[sizeof(struct cmsghdr) + sizeof(struct cmsgcred)]; - - /* Point to start of first structure */ - struct cmsghdr *cmsg = (struct cmsghdr *) cmsgmem; + struct cmsghdr *cmsg; + union + { + struct cmsghdr hdr; + unsigned char buf[CMSG_SPACE(sizeof(struct cmsgcred))]; + } cmsgbuf; #endif /* @@ -713,11 +714,12 @@ pg_local_sendauth(PGconn *conn) msg.msg_iovlen = 1; #ifdef HAVE_STRUCT_CMSGCRED - /* Create control header, FreeBSD */ - msg.msg_control = cmsg; - msg.msg_controllen = sizeof(cmsgmem); - memset(cmsg, 0, sizeof(cmsgmem)); - cmsg->cmsg_len = sizeof(cmsgmem); + /* FreeBSD needs us to set up a message that will be filled in by kernel */ + memset(&cmsgbuf, 0, sizeof(cmsgbuf)); + msg.msg_control = &cmsgbuf.buf; + msg.msg_controllen = sizeof(cmsgbuf.buf); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(struct cmsgcred)); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_CREDS; #endif diff --git a/src/pl/plperl/plperl.c b/src/pl/plperl/plperl.c index d69d2327bb..f2e8ad2207 100644 --- a/src/pl/plperl/plperl.c +++ b/src/pl/plperl/plperl.c @@ -1357,7 +1357,13 @@ make_array_ref(plperl_array_info *info, int first, int last) for (i = first; i < last; i++) { if (info->nulls[i]) - av_push(result, &PL_sv_undef); + { + /* + * We can't use &PL_sv_undef here. See "AVs, HVs and undefined + * values" in perlguts. 
+ */ + av_push(result, newSV(0)); + } else { Datum itemvalue = info->elements[i]; @@ -2639,8 +2645,12 @@ plperl_hash_from_tuple(HeapTuple tuple, TupleDesc tupdesc) if (isnull) { - /* Store (attname => undef) and move on. */ - hv_store_string(hv, attname, &PL_sv_undef); + /* + * Store (attname => undef) and move on. Note we can't use + * &PL_sv_undef here; see "AVs, HVs and undefined values" in + * perlguts for an explanation. + */ + hv_store_string(hv, attname, newSV(0)); continue; } diff --git a/src/test/isolation/expected/multiple-row-versions.out b/src/test/isolation/expected/multiple-row-versions.out index cd31029d17..bbd3ecc0f7 100644 --- a/src/test/isolation/expected/multiple-row-versions.out +++ b/src/test/isolation/expected/multiple-row-versions.out @@ -19,6 +19,6 @@ id txt 1 step c4: COMMIT; step c3: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions step wz1: UPDATE t SET txt = 'a' WHERE id = 1; +ERROR: could not serialize access due to read/write dependencies among transactions step c1: COMMIT; diff --git a/src/test/isolation/specs/multiple-row-versions.spec b/src/test/isolation/specs/multiple-row-versions.spec index 8cfe3a44dc..1bb5b4e8ba 100644 --- a/src/test/isolation/specs/multiple-row-versions.spec +++ b/src/test/isolation/specs/multiple-row-versions.spec @@ -1,8 +1,7 @@ # Multiple Row Versions test # -# This test is designed to ensure that predicate locks taken on one version -# of a row are detected as conflicts when a later version of the row is -# updated or deleted by a transaction concurrent to the reader. +# This test is designed to cover some code paths which only occur with +# four or more transactions interacting with particular timings. # # Due to long permutation setup time, we are only testing one specific # permutation, which should get a serialization error. |
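
A note on the reltuples change in src/backend/commands/vacuum.c above: the new vac_estimate_reltuples() is an exponential-moving-average update of tuple density. The sketch below is a standalone illustration of that arithmetic, not the PostgreSQL function itself; it omits the Relation access and the zero-page corner cases the patch handles, and the sample numbers are made up. The small driver loop shows the convergence behaviour the comment in the patch argues for: repeated samples at the same true density pull reltuples toward the correct value.

```c
#include <math.h>
#include <stdio.h>

/*
 * Standalone sketch of the moving-average estimate (simplified; the
 * corner cases for zero scanned/old pages are omitted here).
 */
static double
estimate_reltuples(double old_rel_pages, double old_rel_tuples,
				   double total_pages, double scanned_pages,
				   double scanned_tuples)
{
	double		old_density = old_rel_tuples / old_rel_pages;
	double		new_density = scanned_tuples / scanned_pages;
	double		multiplier = scanned_pages / total_pages;
	double		updated_density;

	/* move the old density toward the new one by the fraction scanned */
	updated_density = old_density + (new_density - old_density) * multiplier;
	return floor(updated_density * total_pages + 0.5);
}

int
main(void)
{
	double		reltuples = 1000000.0;	/* stale pg_class value */
	double		total_pages = 10000.0;	/* relpages; held constant here */
	int			i;

	/*
	 * Repeated ANALYZE-style samples of 300 pages at a true density of
	 * 50 tuples/page: reltuples drifts toward 500000 over the cycles.
	 */
	for (i = 0; i < 10; i++)
	{
		reltuples = estimate_reltuples(total_pages, reltuples,
									   total_pages, 300.0, 300.0 * 50.0);
		printf("after sample %d: reltuples = %.0f\n", i + 1, reltuples);
	}
	return 0;
}
```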
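The pg_basebackup hunks above route tar output through zlib even when the destination is standard output, by wrapping a duplicated stdout descriptor with gzdopen() and applying the requested level with gzsetparams(). The fragment below is a minimal, self-contained sketch of that pattern under the same zlib API; the payload and error messages are illustrative, and it is not the pg_basebackup code.

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <zlib.h>

int
main(void)
{
	/* Wrap a duplicate of stdout so gzclose() doesn't close the real fd. */
	gzFile		out = gzdopen(dup(fileno(stdout)), "wb");
	const char *payload = "pretend this is a tar stream\n";

	if (out == NULL)
	{
		fprintf(stderr, "gzdopen failed\n");
		return 1;
	}

	/* Equivalent of an explicit -Z 6: pick a specific compression level. */
	if (gzsetparams(out, 6, Z_DEFAULT_STRATEGY) != Z_OK)
	{
		fprintf(stderr, "could not set compression level\n");
		return 1;
	}

	if (gzwrite(out, payload, (unsigned) strlen(payload)) == 0)
	{
		fprintf(stderr, "gzwrite failed\n");
		return 1;
	}

	gzclose(out);
	return 0;
}
```

Piping the program's output through zcat recovers the original bytes, which mirrors how the patch lets a compressed tar stream go to stdout instead of rejecting that combination.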
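The auth.c and fe-auth.c hunks replace hand-sized credential buffers with a union embedding a struct cmsghdr next to a CMSG_SPACE()-sized byte array, addressed through CMSG_FIRSTHDR()/CMSG_LEN()/CMSG_DATA(). The sketch below shows that buffer layout only; it uses portable SCM_RIGHTS descriptor passing as a stand-in for the BSD-specific SCM_CREDS message in the patch, and the function name is hypothetical.

```c
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

/*
 * Control-message layout sketch: the union guarantees cmsghdr alignment,
 * CMSG_SPACE() sizes the buffer, and the CMSG_* macros replace
 * hand-computed offsets.  SCM_RIGHTS stands in for SCM_CREDS here.
 */
int
send_one_fd(int sock, int fd_to_send)
{
	struct msghdr msg;
	struct iovec iov;
	struct cmsghdr *cmsg;
	char		dummy = 'x';
	union
	{
		struct cmsghdr hdr;
		unsigned char buf[CMSG_SPACE(sizeof(int))];
	}			cmsgbuf;

	/* one dummy data byte so the peer's recvmsg() has something to read */
	iov.iov_base = &dummy;
	iov.iov_len = 1;

	memset(&cmsgbuf, 0, sizeof(cmsgbuf));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));

	return (sendmsg(sock, &msg, 0) < 0) ? -1 : 0;
}
```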