author		Bernd Helmle	2011-05-31 16:31:49 +0000
committer	Bernd Helmle	2011-05-31 16:31:49 +0000
commit		554cb4427a60fcaf0f6d007f5072a08d985fe779 (patch)
tree		c39487bbadc5fb298d82c014e423254597655a39
parent		b9b93512ec3c880d9081b05791e19244798ffe94 (diff)
parent		13c00ae8c73ee9635c11059925814b351dc3593c (diff)
Merge branch 'master' of ../bernd_pg into notnull_constraint
-rw-r--r--	doc/src/sgml/config.sgml	7
-rw-r--r--	doc/src/sgml/install-windows.sgml	7
-rw-r--r--	doc/src/sgml/ref/pg_basebackup.sgml	23
-rw-r--r--	src/backend/access/heap/heapam.c	15
-rw-r--r--	src/backend/access/index/indexam.c	3
-rw-r--r--	src/backend/commands/analyze.c	59
-rw-r--r--	src/backend/commands/vacuum.c	98
-rw-r--r--	src/backend/commands/vacuumlazy.c	169
-rw-r--r--	src/backend/libpq/auth.c	60
-rw-r--r--	src/backend/libpq/hba.c	9
-rw-r--r--	src/backend/libpq/pg_hba.conf.sample	2
-rw-r--r--	src/backend/postmaster/pgstat.c	17
-rw-r--r--	src/backend/storage/lmgr/README-SSI	48
-rw-r--r--	src/backend/storage/lmgr/predicate.c	226
-rw-r--r--	src/backend/utils/adt/xml.c	10
-rw-r--r--	src/bin/pg_basebackup/pg_basebackup.c	47
-rw-r--r--	src/bin/pg_dump/compress_io.c	1
-rw-r--r--	src/bin/pg_dump/pg_dump.c	10
-rw-r--r--	src/include/commands/vacuum.h	8
-rw-r--r--	src/include/pgstat.h	6
-rw-r--r--	src/include/storage/predicate.h	1
-rw-r--r--	src/interfaces/libpq/fe-auth.c	22
-rw-r--r--	src/pl/plperl/plperl.c	16
-rw-r--r--	src/test/isolation/expected/multiple-row-versions.out	2
-rw-r--r--	src/test/isolation/specs/multiple-row-versions.spec	5
25 files changed, 524 insertions, 347 deletions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e367c29bd5..39819695d1 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3527,8 +3527,8 @@ local0.* /var/log/postgresql
<para>
Causes each attempted connection to the server to be logged,
as well as successful completion of client authentication.
- This parameter can only be set in the <filename>postgresql.conf</>
- file or on the server command line. The default is off.
+ This parameter cannot be changed after session start.
+ The default is off.
</para>
<note>
@@ -3553,8 +3553,7 @@ local0.* /var/log/postgresql
<varname>log_connections</varname> but at session termination,
and includes the duration of the session. This is off by
default.
- This parameter can only be set in the <filename>postgresql.conf</>
- file or on the server command line.
+ This parameter cannot be changed after session start.
</para>
</listitem>
</varlistentry>
diff --git a/doc/src/sgml/install-windows.sgml b/doc/src/sgml/install-windows.sgml
index 3c9d90ef33..cb8bca9c63 100644
--- a/doc/src/sgml/install-windows.sgml
+++ b/doc/src/sgml/install-windows.sgml
@@ -150,9 +150,10 @@ $ENV{PATH}=$ENV{PATH} . ';c:\some\where\bison\bin';
<varlistentry>
<term><productname>Microsoft Platform SDK</productname></term>
<listitem><para>
- It is recommended that you upgrade to the latest available version
- of the <productname>Microsoft Platform SDK</productname>, available
- for download from <ulink url="http://www.microsoft.com/downloads/"></>.
+ It is recommended that you upgrade to the latest supported version
+ of the <productname>Microsoft Platform SDK</productname> (currently
+ version 7.0), available for download from
+ <ulink url="http://www.microsoft.com/downloads/"></>.
</para>
<para>
You must always include the
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index 8a7b833f0f..47dce43b19 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -169,13 +169,26 @@ PostgreSQL documentation
</varlistentry>
<varlistentry>
+ <term><option>-z</option></term>
+ <term><option>--gzip</option></term>
+ <listitem>
+ <para>
+ Enables gzip compression of tar file output, with the default
+ compression level. Compression is only available when using
+ the tar format.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><option>-Z <replaceable class="parameter">level</replaceable></option></term>
<term><option>--compress=<replaceable class="parameter">level</replaceable></option></term>
<listitem>
<para>
- Enables gzip compression of tar file output. Compression is only
- available when generating tar files, and is not available when sending
- output to standard output.
+ Enables gzip compression of tar file output, and specifies the
+ compression level (1 through 9, 9 being best
+ compression). Compression is only available when using the tar
+ format.
</para>
</listitem>
</varlistentry>
@@ -394,11 +407,11 @@ PostgreSQL documentation
</para>
<para>
- To create a backup of the local server with one maximum compressed
+ To create a backup of the local server with one compressed
tar file for each tablespace, and store it in the directory
<filename>backup</filename>, showing a progress report while running:
<screen>
-<prompt>$</prompt> <userinput>pg_basebackup -D backup -Ft -Z9 -P</userinput>
+<prompt>$</prompt> <userinput>pg_basebackup -D backup -Ft -z -P</userinput>
</screen>
</para>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 346d6b964d..01a492e496 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1529,7 +1529,6 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
OffsetNumber offnum;
bool at_chain_start;
bool valid;
- bool match_found;
if (all_dead)
*all_dead = true;
@@ -1539,7 +1538,6 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
offnum = ItemPointerGetOffsetNumber(tid);
at_chain_start = true;
- match_found = false;
/* Scan through possible multiple members of HOT-chain */
for (;;)
@@ -1597,10 +1595,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
PredicateLockTuple(relation, &heapTuple);
if (all_dead)
*all_dead = false;
- if (IsolationIsSerializable())
- match_found = true;
- else
- return true;
+ return true;
}
/*
@@ -1629,7 +1624,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
break; /* end of chain */
}
- return match_found;
+ return false;
}
/*
@@ -2855,12 +2850,6 @@ l2:
END_CRIT_SECTION();
- /*
- * Any existing SIREAD locks on the old tuple must be linked to the new
- * tuple for conflict detection purposes.
- */
- PredicateLockTupleRowVersionLink(relation, &oldtup, heaptup);
-
if (newbuf != buffer)
LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 08de8b4f88..27c37d6173 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -612,8 +612,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
- if (IsMVCCSnapshot(scan->xs_snapshot)
- && !IsolationIsSerializable())
+ if (IsMVCCSnapshot(scan->xs_snapshot))
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 0568a1bcf8..fa84989fc6 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -84,8 +84,7 @@ static MemoryContext anl_context = NULL;
static BufferAccessStrategy vac_strategy;
-static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
- bool update_reltuples, bool inh);
+static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh);
static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
int samplesize);
static bool BlockSampler_HasMore(BlockSampler bs);
@@ -115,18 +114,9 @@ static bool std_typanalyze(VacAttrStats *stats);
/*
* analyze_rel() -- analyze one relation
- *
- * If update_reltuples is true, we update reltuples and relpages columns
- * in pg_class. Caller should pass false if we're part of VACUUM ANALYZE,
- * and the VACUUM didn't skip any pages. We only have an approximate count,
- * so we don't want to overwrite the accurate values already inserted by the
- * VACUUM in that case. VACUUM always scans all indexes, however, so the
- * pg_class entries for indexes are never updated if we're part of VACUUM
- * ANALYZE.
*/
void
-analyze_rel(Oid relid, VacuumStmt *vacstmt,
- BufferAccessStrategy bstrategy, bool update_reltuples)
+analyze_rel(Oid relid, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy)
{
Relation onerel;
@@ -238,13 +228,13 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt,
/*
* Do the normal non-recursive ANALYZE.
*/
- do_analyze_rel(onerel, vacstmt, update_reltuples, false);
+ do_analyze_rel(onerel, vacstmt, false);
/*
* If there are child tables, do recursive ANALYZE.
*/
if (onerel->rd_rel->relhassubclass)
- do_analyze_rel(onerel, vacstmt, false, true);
+ do_analyze_rel(onerel, vacstmt, true);
/*
* Close source relation now, but keep lock so that no one deletes it
@@ -267,8 +257,7 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt,
* do_analyze_rel() -- analyze one relation, recursively or not
*/
static void
-do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
- bool update_reltuples, bool inh)
+do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh)
{
int attr_cnt,
tcnt,
@@ -437,9 +426,9 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
}
/*
- * Quit if no analyzable columns and no pg_class update needed.
+ * Quit if no analyzable columns.
*/
- if (attr_cnt <= 0 && !analyzableindex && !update_reltuples)
+ if (attr_cnt <= 0 && !analyzableindex)
goto cleanup;
/*
@@ -549,10 +538,10 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
}
/*
- * Update pages/tuples stats in pg_class, but not if we're inside a VACUUM
- * that got a more precise number.
+ * Update pages/tuples stats in pg_class ... but not if we're doing
+ * inherited stats.
*/
- if (update_reltuples)
+ if (!inh)
vac_update_relstats(onerel,
RelationGetNumberOfBlocks(onerel),
totalrows, hasindex, InvalidTransactionId);
@@ -562,7 +551,7 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
* VACUUM ANALYZE, don't overwrite the accurate count already inserted by
* VACUUM.
*/
- if (!(vacstmt->options & VACOPT_VACUUM))
+ if (!inh && !(vacstmt->options & VACOPT_VACUUM))
{
for (ind = 0; ind < nindexes; ind++)
{
@@ -577,13 +566,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
}
/*
- * Report ANALYZE to the stats collector, too; likewise, tell it to adopt
- * these numbers only if we're not inside a VACUUM that got a better
- * number. However, a call with inh = true shouldn't reset the stats.
+ * Report ANALYZE to the stats collector, too. However, if doing
+ * inherited stats we shouldn't report, because the stats collector only
+ * tracks per-table stats.
*/
if (!inh)
- pgstat_report_analyze(onerel, update_reltuples,
- totalrows, totaldeadrows);
+ pgstat_report_analyze(onerel, totalrows, totaldeadrows);
/* We skip to here if there were no analyzable columns */
cleanup:
@@ -1243,18 +1231,19 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
/*
- * Estimate total numbers of rows in relation.
+ * Estimate total numbers of rows in relation. For live rows, use
+ * vac_estimate_reltuples; for dead rows, we have no source of old
+ * information, so we have to assume the density is the same in unseen
+ * pages as in the pages we scanned.
*/
+ *totalrows = vac_estimate_reltuples(onerel, true,
+ totalblocks,
+ bs.m,
+ liverows);
if (bs.m > 0)
- {
- *totalrows = floor((liverows * totalblocks) / bs.m + 0.5);
- *totaldeadrows = floor((deadrows * totalblocks) / bs.m + 0.5);
- }
+ *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5);
else
- {
- *totalrows = 0.0;
*totaldeadrows = 0.0;
- }
/*
* Emit some interesting relation info
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 9606569617..224c34f6e7 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -20,6 +20,8 @@
*/
#include "postgres.h"
+#include <math.h>
+
#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
@@ -62,7 +64,7 @@ static BufferAccessStrategy vac_strategy;
static List *get_rel_oids(Oid relid, const RangeVar *vacrel);
static void vac_truncate_clog(TransactionId frozenXID);
static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast,
- bool for_wraparound, bool *scanned_all);
+ bool for_wraparound);
/*
@@ -219,12 +221,10 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
foreach(cur, relations)
{
Oid relid = lfirst_oid(cur);
- bool scanned_all = false;
if (vacstmt->options & VACOPT_VACUUM)
{
- if (!vacuum_rel(relid, vacstmt, do_toast, for_wraparound,
- &scanned_all))
+ if (!vacuum_rel(relid, vacstmt, do_toast, for_wraparound))
continue;
}
@@ -241,7 +241,7 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
PushActiveSnapshot(GetTransactionSnapshot());
}
- analyze_rel(relid, vacstmt, vac_strategy, !scanned_all);
+ analyze_rel(relid, vacstmt, vac_strategy);
if (use_own_xacts)
{
@@ -454,6 +454,79 @@ vacuum_set_xid_limits(int freeze_min_age,
/*
+ * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples
+ *
+ * If we scanned the whole relation then we should just use the count of
+ * live tuples seen; but if we did not, we should not trust the count
+ * unreservedly, especially not in VACUUM, which may have scanned a quite
+ * nonrandom subset of the table. When we have only partial information,
+ * we take the old value of pg_class.reltuples as a measurement of the
+ * tuple density in the unscanned pages.
+ *
+ * This routine is shared by VACUUM and ANALYZE.
+ */
+double
+vac_estimate_reltuples(Relation relation, bool is_analyze,
+ BlockNumber total_pages,
+ BlockNumber scanned_pages,
+ double scanned_tuples)
+{
+ BlockNumber old_rel_pages = relation->rd_rel->relpages;
+ double old_rel_tuples = relation->rd_rel->reltuples;
+ double old_density;
+ double new_density;
+ double multiplier;
+ double updated_density;
+
+ /* If we did scan the whole table, just use the count as-is */
+ if (scanned_pages >= total_pages)
+ return scanned_tuples;
+
+ /*
+ * If scanned_pages is zero but total_pages isn't, keep the existing
+ * value of reltuples.
+ */
+ if (scanned_pages == 0)
+ return old_rel_tuples;
+
+ /*
+ * If old value of relpages is zero, old density is indeterminate; we
+ * can't do much except scale up scanned_tuples to match total_pages.
+ */
+ if (old_rel_pages == 0)
+ return floor((scanned_tuples / scanned_pages) * total_pages + 0.5);
+
+ /*
+ * Okay, we've covered the corner cases. The normal calculation is to
+ * convert the old measurement to a density (tuples per page), then
+ * update the density using an exponential-moving-average approach,
+ * and finally compute reltuples as updated_density * total_pages.
+ *
+ * For ANALYZE, the moving average multiplier is just the fraction of
+ * the table's pages we scanned. This is equivalent to assuming
+ * that the tuple density in the unscanned pages didn't change. Of
+ * course, it probably did, if the new density measurement is different.
+ * But over repeated cycles, the value of reltuples will converge towards
+ * the correct value, if repeated measurements show the same new density.
+ *
+ * For VACUUM, the situation is a bit different: we have looked at a
+ * nonrandom sample of pages, but we know for certain that the pages we
+ * didn't look at are precisely the ones that haven't changed lately.
+ * Thus, there is a reasonable argument for doing exactly the same thing
+ * as for the ANALYZE case, that is use the old density measurement as
+ * the value for the unscanned pages.
+ *
+ * This logic could probably use further refinement.
+ */
+ old_density = old_rel_tuples / old_rel_pages;
+ new_density = scanned_tuples / scanned_pages;
+ multiplier = (double) scanned_pages / (double) total_pages;
+ updated_density = old_density + (new_density - old_density) * multiplier;
+ return floor(updated_density * total_pages + 0.5);
+}
+
+
+/*
* vac_update_relstats() -- update statistics for one relation
*
* Update the whole-relation statistics that are kept in its pg_class
@@ -480,7 +553,7 @@ vacuum_set_xid_limits(int freeze_min_age,
* somebody vacuuming pg_class might think they could delete a tuple
* marked with xmin = our xid.
*
- * This routine is shared by VACUUM and stand-alone ANALYZE.
+ * This routine is shared by VACUUM and ANALYZE.
*/
void
vac_update_relstats(Relation relation,
@@ -758,14 +831,10 @@ vac_truncate_clog(TransactionId frozenXID)
* many small transactions. Otherwise, two-phase locking would require
* us to lock the entire database during one pass of the vacuum cleaner.
*
- * We'll return true in *scanned_all if the vacuum scanned all heap
- * pages, and updated pg_class.
- *
* At entry and exit, we are not inside a transaction.
*/
static bool
-vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
- bool *scanned_all)
+vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound)
{
LOCKMODE lmode;
Relation onerel;
@@ -775,9 +844,6 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
int save_sec_context;
int save_nestlevel;
- if (scanned_all)
- *scanned_all = false;
-
/* Begin a transaction for vacuuming this relation */
StartTransactionCommand();
@@ -971,7 +1037,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
vacstmt->freeze_min_age, vacstmt->freeze_table_age);
}
else
- lazy_vacuum_rel(onerel, vacstmt, vac_strategy, scanned_all);
+ lazy_vacuum_rel(onerel, vacstmt, vac_strategy);
/* Roll back any GUC changes executed by index functions */
AtEOXact_GUC(false, save_nestlevel);
@@ -997,7 +1063,7 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
* totally unimportant for toast relations.
*/
if (toast_relid != InvalidOid)
- vacuum_rel(toast_relid, vacstmt, false, for_wraparound, NULL);
+ vacuum_rel(toast_relid, vacstmt, false, for_wraparound);
/*
* Now release the session-level lock on the master table.
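The new vac_estimate_reltuples() above interpolates between the old tuple density recorded in pg_class and the density observed on the scanned pages, weighting the new measurement by the fraction of the table that was scanned. The following standalone sketch reproduces that calculation outside the server; all of the sample numbers (1000 old pages, 100000 old tuples, 300 of 1200 pages scanned, 36000 live tuples seen) are invented for illustration.

/*
 * Standalone sketch of the moving-average estimate used by
 * vac_estimate_reltuples(); the numbers are made up for illustration.
 */
#include <math.h>
#include <stdio.h>

int
main(void)
{
    double old_rel_pages = 1000.0;      /* pg_class.relpages before the scan */
    double old_rel_tuples = 100000.0;   /* pg_class.reltuples before the scan */
    double total_pages = 1200.0;        /* current relation size in pages */
    double scanned_pages = 300.0;       /* pages actually visited */
    double scanned_tuples = 36000.0;    /* live tuples seen on those pages */

    double old_density = old_rel_tuples / old_rel_pages;    /* 100 tuples/page */
    double new_density = scanned_tuples / scanned_pages;    /* 120 tuples/page */
    double multiplier = scanned_pages / total_pages;        /* 0.25 */
    double updated_density =
        old_density + (new_density - old_density) * multiplier;    /* 105 */

    printf("new reltuples estimate: %.0f\n",
           floor(updated_density * total_pages + 0.5));     /* 126000 */
    return 0;
}

With those numbers the density estimate moves from 100 to 105 tuples per page, giving a new reltuples estimate of 126000 for the 1200-page table.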
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 9393fa0727..ce5fa18066 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -77,17 +77,18 @@
* Before we consider skipping a page that's marked as clean in
* visibility map, we must've seen at least this many clean pages.
*/
-#define SKIP_PAGES_THRESHOLD 32
+#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
typedef struct LVRelStats
{
/* hasindex = true means two-pass strategy; false means one-pass */
bool hasindex;
- bool scanned_all; /* have we scanned all pages (this far)? */
/* Overall statistics about rel */
- BlockNumber rel_pages;
+ BlockNumber rel_pages; /* total number of pages */
+ BlockNumber scanned_pages; /* number of pages we examined */
+ double scanned_tuples; /* counts only tuples on scanned pages */
double old_rel_tuples; /* previous value of pg_class.reltuples */
- double rel_tuples; /* counts only tuples on scanned pages */
+ double new_rel_tuples; /* new estimated total # of tuples */
BlockNumber pages_removed;
double tuples_deleted;
BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
@@ -143,7 +144,7 @@ static int vac_cmp_itemptr(const void *left, const void *right);
*/
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
- BufferAccessStrategy bstrategy, bool *scanned_all)
+ BufferAccessStrategy bstrategy)
{
LVRelStats *vacrelstats;
Relation *Irel;
@@ -175,7 +176,6 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
- vacrelstats->scanned_all = true; /* will be cleared if we skip a page */
vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
vacrelstats->num_index_scans = 0;
@@ -205,24 +205,20 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
FreeSpaceMapVacuum(onerel);
/*
- * Update statistics in pg_class. But only if we didn't skip any pages;
- * the tuple count only includes tuples from the pages we've visited, and
- * we haven't frozen tuples in unvisited pages either. The page count is
- * accurate in any case, but because we use the reltuples / relpages ratio
- * in the planner, it's better to not update relpages either if we can't
- * update reltuples.
+ * Update statistics in pg_class. But don't change relfrozenxid if we
+ * skipped any pages.
*/
- if (vacrelstats->scanned_all)
- vac_update_relstats(onerel,
- vacrelstats->rel_pages, vacrelstats->rel_tuples,
- vacrelstats->hasindex,
- FreezeLimit);
+ vac_update_relstats(onerel,
+ vacrelstats->rel_pages, vacrelstats->new_rel_tuples,
+ vacrelstats->hasindex,
+ (vacrelstats->scanned_pages < vacrelstats->rel_pages) ?
+ InvalidTransactionId :
+ FreezeLimit);
/* report results to the stats collector, too */
pgstat_report_vacuum(RelationGetRelid(onerel),
onerel->rd_rel->relisshared,
- vacrelstats->scanned_all,
- vacrelstats->rel_tuples);
+ vacrelstats->new_rel_tuples);
/* and log the action if appropriate */
if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
@@ -239,13 +235,12 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
get_namespace_name(RelationGetNamespace(onerel)),
RelationGetRelationName(onerel),
vacrelstats->num_index_scans,
- vacrelstats->pages_removed, vacrelstats->rel_pages,
- vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
+ vacrelstats->pages_removed,
+ vacrelstats->rel_pages,
+ vacrelstats->tuples_deleted,
+ vacrelstats->new_rel_tuples,
pg_rusage_show(&ru0))));
}
-
- if (scanned_all)
- *scanned_all = vacrelstats->scanned_all;
}
/*
@@ -301,7 +296,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
HeapTupleData tuple;
char *relname;
BlockNumber empty_pages,
- scanned_pages,
vacuumed_pages;
double num_tuples,
tups_vacuumed,
@@ -311,7 +305,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
int i;
PGRUsage ru0;
Buffer vmbuffer = InvalidBuffer;
- BlockNumber all_visible_streak;
+ BlockNumber next_not_all_visible_block;
+ bool skipping_all_visible_blocks;
pg_rusage_init(&ru0);
@@ -321,7 +316,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
get_namespace_name(RelationGetNamespace(onerel)),
relname)));
- empty_pages = vacuumed_pages = scanned_pages = 0;
+ empty_pages = vacuumed_pages = 0;
num_tuples = tups_vacuumed = nkeep = nunused = 0;
indstats = (IndexBulkDeleteResult **)
@@ -329,12 +324,47 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
nblocks = RelationGetNumberOfBlocks(onerel);
vacrelstats->rel_pages = nblocks;
+ vacrelstats->scanned_pages = 0;
vacrelstats->nonempty_pages = 0;
vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
- all_visible_streak = 0;
+ /*
+ * We want to skip pages that don't require vacuuming according to the
+ * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
+ * consecutive pages. Since we're reading sequentially, the OS should be
+ * doing readahead for us, so there's no gain in skipping a page now and
+ * then; that's likely to disable readahead and so be counterproductive.
+ * Also, skipping even a single page means that we can't update
+ * relfrozenxid, so we only want to do it if we can skip a goodly number
+ * of pages.
+ *
+ * Before entering the main loop, establish the invariant that
+ * next_not_all_visible_block is the next block number >= blkno that's
+ * not all-visible according to the visibility map, or nblocks if there's
+ * no such block. Also, we set up the skipping_all_visible_blocks flag,
+ * which is needed because we need hysteresis in the decision: once we've
+ * started skipping blocks, we may as well skip everything up to the next
+ * not-all-visible block.
+ *
+ * Note: if scan_all is true, we won't actually skip any pages; but we
+ * maintain next_not_all_visible_block anyway, so as to set up the
+ * all_visible_according_to_vm flag correctly for each page.
+ */
+ for (next_not_all_visible_block = 0;
+ next_not_all_visible_block < nblocks;
+ next_not_all_visible_block++)
+ {
+ if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
+ break;
+ vacuum_delay_point();
+ }
+ if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
+ skipping_all_visible_blocks = true;
+ else
+ skipping_all_visible_blocks = false;
+
for (blkno = 0; blkno < nblocks; blkno++)
{
Buffer buf;
@@ -347,41 +377,45 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
OffsetNumber frozen[MaxOffsetNumber];
int nfrozen;
Size freespace;
- bool all_visible_according_to_vm = false;
+ bool all_visible_according_to_vm;
bool all_visible;
bool has_dead_tuples;
- /*
- * Skip pages that don't require vacuuming according to the visibility
- * map. But only if we've seen a streak of at least
- * SKIP_PAGES_THRESHOLD pages marked as clean. Since we're reading
- * sequentially, the OS should be doing readahead for us and there's
- * no gain in skipping a page now and then. You need a longer run of
- * consecutive skipped pages before it's worthwhile. Also, skipping
- * even a single page means that we can't update relfrozenxid or
- * reltuples, so we only want to do it if there's a good chance to
- * skip a goodly number of pages.
- */
- if (!scan_all)
+ if (blkno == next_not_all_visible_block)
{
- all_visible_according_to_vm =
- visibilitymap_test(onerel, blkno, &vmbuffer);
- if (all_visible_according_to_vm)
+ /* Time to advance next_not_all_visible_block */
+ for (next_not_all_visible_block++;
+ next_not_all_visible_block < nblocks;
+ next_not_all_visible_block++)
{
- all_visible_streak++;
- if (all_visible_streak >= SKIP_PAGES_THRESHOLD)
- {
- vacrelstats->scanned_all = false;
- continue;
- }
+ if (!visibilitymap_test(onerel, next_not_all_visible_block,
+ &vmbuffer))
+ break;
+ vacuum_delay_point();
}
+
+ /*
+ * We know we can't skip the current block. But set up
+ * skipping_all_visible_blocks to do the right thing at the
+ * following blocks.
+ */
+ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
+ skipping_all_visible_blocks = true;
else
- all_visible_streak = 0;
+ skipping_all_visible_blocks = false;
+ all_visible_according_to_vm = false;
+ }
+ else
+ {
+ /* Current block is all-visible */
+ if (skipping_all_visible_blocks && !scan_all)
+ continue;
+ all_visible_according_to_vm = true;
}
vacuum_delay_point();
- scanned_pages++;
+ vacrelstats->scanned_pages++;
/*
* If we are close to overrunning the available space for dead-tuple
@@ -764,9 +798,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
}
/* save stats for use later */
- vacrelstats->rel_tuples = num_tuples;
+ vacrelstats->scanned_tuples = num_tuples;
vacrelstats->tuples_deleted = tups_vacuumed;
+ /* now we can compute the new value for pg_class.reltuples */
+ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
+ nblocks,
+ vacrelstats->scanned_pages,
+ num_tuples);
+
/* If any tuples need to be deleted, perform final vacuum cycle */
/* XXX put a threshold on min number of tuples here? */
if (vacrelstats->num_dead_tuples > 0)
@@ -805,7 +845,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
ereport(elevel,
(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
RelationGetRelationName(onerel),
- tups_vacuumed, num_tuples, scanned_pages, nblocks),
+ tups_vacuumed, num_tuples,
+ vacrelstats->scanned_pages, nblocks),
errdetail("%.0f dead row versions cannot be removed yet.\n"
"There were %.0f unused item pointers.\n"
"%u pages are entirely empty.\n"
@@ -977,10 +1018,9 @@ lazy_cleanup_index(Relation indrel,
ivinfo.index = indrel;
ivinfo.analyze_only = false;
- ivinfo.estimated_count = !vacrelstats->scanned_all;
+ ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages);
ivinfo.message_level = elevel;
- /* use rel_tuples only if we scanned all pages, else fall back */
- ivinfo.num_heap_tuples = vacrelstats->scanned_all ? vacrelstats->rel_tuples : vacrelstats->old_rel_tuples;
+ ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
ivinfo.strategy = vac_strategy;
stats = index_vacuum_cleanup(&ivinfo, stats);
@@ -1041,8 +1081,13 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
new_rel_pages = RelationGetNumberOfBlocks(onerel);
if (new_rel_pages != old_rel_pages)
{
- /* might as well use the latest news when we update pg_class stats */
- vacrelstats->rel_pages = new_rel_pages;
+ /*
+ * Note: we intentionally don't update vacrelstats->rel_pages with
+ * the new rel size here. If we did, it would amount to assuming that
+ * the new pages are empty, which is unlikely. Leaving the numbers
+ * alone amounts to assuming that the new pages have the same tuple
+ * density as existing ones, which is less unlikely.
+ */
UnlockRelation(onerel, AccessExclusiveLock);
return;
}
@@ -1076,7 +1121,11 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
*/
UnlockRelation(onerel, AccessExclusiveLock);
- /* update statistics */
+ /*
+ * Update statistics. Here, it *is* correct to adjust rel_pages without
+ * also touching reltuples, since the tuple count wasn't changed by the
+ * truncation.
+ */
vacrelstats->rel_pages = new_rel_pages;
vacrelstats->pages_removed = old_rel_pages - new_rel_pages;
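The rewritten skip logic in lazy_scan_heap() only skips all-visible pages when at least SKIP_PAGES_THRESHOLD consecutive pages can be skipped, so that a short run is still scanned and OS readahead is not defeated. The toy function below is a deliberately simplified sketch of that run-length rule: it has no visibility map, no scan_all or relfrozenxid handling, and uses an invented boolean array in place of visibilitymap_test().

/*
 * Toy illustration of the run-length rule: a block marked all-visible is
 * skipped only when it belongs to a run of at least SKIP_PAGES_THRESHOLD
 * consecutive all-visible blocks.
 */
#include <stdbool.h>
#include <stdio.h>

#define SKIP_PAGES_THRESHOLD 32

static int
count_scanned(const bool *all_visible, int nblocks)
{
    int scanned = 0;
    int blkno = 0;

    while (blkno < nblocks)
    {
        if (all_visible[blkno])
        {
            /* measure the length of this all-visible run */
            int run = 0;

            while (blkno + run < nblocks && all_visible[blkno + run])
                run++;
            if (run >= SKIP_PAGES_THRESHOLD)
            {
                blkno += run;   /* long enough: skip the whole run */
                continue;
            }
        }
        scanned++;              /* short run, or not all-visible: scan it */
        blkno++;
    }
    return scanned;
}

int
main(void)
{
    bool vm[100] = {false};
    int  i;

    /* mark blocks 10..59 (a 50-block run) as all-visible */
    for (i = 10; i < 60; i++)
        vm[i] = true;
    printf("pages scanned: %d of 100\n", count_scanned(vm, 100));   /* 50 */
    return 0;
}

Here the 50-block all-visible run clears the 32-block threshold and is skipped entirely, so only the remaining 50 blocks are scanned.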
diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c
index 618f007827..e6ab659f4b 100644
--- a/src/backend/libpq/auth.c
+++ b/src/backend/libpq/auth.c
@@ -1788,7 +1788,7 @@ auth_peer(hbaPort *port)
char ident_user[IDENT_USERNAME_MAX + 1];
#if defined(HAVE_GETPEEREID)
- /* OpenBSD style: */
+ /* OpenBSD (also Mac OS X) style: use getpeereid() */
uid_t uid;
gid_t gid;
struct passwd *pass;
@@ -1843,7 +1843,7 @@ auth_peer(hbaPort *port)
strlcpy(ident_user, pass->pw_name, IDENT_USERNAME_MAX + 1);
#elif defined(HAVE_GETPEERUCRED)
- /* Solaris > 10 */
+ /* Solaris > 10: use getpeerucred() */
uid_t uid;
struct passwd *pass;
ucred_t *ucred;
@@ -1878,9 +1878,7 @@ auth_peer(hbaPort *port)
strlcpy(ident_user, pass->pw_name, IDENT_USERNAME_MAX + 1);
#elif defined(HAVE_STRUCT_CMSGCRED) || defined(HAVE_STRUCT_FCRED) || (defined(HAVE_STRUCT_SOCKCRED) && defined(LOCAL_CREDS))
- struct msghdr msg;
-
-/* Credentials structure */
+ /* Assorted BSDen: use a credentials control message */
#if defined(HAVE_STRUCT_CMSGCRED)
typedef struct cmsgcred Cred;
@@ -1894,36 +1892,35 @@ auth_peer(hbaPort *port)
#define cruid sc_uid
#endif
- Cred *cred;
-
- /* Compute size without padding */
- char cmsgmem[ALIGN(sizeof(struct cmsghdr)) + ALIGN(sizeof(Cred))]; /* for NetBSD */
-
- /* Point to start of first structure */
- struct cmsghdr *cmsg = (struct cmsghdr *) cmsgmem;
+ struct msghdr msg;
+ struct cmsghdr *cmsg;
+ union
+ {
+ struct cmsghdr hdr;
+ unsigned char buf[CMSG_SPACE(sizeof(Cred))];
+ } cmsgbuf;
struct iovec iov;
char buf;
+ Cred *cred;
struct passwd *pw;
- memset(&msg, 0, sizeof(msg));
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- msg.msg_control = (char *) cmsg;
- msg.msg_controllen = sizeof(cmsgmem);
- memset(cmsg, 0, sizeof(cmsgmem));
-
/*
- * The one character which is received here is not meaningful; its
- * purposes is only to make sure that recvmsg() blocks long enough for the
- * other side to send its credentials.
+ * The one character that is received here is not meaningful; its purpose
+ * is only to make sure that recvmsg() blocks long enough for the other
+ * side to send its credentials.
*/
iov.iov_base = &buf;
iov.iov_len = 1;
- if (recvmsg(port->sock, &msg, 0) < 0 ||
- cmsg->cmsg_len < sizeof(cmsgmem) ||
- cmsg->cmsg_type != SCM_CREDS)
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = &cmsgbuf.buf;
+ msg.msg_controllen = sizeof(cmsgbuf.buf);
+ memset(&cmsgbuf, 0, sizeof(cmsgbuf));
+
+ if (recvmsg(port->sock, &msg, 0) < 0)
{
ereport(LOG,
(errcode_for_socket_access(),
@@ -1931,6 +1928,19 @@ auth_peer(hbaPort *port)
return STATUS_ERROR;
}
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC) ||
+ cmsg == NULL ||
+ cmsg->cmsg_len < CMSG_LEN(sizeof(Cred)) ||
+ cmsg->cmsg_level != SOL_SOCKET ||
+ cmsg->cmsg_type != SCM_CREDS)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("could not get peer credentials: incorrect control message")));
+ return STATUS_ERROR;
+ }
+
cred = (Cred *) CMSG_DATA(cmsg);
pw = getpwuid(cred->cruid);
diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c
index c17863fce5..f3a3b6e2cc 100644
--- a/src/backend/libpq/hba.c
+++ b/src/backend/libpq/hba.c
@@ -824,7 +824,16 @@ parse_hba_line(List *line, int line_num, HbaLine *parsedline)
token = lfirst(line_item);
if (strcmp(token, "local") == 0)
{
+#ifdef HAVE_UNIX_SOCKETS
parsedline->conntype = ctLocal;
+#else
+ ereport(LOG,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("local connections are not supported by this build"),
+ errcontext("line %d of configuration file \"%s\"",
+ line_num, HbaFileName)));
+ return false;
+#endif
}
else if (strcmp(token, "host") == 0
|| strcmp(token, "hostssl") == 0
diff --git a/src/backend/libpq/pg_hba.conf.sample b/src/backend/libpq/pg_hba.conf.sample
index f913b9aabb..0a50905eff 100644
--- a/src/backend/libpq/pg_hba.conf.sample
+++ b/src/backend/libpq/pg_hba.conf.sample
@@ -84,6 +84,6 @@ host all all 127.0.0.1/32 @authmethod@
host all all ::1/128 @authmethod@
# Allow replication connections from localhost, by a user with the
# replication privilege.
-#local replication @default_username@ @authmethod@
+@remove-line-for-nolocal@#local replication @default_username@ @authmethodlocal@
#host replication @default_username@ 127.0.0.1/32 @authmethod@
#host replication @default_username@ ::1/128 @authmethod@
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5ed6e8337c..1d80c311d8 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1246,8 +1246,7 @@ pgstat_report_autovac(Oid dboid)
* ---------
*/
void
-pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts,
- PgStat_Counter tuples)
+pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter tuples)
{
PgStat_MsgVacuum msg;
@@ -1257,7 +1256,6 @@ pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts,
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
msg.m_tableoid = tableoid;
- msg.m_adopt_counts = adopt_counts;
msg.m_autovacuum = IsAutoVacuumWorkerProcess();
msg.m_vacuumtime = GetCurrentTimestamp();
msg.m_tuples = tuples;
@@ -1271,7 +1269,7 @@ pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts,
* --------
*/
void
-pgstat_report_analyze(Relation rel, bool adopt_counts,
+pgstat_report_analyze(Relation rel,
PgStat_Counter livetuples, PgStat_Counter deadtuples)
{
PgStat_MsgAnalyze msg;
@@ -1308,7 +1306,6 @@ pgstat_report_analyze(Relation rel, bool adopt_counts,
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
msg.m_tableoid = RelationGetRelid(rel);
- msg.m_adopt_counts = adopt_counts;
msg.m_autovacuum = IsAutoVacuumWorkerProcess();
msg.m_analyzetime = GetCurrentTimestamp();
msg.m_live_tuples = livetuples;
@@ -4197,8 +4194,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
- if (msg->m_adopt_counts)
- tabentry->n_live_tuples = msg->m_tuples;
+ tabentry->n_live_tuples = msg->m_tuples;
/* Resetting dead_tuples to 0 is an approximation ... */
tabentry->n_dead_tuples = 0;
@@ -4233,11 +4229,8 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
- if (msg->m_adopt_counts)
- {
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
- }
+ tabentry->n_live_tuples = msg->m_live_tuples;
+ tabentry->n_dead_tuples = msg->m_dead_tuples;
/*
* We reset changes_since_analyze to zero, forgetting any changes that
diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI
index a2bb63e3f8..5617182a44 100644
--- a/src/backend/storage/lmgr/README-SSI
+++ b/src/backend/storage/lmgr/README-SSI
@@ -402,6 +402,54 @@ is based on the top level xid. When looking at an xid that comes
from a tuple's xmin or xmax, for example, we always call
SubTransGetTopmostTransaction() before doing much else with it.
+ * PostgreSQL does not use "update in place" with a rollback log
+for its MVCC implementation. Where possible it uses "HOT" updates on
+the same page (if there is room and no indexed value is changed).
+For non-HOT updates the old tuple is expired in place and a new tuple
+is inserted at a new location. Because of this difference, a tuple
+lock in PostgreSQL doesn't automatically lock any other versions of a
+row. We don't try to copy or expand a tuple lock to any other
+versions of the row, based on the following proof that any additional
+serialization failures we would get from that would be false
+positives:
+
+ o If transaction T1 reads a row (thus acquiring a predicate
+lock on it) and a second transaction T2 updates that row, must a
+third transaction T3 which updates the new version of the row have a
+rw-conflict in from T1 to prevent anomalies? In other words, does it
+matter whether this edge T1 -> T3 is there?
+
+ o If T1 has a conflict in, it certainly doesn't. Adding the
+edge T1 -> T3 would create a dangerous structure, but we already had
+one from the edge T1 -> T2, so we would have aborted something
+anyway.
+
+ o Now let's consider the case where T1 doesn't have a
+conflict in. If that's the case, for this edge T1 -> T3 to make a
+difference, T3 must have a rw-conflict out that induces a cycle in
+the dependency graph, i.e. a conflict out to some transaction
+preceding T1 in the serial order. (A conflict out to T1 would work
+too, but that would mean T1 has a conflict in and we would have
+rolled back.)
+
+ o So now we're trying to figure out if there can be an
+rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+T1. For T0 to precede T1, there has to be some edge, or
+sequence of edges, from T0 to T1. At least the last edge has to be a
+wr-dependency or ww-dependency rather than a rw-conflict, because T1
+doesn't have a rw-conflict in. And that gives us enough information
+about the order of transactions to see that T3 can't have a
+rw-dependency to T0:
+ - T0 committed before T1 started (the wr/ww-dependency implies this)
+ - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+ - T2 committed before T3 started (otherwise, T3 would be aborted
+ because of an update conflict)
+
+ o That means T0 committed before T3 started, and therefore
+there can't be a rw-conflict from T3 to T0.
+
+ o In both cases, we didn't need the T1 -> T3 edge.
+
* Predicate locking in PostgreSQL will start at the tuple level
when possible, with automatic conversion of multiple fine-grained
locks to coarser granularity as need to avoid resource exhaustion.
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index dc53a7ab5b..5670f31f58 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -155,9 +155,6 @@
* BlockNumber newblkno);
* PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
* BlockNumber newblkno);
- * PredicateLockTupleRowVersionLink(const Relation relation,
- * const HeapTuple oldTuple,
- * const HeapTuple newTuple)
* ReleasePredicateLocks(bool isCommit)
*
* conflict detection (may also trigger rollback)
@@ -2252,90 +2249,6 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
PredicateLockAcquire(&tag);
}
-/*
- * If the old tuple has any predicate locks, copy them to the new target.
- *
- * This is called at an UPDATE, where any predicate locks held on the old
- * tuple need to be copied to the new tuple, because logically they both
- * represent the same row. A lock taken before the update must conflict
- * with anyone locking the same row after the update.
- */
-void
-PredicateLockTupleRowVersionLink(const Relation relation,
- const HeapTuple oldTuple,
- const HeapTuple newTuple)
-{
- PREDICATELOCKTARGETTAG oldtupletag;
- PREDICATELOCKTARGETTAG oldpagetag;
- PREDICATELOCKTARGETTAG newtupletag;
- BlockNumber oldblk,
- newblk;
- OffsetNumber oldoff,
- newoff;
- TransactionId oldxmin,
- newxmin;
-
- /*
- * Bail out quickly if there are no serializable transactions
- * running.
- *
- * It's safe to do this check without taking any additional
- * locks. Even if a serializable transaction starts concurrently,
- * we know it can't take any SIREAD locks on the modified tuple
- * because the caller is holding the associated buffer page lock.
- * Memory reordering isn't an issue; the memory barrier in the
- * LWLock acquisition guarantees that this read occurs while the
- * buffer page lock is held.
- */
- if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
- return;
-
- oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
- oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
- oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
-
- newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
- newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
- newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
-
- SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
- relation->rd_node.dbNode,
- relation->rd_id,
- oldblk,
- oldoff,
- oldxmin);
-
- SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
- relation->rd_node.dbNode,
- relation->rd_id,
- oldblk);
-
- SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
- relation->rd_node.dbNode,
- relation->rd_id,
- newblk,
- newoff,
- newxmin);
-
- /*
- * A page-level lock on the page containing the old tuple counts too.
- * Anyone holding a lock on the page is logically holding a lock on the
- * old tuple, so we need to acquire a lock on his behalf on the new tuple
- * too. However, if the new tuple is on the same page as the old one, the
- * old page-level lock already covers the new tuple.
- *
- * A relation-level lock always covers both tuple versions, so we don't
- * need to worry about those here.
- */
- LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
-
- TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
- if (newblk != oldblk)
- TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
-
- LWLockRelease(SerializablePredicateLockListLock);
-}
-
/*
* DeleteLockTarget
@@ -2650,9 +2563,15 @@ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,
/*
* Bail out quickly if there are no serializable transactions
- * running. As with PredicateLockTupleRowVersionLink, it's safe to
- * check this without taking locks because the caller is holding
- * the buffer page lock.
+ * running.
+ *
+ * It's safe to do this check without taking any additional
+ * locks. Even if a serializable transaction starts concurrently,
+ * we know it can't take any SIREAD locks on the page being split
+ * because the caller is holding the associated buffer page lock.
+ * Memory reordering isn't an issue; the memory barrier in the
+ * LWLock acquisition guarantees that this read occurs while the
+ * buffer page lock is held.
*/
if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
return;
@@ -3890,8 +3809,21 @@ FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
}
/*
- * Check whether we should roll back one of these transactions
- * instead of flagging a new rw-conflict.
+ * We are about to add a RW-edge to the dependency graph - check that we don't
+ * introduce a dangerous structure by doing so, and abort one of the
+ * transactions if so.
+ *
+ * A serialization failure can only occur if there is a dangerous structure
+ * in the dependency graph:
+ *
+ * Tin ------> Tpivot ------> Tout
+ * rw rw
+ *
+ * Furthermore, Tout must commit first.
+ *
+ * One more optimization is that if Tin is declared READ ONLY (or commits
+ * without writing), we can only have a problem if Tout committed before Tin
+ * acquired its snapshot.
*/
static void
OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
@@ -3905,100 +3837,134 @@ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
failure = false;
/*
- * Check for already-committed writer with rw-conflict out flagged. This
- * means that the reader must immediately fail.
+ * Check for already-committed writer with rw-conflict out flagged
+ * (conflict-flag on W means that T2 committed before W):
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * That is a dangerous structure, so we must abort. (Since the writer
+ * has already committed, we must be the reader)
*/
if (SxactIsCommitted(writer)
&& (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
failure = true;
/*
- * Check whether the reader has become a pivot with a committed writer. If
- * so, we must roll back unless every in-conflict either committed before
- * the writer committed or is READ ONLY and overlaps the writer.
+ * Check whether the writer has become a pivot with an out-conflict
+ * committed transaction (T2), and T2 committed first:
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * Because T2 must've committed first, there is no anomaly if:
+ * - the reader committed before T2
+ * - the writer committed before T2
+ * - the reader is a READ ONLY transaction and the reader was not
+ * concurrent with T2 (= reader acquired its snapshot after T2 committed)
*/
- if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
+ if (!failure)
{
- if (SxactHasSummaryConflictIn(reader))
+ if (SxactHasSummaryConflictOut(writer))
{
failure = true;
conflict = NULL;
}
else
conflict = (RWConflict)
- SHMQueueNext(&reader->inConflicts,
- &reader->inConflicts,
- offsetof(RWConflictData, inLink));
+ SHMQueueNext(&writer->outConflicts,
+ &writer->outConflicts,
+ offsetof(RWConflictData, outLink));
while (conflict)
{
- if (!SxactIsRolledBack(conflict->sxactOut)
- && (!SxactIsCommitted(conflict->sxactOut)
- || conflict->sxactOut->commitSeqNo >= writer->commitSeqNo)
- && (!SxactIsReadOnly(conflict->sxactOut)
- || conflict->sxactOut->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
+ SERIALIZABLEXACT *t2 = conflict->sxactIn;
+
+ if (SxactIsCommitted(t2)
+ && (!SxactIsCommitted(reader)
+ || t2->commitSeqNo <= reader->commitSeqNo)
+ && (!SxactIsCommitted(writer)
+ || t2->commitSeqNo <= writer->commitSeqNo)
+ && (!SxactIsReadOnly(reader)
+ || t2->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
{
failure = true;
break;
}
conflict = (RWConflict)
- SHMQueueNext(&reader->inConflicts,
- &conflict->inLink,
- offsetof(RWConflictData, inLink));
+ SHMQueueNext(&writer->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
}
}
/*
- * Check whether the writer has become a pivot with an out-conflict
- * committed transaction, while neither reader nor writer is committed. If
- * the reader is a READ ONLY transaction, there is only a serialization
- * failure if an out-conflict transaction causing the pivot committed
- * before the reader acquired its snapshot. (That is, the reader must not
- * have been concurrent with the out-conflict transaction.)
+ * Check whether the reader has become a pivot with a committed writer:
+ *
+ * T0 ------> R ------> W
+ * rw rw
+ *
+ * Because W must've committed first for an anomaly to occur, there is no
+ * anomaly if:
+ * - T0 committed before the writer
+ * - T0 is READ ONLY, and overlaps the writer
*/
- if (!failure && !SxactIsCommitted(writer))
+ if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
{
- if (SxactHasSummaryConflictOut(reader))
+ if (SxactHasSummaryConflictIn(reader))
{
failure = true;
conflict = NULL;
}
else
conflict = (RWConflict)
- SHMQueueNext(&writer->outConflicts,
- &writer->outConflicts,
- offsetof(RWConflictData, outLink));
+ SHMQueueNext(&reader->inConflicts,
+ &reader->inConflicts,
+ offsetof(RWConflictData, inLink));
while (conflict)
{
- if ((reader == conflict->sxactIn && SxactIsCommitted(reader))
- || (SxactIsCommitted(conflict->sxactIn)
- && !SxactIsCommitted(reader)
- && (!SxactIsReadOnly(reader)
- || conflict->sxactIn->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)))
+ SERIALIZABLEXACT *t0 = conflict->sxactOut;
+
+ if (!SxactIsRolledBack(t0)
+ && (!SxactIsCommitted(t0)
+ || t0->commitSeqNo >= writer->commitSeqNo)
+ && (!SxactIsReadOnly(t0)
+ || t0->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
{
failure = true;
break;
}
conflict = (RWConflict)
- SHMQueueNext(&writer->outConflicts,
- &conflict->outLink,
- offsetof(RWConflictData, outLink));
+ SHMQueueNext(&reader->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
}
}
if (failure)
{
+ /*
+ * We have to kill a transaction to avoid a possible anomaly from
+ * occurring. If the writer is us, we can just ereport() to cause
+ * a transaction abort. Otherwise we flag the writer for termination,
+ * causing it to abort when it tries to commit. However, if the writer
+ * is a prepared transaction, already prepared, we can't abort it
+ * anymore, so we have to kill the reader instead.
+ */
if (MySerializableXact == writer)
{
LWLockRelease(SerializableXactHashLock);
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to read/write dependencies among transactions"),
- errdetail("Cancelled on identification as pivot, during write."),
+ errdetail("Cancelled on identification as a pivot, during write."),
errhint("The transaction might succeed if retried.")));
}
else if (SxactIsPrepared(writer))
{
LWLockRelease(SerializableXactHashLock);
+
+ /* if we're not the writer, we have to be the reader */
+ Assert(MySerializableXact == reader);
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to read/write dependencies among transactions"),
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index ee82d4616c..702b9e3e9f 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -1067,13 +1067,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp,
if (xmlStrncmp(p, (xmlChar *) "'yes'", 5) == 0 ||
xmlStrncmp(p, (xmlChar *) "\"yes\"", 5) == 0)
{
- *standalone = 1;
+ if (standalone)
+ *standalone = 1;
p += 5;
}
else if (xmlStrncmp(p, (xmlChar *) "'no'", 4) == 0 ||
xmlStrncmp(p, (xmlChar *) "\"no\"", 4) == 0)
{
- *standalone = 0;
+ if (standalone)
+ *standalone = 0;
p += 4;
}
else
@@ -1218,8 +1220,8 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
{
int res_code;
size_t count;
- xmlChar *version = NULL;
- int standalone = -1;
+ xmlChar *version;
+ int standalone;
res_code = parse_xml_decl(utf8string,
&count, &version, NULL, &standalone);
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 1f31fe0694..17cff8dd5b 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -126,7 +126,8 @@ usage(void)
printf(_(" -D, --pgdata=DIRECTORY receive base backup into directory\n"));
printf(_(" -F, --format=p|t output format (plain, tar)\n"));
printf(_(" -x, --xlog include required WAL files in backup\n"));
- printf(_(" -Z, --compress=0-9 compress tar output\n"));
+ printf(_(" -z, --gzip compress tar output\n"));
+ printf(_(" -Z, --compress=0-9 compress tar output with given compression level\n"));
printf(_("\nGeneral options:\n"));
printf(_(" -c, --checkpoint=fast|spread\n"
" set fast or spread checkpointing\n"));
@@ -261,7 +262,22 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
* Base tablespaces
*/
if (strcmp(basedir, "-") == 0)
- tarfile = stdout;
+ {
+#ifdef HAVE_LIBZ
+ if (compresslevel > 0)
+ {
+ ztarfile = gzdopen(dup(fileno(stdout)), "wb");
+ if (gzsetparams(ztarfile, compresslevel, Z_DEFAULT_STRATEGY) != Z_OK)
+ {
+ fprintf(stderr, _("%s: could not set compression level %i: %s\n"),
+ progname, compresslevel, get_gz_error(ztarfile));
+ disconnect_and_exit(1);
+ }
+ }
+ else
+#endif
+ tarfile = stdout;
+ }
else
{
#ifdef HAVE_LIBZ
@@ -384,7 +400,14 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
}
}
- if (strcmp(basedir, "-") != 0)
+ if (strcmp(basedir, "-") == 0)
+ {
+#ifdef HAVE_LIBZ
+ if (ztarfile)
+ gzclose(ztarfile);
+#endif
+ }
+ else
{
#ifdef HAVE_LIBZ
if (ztarfile != NULL)
@@ -882,7 +905,7 @@ BaseBackup(void)
}
if (PQntuples(res) != 1)
{
- fprintf(stderr, _("%s: no end point returned from server\n"),
+ fprintf(stderr, _("%s: no WAL end position returned from server\n"),
progname);
disconnect_and_exit(1);
}
@@ -919,6 +942,7 @@ main(int argc, char **argv)
{"format", required_argument, NULL, 'F'},
{"checkpoint", required_argument, NULL, 'c'},
{"xlog", no_argument, NULL, 'x'},
+ {"gzip", no_argument, NULL, 'z'},
{"compress", required_argument, NULL, 'Z'},
{"label", required_argument, NULL, 'l'},
{"host", required_argument, NULL, 'h'},
@@ -978,6 +1002,13 @@ main(int argc, char **argv)
case 'l':
label = xstrdup(optarg);
break;
+ case 'z':
+#ifdef HAVE_LIBZ
+ compresslevel = Z_DEFAULT_COMPRESSION;
+#else
+ compresslevel = 1; /* will be rejected below */
+#endif
+ break;
case 'Z':
compresslevel = atoi(optarg);
if (compresslevel <= 0 || compresslevel > 9)
@@ -1076,14 +1107,6 @@ main(int argc, char **argv)
progname);
exit(1);
}
-#else
- if (compresslevel > 0 && strcmp(basedir, "-") == 0)
- {
- fprintf(stderr,
- _("%s: compression is not supported on standard output\n"),
- progname);
- exit(1);
- }
#endif
/*
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index 66d2419725..8540a75d11 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -535,6 +535,7 @@ cfopen_write(const char *path, const char *mode, int compression)
free(fname);
#else
die_horribly(NULL, modulename, "not built with zlib support\n");
+ fp = NULL; /* keep compiler quiet */
#endif
}
return fp;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index f925be1ffa..3781e599c4 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -11180,6 +11180,14 @@ dumpForeignDataWrapper(Archive *fout, FdwInfo *fdwinfo)
if (!fdwinfo->dobj.dump || dataOnly)
return;
+ /*
+ * FDWs that belong to an extension are dumped based on their "dump" field.
+ * Otherwise omit them if we are only dumping some specific object.
+ */
+ if (!fdwinfo->dobj.ext_member)
+ if (!include_everything)
+ return;
+
q = createPQExpBuffer();
delq = createPQExpBuffer();
labelq = createPQExpBuffer();
@@ -11255,7 +11263,7 @@ dumpForeignServer(Archive *fout, ForeignServerInfo *srvinfo)
char *fdwname;
/* Skip if not to be dumped */
- if (!srvinfo->dobj.dump || dataOnly)
+ if (!srvinfo->dobj.dump || dataOnly || !include_everything)
return;
q = createPQExpBuffer();
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 79c9f5d90f..cfbe0c4392 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -142,6 +142,10 @@ extern void vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
extern void vac_open_indexes(Relation relation, LOCKMODE lockmode,
int *nindexes, Relation **Irel);
extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode);
+extern double vac_estimate_reltuples(Relation relation, bool is_analyze,
+ BlockNumber total_pages,
+ BlockNumber scanned_pages,
+ double scanned_tuples);
extern void vac_update_relstats(Relation relation,
BlockNumber num_pages,
double num_tuples,
@@ -157,10 +161,10 @@ extern void vacuum_delay_point(void);
/* in commands/vacuumlazy.c */
extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
- BufferAccessStrategy bstrategy, bool *scanned_all);
+ BufferAccessStrategy bstrategy);
/* in commands/analyze.c */
extern void analyze_rel(Oid relid, VacuumStmt *vacstmt,
- BufferAccessStrategy bstrategy, bool update_reltuples);
+ BufferAccessStrategy bstrategy);
#endif /* VACUUM_H */
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index f04be95b45..5446fa0440 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -322,7 +322,6 @@ typedef struct PgStat_MsgVacuum
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
Oid m_tableoid;
- bool m_adopt_counts;
bool m_autovacuum;
TimestampTz m_vacuumtime;
PgStat_Counter m_tuples;
@@ -339,7 +338,6 @@ typedef struct PgStat_MsgAnalyze
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
Oid m_tableoid;
- bool m_adopt_counts;
bool m_autovacuum;
TimestampTz m_analyzetime;
PgStat_Counter m_live_tuples;
@@ -706,9 +704,9 @@ extern void pgstat_reset_shared_counters(const char *);
extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type type);
extern void pgstat_report_autovac(Oid dboid);
-extern void pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts,
+extern void pgstat_report_vacuum(Oid tableoid, bool shared,
PgStat_Counter tuples);
-extern void pgstat_report_analyze(Relation rel, bool adopt_counts,
+extern void pgstat_report_analyze(Relation rel,
PgStat_Counter livetuples, PgStat_Counter deadtuples);
extern void pgstat_report_recovery_conflict(int reason);
diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h
index 9a26ecf2d3..77ae8f904d 100644
--- a/src/include/storage/predicate.h
+++ b/src/include/storage/predicate.h
@@ -47,7 +47,6 @@ extern void RegisterPredicateLockingXid(const TransactionId xid);
extern void PredicateLockRelation(const Relation relation);
extern void PredicateLockPage(const Relation relation, const BlockNumber blkno);
extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple);
-extern void PredicateLockTupleRowVersionLink(const Relation relation, const HeapTuple oldTuple, const HeapTuple newTuple);
extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
extern void ReleasePredicateLocks(const bool isCommit);
diff --git a/src/interfaces/libpq/fe-auth.c b/src/interfaces/libpq/fe-auth.c
index 6f1a163a10..094926b4e6 100644
--- a/src/interfaces/libpq/fe-auth.c
+++ b/src/interfaces/libpq/fe-auth.c
@@ -693,11 +693,12 @@ pg_local_sendauth(PGconn *conn)
struct msghdr msg;
#ifdef HAVE_STRUCT_CMSGCRED
- /* Prevent padding */
- char cmsgmem[sizeof(struct cmsghdr) + sizeof(struct cmsgcred)];
-
- /* Point to start of first structure */
- struct cmsghdr *cmsg = (struct cmsghdr *) cmsgmem;
+ struct cmsghdr *cmsg;
+ union
+ {
+ struct cmsghdr hdr;
+ unsigned char buf[CMSG_SPACE(sizeof(struct cmsgcred))];
+ } cmsgbuf;
#endif
/*
@@ -713,11 +714,12 @@ pg_local_sendauth(PGconn *conn)
msg.msg_iovlen = 1;
#ifdef HAVE_STRUCT_CMSGCRED
- /* Create control header, FreeBSD */
- msg.msg_control = cmsg;
- msg.msg_controllen = sizeof(cmsgmem);
- memset(cmsg, 0, sizeof(cmsgmem));
- cmsg->cmsg_len = sizeof(cmsgmem);
+ /* FreeBSD needs us to set up a message that will be filled in by kernel */
+ memset(&cmsgbuf, 0, sizeof(cmsgbuf));
+ msg.msg_control = &cmsgbuf.buf;
+ msg.msg_controllen = sizeof(cmsgbuf.buf);
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct cmsgcred));
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_CREDS;
#endif
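Both auth.c and fe-auth.c now build the control message inside a union of struct cmsghdr and a CMSG_SPACE()-sized byte array, which keeps the buffer correctly aligned and large enough for any padding the platform inserts, instead of hand-computing the size with a raw char array. The sketch below shows the same buffer pattern, but uses the portable SCM_RIGHTS (file-descriptor passing) message rather than the BSD-specific SCM_CREDS credentials message, since the buffer handling is the point here; send_fd() is an invented helper name.

/*
 * Sketch of the aligned control-message buffer pattern: a union of
 * struct cmsghdr and a CMSG_SPACE()-sized byte array, filled in via the
 * CMSG_* macros before sendmsg().
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

static int
send_fd(int sock, int fd_to_send)
{
    struct msghdr   msg;
    struct cmsghdr *cmsg;
    union
    {
        struct cmsghdr hdr;
        unsigned char  buf[CMSG_SPACE(sizeof(int))];
    }               cmsgbuf;
    struct iovec    iov;
    char            byte = 0;

    /* one dummy data byte so the peer's recvmsg() has something to return */
    iov.iov_base = &byte;
    iov.iov_len = 1;

    memset(&msg, 0, sizeof(msg));
    memset(&cmsgbuf, 0, sizeof(cmsgbuf));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = cmsgbuf.buf;
    msg.msg_controllen = sizeof(cmsgbuf.buf);

    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));

    return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}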
diff --git a/src/pl/plperl/plperl.c b/src/pl/plperl/plperl.c
index d69d2327bb..f2e8ad2207 100644
--- a/src/pl/plperl/plperl.c
+++ b/src/pl/plperl/plperl.c
@@ -1357,7 +1357,13 @@ make_array_ref(plperl_array_info *info, int first, int last)
for (i = first; i < last; i++)
{
if (info->nulls[i])
- av_push(result, &PL_sv_undef);
+ {
+ /*
+ * We can't use &PL_sv_undef here. See "AVs, HVs and undefined
+ * values" in perlguts.
+ */
+ av_push(result, newSV(0));
+ }
else
{
Datum itemvalue = info->elements[i];
@@ -2639,8 +2645,12 @@ plperl_hash_from_tuple(HeapTuple tuple, TupleDesc tupdesc)
if (isnull)
{
- /* Store (attname => undef) and move on. */
- hv_store_string(hv, attname, &PL_sv_undef);
+ /*
+ * Store (attname => undef) and move on. Note we can't use
+ * &PL_sv_undef here; see "AVs, HVs and undefined values" in
+ * perlguts for an explanation.
+ */
+ hv_store_string(hv, attname, newSV(0));
continue;
}
diff --git a/src/test/isolation/expected/multiple-row-versions.out b/src/test/isolation/expected/multiple-row-versions.out
index cd31029d17..bbd3ecc0f7 100644
--- a/src/test/isolation/expected/multiple-row-versions.out
+++ b/src/test/isolation/expected/multiple-row-versions.out
@@ -19,6 +19,6 @@ id txt
1
step c4: COMMIT;
step c3: COMMIT;
-ERROR: could not serialize access due to read/write dependencies among transactions
step wz1: UPDATE t SET txt = 'a' WHERE id = 1;
+ERROR: could not serialize access due to read/write dependencies among transactions
step c1: COMMIT;
diff --git a/src/test/isolation/specs/multiple-row-versions.spec b/src/test/isolation/specs/multiple-row-versions.spec
index 8cfe3a44dc..1bb5b4e8ba 100644
--- a/src/test/isolation/specs/multiple-row-versions.spec
+++ b/src/test/isolation/specs/multiple-row-versions.spec
@@ -1,8 +1,7 @@
# Multiple Row Versions test
#
-# This test is designed to ensure that predicate locks taken on one version
-# of a row are detected as conflicts when a later version of the row is
-# updated or deleted by a transaction concurrent to the reader.
+# This test is designed to cover some code paths which only occur with
+# four or more transactions interacting with particular timings.
#
# Due to long permutation setup time, we are only testing one specific
# permutation, which should get a serialization error.