Diffstat (limited to 'src'): 2431 files changed, 417339 insertions, 477184 deletions
diff --git a/src/Makefile b/src/Makefile index 5706bb1335..79cfeeb710 100644 --- a/src/Makefile +++ b/src/Makefile @@ -23,11 +23,13 @@ SUBDIRS = \ backend/snowball \ include \ backend/replication/libpqwalreceiver \ + backend/replication/pgoutput \ fe_utils \ bin \ pl \ makefiles \ - test/regress + test/regress \ + test/perl # There are too many interdependencies between the subdirectories, so # don't attempt parallel make here. @@ -69,16 +71,5 @@ distclean maintainer-clean: $(MAKE) -C test/thread $@ rm -f Makefile.port Makefile.global -coverage: - $(MAKE) -C timezone $@ - $(MAKE) -C gtm $@ - $(MAKE) -C backend $@ - $(MAKE) -C backend/utils/mb/conversion_procs $@ - $(MAKE) -C backend/snowball $@ - $(MAKE) -C interfaces $@ - $(MAKE) -C backend/replication/libpqwalreceiver $@ - $(MAKE) -C bin $@ - $(MAKE) -C pl $@ - .PHONY: install-local installdirs-local uninstall-local diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 64b7b73d16..dc7b801dff 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -180,6 +180,7 @@ pgxsdir = $(pkglibdir)/pgxs # # Records the choice of the various --enable-xxx and --with-xxx options. +with_icu = @with_icu@ with_perl = @with_perl@ with_python = @with_python@ with_tcl = @with_tcl@ @@ -198,6 +199,7 @@ enable_dtrace = @enable_dtrace@ enable_coverage = @enable_coverage@ enable_tap_tests = @enable_tap_tests@ enable_thread_safety = @enable_thread_safety@ +enable_strong_random = @enable_strong_random@ python_includespec = @python_includespec@ python_libdir = @python_libdir@ @@ -208,6 +210,9 @@ python_version = @python_version@ krb_srvtab = @krb_srvtab@ +ICU_CFLAGS = @ICU_CFLAGS@ +ICU_LIBS = @ICU_LIBS@ + TCLSH = @TCLSH@ TCL_LIBS = @TCL_LIBS@ TCL_LIB_SPEC = @TCL_LIB_SPEC@ @@ -266,7 +271,6 @@ UUID_LIBS = @UUID_LIBS@ UUID_EXTRA_OBJS = @UUID_EXTRA_OBJS@ LD = @LD@ with_gnu_ld = @with_gnu_ld@ -ld_R_works = @ld_R_works@ # We want -L for libpgport.a and libpgcommon.a to be first in LDFLAGS. We # also need LDFLAGS to be a "recursively expanded" variable, else adjustments @@ -335,8 +339,11 @@ endif endif PROVE = @PROVE@ -PG_PROVE_FLAGS = -I $(top_srcdir)/src/test/perl/ -PROVE_FLAGS = --verbose +# There are common routines in src/test/perl, and some test suites have +# extra perl modules in their own directory. +PG_PROVE_FLAGS = -I $(top_srcdir)/src/test/perl/ -I $(srcdir) +# User-supplied prove flags such as --verbose can be provided in PROVE_FLAGS. 
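For context, the PROVE_FLAGS change above stops hard-wiring --verbose and leaves verbosity to the user, while the prove_installcheck/prove_check recipes just below gain PROVE_TESTS for running a subset of a suite's test files. A hypothetical invocation combining both, assuming the recovery TAP suite is present in the tree:

  make -C src/test/recovery check PROVE_FLAGS='--verbose' PROVE_TESTS='t/001_stream_rep.pl'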
+ # prepend to path if already set, else just set it define add_to_path @@ -345,7 +352,7 @@ endef # platform-specific environment variable to set shared library path define ld_library_path_var -$(if $(filter $(PORTNAME),darwin),DYLD_LIBRARY_PATH,$(if $(filter $(PORTNAME),aix),LIBPATH,LD_LIBRARY_PATH)) +$(if $(filter $(PORTNAME),darwin),DYLD_LIBRARY_PATH,$(if $(filter $(PORTNAME),aix),LIBPATH,$(if $(filter $(PORTNAME),hpux),SHLIB_PATH,LD_LIBRARY_PATH))) endef define with_temp_install @@ -356,12 +363,12 @@ ifeq ($(enable_tap_tests),yes) define prove_installcheck rm -rf $(CURDIR)/tmp_check/log -cd $(srcdir) && TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PGPORT='6$(DEF_PGPORT)' top_builddir='$(CURDIR)/$(top_builddir)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) t/*.pl +cd $(srcdir) && TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PGPORT='6$(DEF_PGPORT)' top_builddir='$(CURDIR)/$(top_builddir)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl) endef define prove_check rm -rf $(CURDIR)/tmp_check/log -cd $(srcdir) && TESTDIR='$(CURDIR)' $(with_temp_install) PGPORT='6$(DEF_PGPORT)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) t/*.pl +cd $(srcdir) && TESTDIR='$(CURDIR)' $(with_temp_install) PGPORT='6$(DEF_PGPORT)' PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl) endef else @@ -395,11 +402,8 @@ STRIP_SHARED_LIB = @STRIP_SHARED_LIB@ # Documentation -have_docbook = @have_docbook@ -COLLATEINDEX = @COLLATEINDEX@ DBTOEPUB = @DBTOEPUB@ -DOCBOOKSTYLE = @DOCBOOKSTYLE@ -JADE = @JADE@ +FOP = @FOP@ NSGMLS = @NSGMLS@ OSX = @OSX@ XMLLINT = @XMLLINT@ @@ -546,14 +550,35 @@ TEMP_CONF += --temp-config=$(TEMP_CONFIG) endif pg_regress_locale_flags = $(if $(ENCODING),--encoding=$(ENCODING)) $(NOLOCALE) - -pg_regress_check = $(with_temp_install) $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --temp-instance=./tmp_check $(TEMP_CONF) --bindir= $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) -pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --bindir='$(bindir)' $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) - -pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/ - -pg_isolation_regress_check = $(with_temp_install) $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --temp-instance=./tmp_check $(TEMP_CONF) --bindir= $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) -pg_isolation_regress_installcheck = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) +pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ tmp_check_iso/ log/ output_iso/ + +pg_regress_check = \ + $(with_temp_install) \ + $(top_builddir)/src/test/regress/pg_regress \ + --temp-instance=./tmp_check \ + --inputdir=$(srcdir) \ + --bindir= \ + $(TEMP_CONF) \ + $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) +pg_regress_installcheck = \ + $(top_builddir)/src/test/regress/pg_regress \ + --inputdir=$(srcdir) \ + --bindir='$(bindir)' \ + $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) + +pg_isolation_regress_check = \ + $(with_temp_install) \ + $(top_builddir)/src/test/isolation/pg_isolation_regress \ + 
--temp-instance=./tmp_check_iso \ + --inputdir=$(srcdir) --outputdir=output_iso \ + --bindir= \ + $(TEMP_CONF) \ + $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) +pg_isolation_regress_installcheck = \ + $(top_builddir)/src/test/isolation/pg_isolation_regress \ + --inputdir=$(srcdir) --outputdir=output_iso \ + --bindir='$(bindir)' \ + $(pg_regress_locale_flags) $(EXTRA_REGRESS_OPTS) ########################################################################## # @@ -583,6 +608,11 @@ ifneq ($(CUSTOM_COPT),) COPT= $(CUSTOM_COPT) endif +# +# These variables are meant to be set in the environment of "make" +# to add flags to whatever configure picked. Unlike the ones above, +# they are documented. +# ifdef COPT CFLAGS += $(COPT) LDFLAGS += $(COPT) @@ -623,6 +653,7 @@ TAS = @TAS@ ifdef FLEX $(FLEX) $(if $(FLEX_NO_BACKUP),-b) $(FLEXFLAGS) -o'$@' $< @$(if $(FLEX_NO_BACKUP),if [ `wc -l <lex.backup` -eq 1 ]; then rm lex.backup; else echo "Scanner requires backup; see lex.backup." 1>&2; exit 1; fi) + $(if $(FLEX_FIX_WARNING),$(PERL) $(top_srcdir)/src/tools/fix-old-flex-code.pl '$@') else @$(missing) flex $< '$@' endif diff --git a/src/Makefile.shlib b/src/Makefile.shlib index 66452cc2bd..866a2572d4 100644 --- a/src/Makefile.shlib +++ b/src/Makefile.shlib @@ -47,9 +47,8 @@ # clean-lib delete the static and shared libraries from the build dir # maintainer-clean-lib delete .def files built for win32 # -# Since `all-lib' is the first rule in this file you probably want to -# have the `all' target before including this file. In the most simple -# case it would look like this: +# Typically you would add `all-lib' to the `all' target so that `make all' +# builds the libraries. In the most simple case it would look like this: # # all: all-lib # @@ -128,7 +127,7 @@ ifeq ($(PORTNAME), darwin) else # loadable module DLSUFFIX = .so - LINK.shared = $(COMPILER) -bundle -multiply_defined suppress -Wl,-undefined,dynamic_lookup + LINK.shared = $(COMPILER) -bundle -multiply_defined suppress endif BUILD.exports = $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@ exports_file = $(SHLIB_EXPORTS:%.txt=%.list) @@ -194,7 +193,7 @@ ifeq ($(PORTNAME), hpux) # can't use the CC-syntax rpath pattern here, so instead: rpath = ifeq ($(enable_rpath), yes) - LINK.shared += +b '$(rpathdir)' + LINK.shared += +s +b '$(rpathdir)' endif # On HPUX platforms, gcc is usually configured to search for libraries # in /usr/local/lib, but ld won't do so. Add an explicit -L switch so @@ -237,30 +236,6 @@ ifeq ($(PORTNAME), solaris) endif endif -ifeq ($(PORTNAME), sco) - ifeq ($(GCC), yes) - LINK.shared = $(CC) -shared - else - LINK.shared = $(CC) -G - endif - LINK.shared += -Wl,-z,text - ifdef soname - LINK.shared += -Wl,-h,$(soname) - endif -endif - -ifeq ($(PORTNAME), unixware) - ifeq ($(GCC), yes) - LINK.shared = $(CC) -shared - else - LINK.shared = $(CC) -G - endif - LINK.shared += -Wl,-z,text - ifdef soname - LINK.shared += -Wl,-h,$(soname) - endif -endif - ifeq ($(PORTNAME), cygwin) LINK.shared = $(CC) -shared ifdef SO_MAJOR_VERSION @@ -323,7 +298,7 @@ endif endif # shlib_major # Where possible, restrict the symbols exported by the library to just the -# official list, so as to avoid unintentional ABI changes. On recent Darwin +# official list, so as to avoid unintentional ABI changes. On recent macOS # this also quiets multiply-defined-symbol warnings in programs that use # libpgport along with libpq. 
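As background for the SHLIB_EXPORTS machinery referenced here: the exports file lists one exported symbol per line followed by its ordinal, with '#' starting a comment (which is what the sed commands in the .def rules below strip and renumber). A minimal sketch with hypothetical symbol names:

  # <name> <ordinal>
  my_lib_connect    1
  my_lib_disconnect 2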
ifneq (,$(SHLIB_EXPORTS)) @@ -402,7 +377,7 @@ $(shlib): $(OBJS) $(DLL_DEFFILE) | $(SHLIB_PREREQS) $(CC) $(CFLAGS) -shared -static-libgcc -o $@ $(OBJS) $(DLL_DEFFILE) $(LDFLAGS) $(LDFLAGS_SL) $(SHLIB_LINK) $(LIBS) -Wl,--out-implib=$(stlib) endif -endif # PORTNAME == cgywin +endif # PORTNAME == cygwin endif # PORTNAME == cygwin || PORTNAME == win32 @@ -430,30 +405,22 @@ endif # PORTNAME == cygwin || PORTNAME == win32 # tarballs. ifneq (,$(SHLIB_EXPORTS)) -distprep: lib$(NAME)dll.def lib$(NAME)ddll.def blib$(NAME)dll.def +distprep: lib$(NAME)dll.def lib$(NAME)ddll.def UC_NAME = $(shell echo $(NAME) | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') lib$(NAME)dll.def: $(SHLIB_EXPORTS) - echo '; DEF file for win32.mak release build and for Makefile.shlib (MinGW)' >$@ + echo '; DEF file for Makefile.shlib (MinGW)' >$@ echo 'LIBRARY LIB$(UC_NAME).dll' >>$@ echo 'EXPORTS' >>$@ sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@ lib$(NAME)ddll.def: $(SHLIB_EXPORTS) - echo '; DEF file for win32.mak debug build' >$@ + echo '; DEF file for Makefile.shlib (MinGW)' >$@ echo 'LIBRARY LIB$(UC_NAME)D.dll' >>$@ echo 'EXPORTS' >>$@ sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ \1@ \2/' $< >>$@ -blib$(NAME)dll.def: $(SHLIB_EXPORTS) - echo '; DEF file for bcc32.mak (Borland C++ Builder)' >$@ - echo 'LIBRARY BLIB$(UC_NAME)' >>$@ - echo 'EXPORTS' >>$@ - sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ _\1@ \2/' $< >>$@ - echo >>$@ - echo '; Aliases for MS compatible names' >> $@ - sed -e '/^#/d' -e 's/^\(.*[ ]\)\([0-9][0-9]*\)/ \1= _\1/' $< | sed 's/ *$$//' >>$@ endif # SHLIB_EXPORTS @@ -542,5 +509,5 @@ clean-lib: ifneq (,$(SHLIB_EXPORTS)) maintainer-clean-lib: - rm -f lib$(NAME)dll.def lib$(NAME)ddll.def blib$(NAME)dll.def + rm -f lib$(NAME)dll.def lib$(NAME)ddll.def endif diff --git a/src/backend/Makefile b/src/backend/Makefile index faec8d8523..d9aec0e0a4 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -2,7 +2,7 @@ # # Makefile for the postgres backend # -# Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group +# Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California # # src/backend/Makefile @@ -23,15 +23,15 @@ endif SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \ pgxc main nodes optimizer port postmaster regex replication rewrite \ - storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq + statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq include $(srcdir)/common.mk # As of 1/2010: # The probes.o file is necessary for dtrace support on Solaris, and on recent # versions of systemtap. (Older systemtap releases just produce an empty -# file, but that's okay.) However, OS X's dtrace doesn't use it and doesn't -# even recognize the -G option. So, build probes.o except on Darwin. +# file, but that's okay.) However, macOS's dtrace doesn't use it and doesn't +# even recognize the -G option. So, build probes.o except on macOS. # This might need adjustment as other platforms add dtrace support. 
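For reference, probes.o is generated from the D-script provider definitions in src/backend/utils/probes.d; an abridged sketch of what such a provider stanza looks like (probe names recalled from that file, shown for illustration only):

  provider postgresql {
      probe transaction__start(LocalTransactionId);
      probe transaction__commit(LocalTransactionId);
  };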
ifneq ($(PORTNAME), darwin) ifeq ($(enable_dtrace), yes) @@ -50,6 +50,7 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \ $(top_builddir)/src/interfaces/libpq/fe-exec.o \ $(top_builddir)/src/interfaces/libpq/fe-auth.o \ $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \ + $(top_builddir)/src/interfaces/libpq/fe-auth-scram.o \ $(top_builddir)/src/gtm/client/libgtmclient.a \ $(top_builddir)/src/gtm/common/libgtm.a \ $(top_builddir)/src/gtm/libpq/libpqcomm.a @@ -78,7 +79,7 @@ ifneq ($(PORTNAME), win32) ifneq ($(PORTNAME), aix) postgres: $(OBJS) - $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@ + $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@ endif endif @@ -159,8 +160,11 @@ storage/lmgr/lwlocknames.h: storage/lmgr/generate-lwlocknames.pl storage/lmgr/lw utils/errcodes.h: utils/generate-errcodes.pl utils/errcodes.txt $(MAKE) -C utils errcodes.h +# see explanation in parser/Makefile +utils/fmgrprotos.h: utils/fmgroids.h ; + utils/fmgroids.h: utils/Gen_fmgrtab.pl catalog/Catalog.pm $(top_srcdir)/src/include/catalog/pg_proc.h - $(MAKE) -C utils fmgroids.h + $(MAKE) -C utils $(notdir $@) utils/probes.h: utils/probes.d $(MAKE) -C utils probes.h @@ -186,7 +190,7 @@ submake-schemapg: .PHONY: generated-headers -generated-headers: $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/errcodes.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/probes.h +generated-headers: $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/errcodes.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/fmgrprotos.h $(top_builddir)/src/include/utils/probes.h $(top_builddir)/src/include/parser/gram.h: parser/gram.h prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \ @@ -213,6 +217,11 @@ $(top_builddir)/src/include/utils/fmgroids.h: utils/fmgroids.h cd '$(dir $@)' && rm -f $(notdir $@) && \ $(LN_S) "$$prereqdir/$(notdir $<)" . +$(top_builddir)/src/include/utils/fmgrprotos.h: utils/fmgrprotos.h + prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \ + cd '$(dir $@)' && rm -f $(notdir $@) && \ + $(LN_S) "$$prereqdir/$(notdir $<)" . + $(top_builddir)/src/include/utils/probes.h: utils/probes.h cd '$(dir $@)' && rm -f $(notdir $@) && \ $(LN_S) "../../../$(subdir)/utils/probes.h" . 
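The newly symlinked utils/fmgrprotos.h is a generated header carrying extern declarations for every built-in function in pg_proc, which lets backend code drop hand-maintained prototypes (as the brin_minmax.c and brin_inclusion.c hunks further down do). A representative excerpt, assuming two well-known builtins:

  /* src/include/utils/fmgrprotos.h (generated) */
  extern Datum int4pl(PG_FUNCTION_ARGS);
  extern Datum textcat(PG_FUNCTION_ARGS);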
@@ -231,7 +240,7 @@ distprep: $(MAKE) -C catalog schemapg.h postgres.bki postgres.description postgres.shdescription $(MAKE) -C replication repl_gram.c repl_scanner.c syncrep_gram.c syncrep_scanner.c $(MAKE) -C storage/lmgr lwlocknames.h - $(MAKE) -C utils fmgrtab.c fmgroids.h errcodes.h + $(MAKE) -C utils fmgrtab.c fmgroids.h fmgrprotos.h errcodes.h $(MAKE) -C utils/misc guc-file.c $(MAKE) -C utils/sort qsort_tuple.c @@ -323,6 +332,7 @@ clean: $(top_builddir)/src/include/catalog/schemapg.h \ $(top_builddir)/src/include/storage/lwlocknames.h \ $(top_builddir)/src/include/utils/fmgroids.h \ + $(top_builddir)/src/include/utils/fmgrprotos.h \ $(top_builddir)/src/include/utils/probes.h ifeq ($(PORTNAME), cygwin) rm -f postgres.dll libpostgres.a @@ -351,6 +361,7 @@ maintainer-clean: distclean storage/lmgr/lwlocknames.c \ storage/lmgr/lwlocknames.h \ utils/fmgroids.h \ + utils/fmgrprotos.h \ utils/fmgrtab.c \ utils/errcodes.h \ utils/misc/guc-file.c \ diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index b194d33cc5..442a46140d 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -4,7 +4,7 @@ * * See src/backend/access/brin/README for details. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -26,8 +26,10 @@ #include "catalog/pg_am.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/autovacuum.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -59,10 +61,12 @@ typedef struct BrinOpaque BrinDesc *bo_bdesc; } BrinOpaque; +#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber + static BrinBuildState *initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, BlockNumber pagesPerRange); static void terminate_brin_buildstate(BrinBuildState *state); -static void brinsummarize(Relation index, Relation heapRel, +static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, double *numSummarized, double *numExisting); static void form_and_insert_tuple(BrinBuildState *state); static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, @@ -92,6 +96,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amstorage = true; amroutine->amclusterable = false; amroutine->ampredlocks = false; + amroutine->amcanparallel = false; amroutine->amkeytype = InvalidOid; amroutine->ambuild = brinbuild; @@ -111,6 +116,9 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amendscan = brinendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; PG_RETURN_POINTER(amroutine); } @@ -121,57 +129,95 @@ brinhandler(PG_FUNCTION_ARGS) * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * + * If autosummarization is enabled, check if we need to summarize the previous + * page range. + * * If the range is not currently summarized (i.e. the revmap returns NULL for - * it), there's nothing to do. + * it), there's nothing to do for this tuple. 
*/ bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { BlockNumber pagesPerRange; - BrinDesc *bdesc = NULL; + BlockNumber origHeapBlk; + BlockNumber heapBlk; + BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; - MemoryContext oldcxt = NULL; + MemoryContext oldcxt = CurrentMemoryContext; + bool autosummarize = BrinGetAutoSummarize(idxRel); revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); + /* + * origHeapBlk is the block number where the insertion occurred. heapBlk + * is the first block in the corresponding page range. + */ + origHeapBlk = ItemPointerGetBlockNumber(heaptid); + heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange; + for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; - BlockNumber heapBlk; int keyno; CHECK_FOR_INTERRUPTS(); - heapBlk = ItemPointerGetBlockNumber(heaptid); - /* normalize the block number to be the first block in the range */ - heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; - brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, - BUFFER_LOCK_SHARE, NULL); + /* + * If auto-summarization is enabled and we just inserted the first + * tuple into the first block of a new non-first page range, request a + * summarization run of the previous range. + */ + if (autosummarize && + heapBlk > 0 && + heapBlk == origHeapBlk && + ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) + { + BlockNumber lastPageRange = heapBlk - 1; + BrinTuple *lastPageTuple; + + lastPageTuple = + brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, + NULL, BUFFER_LOCK_SHARE, NULL); + if (!lastPageTuple) + AutoVacuumRequestWork(AVW_BRINSummarizeRange, + RelationGetRelid(idxRel), + lastPageRange); + brin_free_tuple(lastPageTuple); + } + + brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, + NULL, BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; - /* First time through? */ + /* First time through in this statement? */ if (bdesc == NULL) { + MemoryContextSwitchTo(indexInfo->ii_Context); bdesc = brin_build_desc(idxRel); + indexInfo->ii_AmCache = (void *) bdesc; + MemoryContextSwitchTo(oldcxt); + } + /* First time through in this brininsert call? */ + if (tupcxt == NULL) + { tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcxt = MemoryContextSwitchTo(tupcxt); + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(tupcxt); } - dtup = brin_deform_tuple(bdesc, brtup); + dtup = brin_deform_tuple(bdesc, brtup, NULL); /* * Compare the key values of the new tuple to the stored index values; @@ -222,7 +268,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, * re-acquiring the lock. 
*/ origsz = ItemIdGetLength(lp); - origtup = brin_copy_tuple(brtup, origsz); + origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); /* * Before releasing the lock, check if we can attempt a same-page @@ -259,12 +305,9 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); - if (bdesc != NULL) - { - brin_free_desc(bdesc); - MemoryContextSwitchTo(oldcxt); + MemoryContextSwitchTo(oldcxt); + if (tupcxt != NULL) MemoryContextDelete(tupcxt); - } return false; } @@ -320,6 +363,9 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) FmgrInfo *consistentFn; MemoryContext oldcxt; MemoryContext perRangeCxt; + BrinMemTuple *dtup; + BrinTuple *btup = NULL; + Size btupsz = 0; opaque = (BrinOpaque *) scan->opaque; bdesc = opaque->bo_bdesc; @@ -341,15 +387,16 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) */ consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); + /* allocate an initial in-memory tuple, out of the per-range memcxt */ + dtup = brin_new_memtuple(bdesc); + /* * Setup and use a per-range memory context, which is reset every time we * loop below. This avoids having to free the tuples within the loop. */ perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, "bringetbitmap cxt", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(perRangeCxt); /* @@ -360,6 +407,7 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) { bool addrange; + bool gottuple = false; BrinTuple *tup; OffsetNumber off; Size size; @@ -373,7 +421,8 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) scan->xs_snapshot); if (tup) { - tup = brin_copy_tuple(tup, size); + gottuple = true; + btup = brin_copy_tuple(tup, size, btup, &btupsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } @@ -381,15 +430,13 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) * For page ranges with no indexed tuple, we must return the whole * range; otherwise, compare it to the scan keys. */ - if (tup == NULL) + if (!gottuple) { addrange = true; } else { - BrinMemTuple *dtup; - - dtup = brin_deform_tuple(bdesc, tup); + dtup = brin_deform_tuple(bdesc, btup, dtup); if (dtup->bt_placeholder) { /* @@ -741,7 +788,7 @@ brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) brin_vacuum_scan(info->index, info->strategy); - brinsummarize(info->index, heapRel, + brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, &stats->num_index_tuples, &stats->num_index_tuples); heap_close(heapRel, AccessShareLock); @@ -759,7 +806,8 @@ brinoptions(Datum reloptions, bool validate) BrinOptions *rdopts; int numoptions; static const relopt_parse_elt tab[] = { - {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)} + {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}, + {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)} }; options = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN, @@ -786,12 +834,39 @@ brinoptions(Datum reloptions, bool validate) Datum brin_summarize_new_values(PG_FUNCTION_ARGS) { + Datum relation = PG_GETARG_DATUM(0); + + return DirectFunctionCall2(brin_summarize_range, + relation, + Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES)); +} + +/* + * SQL-callable function to summarize the indicated page range, if not already + * summarized. 
If the second argument is BRIN_ALL_BLOCKRANGES, all + * unsummarized ranges are summarized. + */ +Datum +brin_summarize_range(PG_FUNCTION_ARGS) +{ Oid indexoid = PG_GETARG_OID(0); + int64 heapBlk64 = PG_GETARG_INT64(1); + BlockNumber heapBlk; Oid heapoid; Relation indexRel; Relation heapRel; double numSummarized = 0; + if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0) + { + char *blk = psprintf(INT64_FORMAT, heapBlk64); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("block number out of range: %s", blk))); + } + heapBlk = (BlockNumber) heapBlk64; + /* * We must lock table before index to avoid deadlocks. However, if the * passed indexoid isn't an index then IndexGetRelation() will fail. @@ -831,7 +906,7 @@ brin_summarize_new_values(PG_FUNCTION_ARGS) RelationGetRelationName(indexRel)))); /* OK, do it */ - brinsummarize(indexRel, heapRel, &numSummarized, NULL); + brinsummarize(indexRel, heapRel, heapBlk, &numSummarized, NULL); relation_close(indexRel, ShareUpdateExclusiveLock); relation_close(heapRel, ShareUpdateExclusiveLock); @@ -840,6 +915,81 @@ brin_summarize_new_values(PG_FUNCTION_ARGS) } /* + * SQL-callable interface to mark a range as no longer summarized + */ +Datum +brin_desummarize_range(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + int64 heapBlk64 = PG_GETARG_INT64(1); + BlockNumber heapBlk; + Oid heapoid; + Relation heapRel; + Relation indexRel; + bool done; + + if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0) + { + char *blk = psprintf(INT64_FORMAT, heapBlk64); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("block number out of range: %s", blk))); + } + heapBlk = (BlockNumber) heapBlk64; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indexoid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + */ + heapoid = IndexGetRelation(indexoid, true); + if (OidIsValid(heapoid)) + heapRel = heap_open(heapoid, ShareUpdateExclusiveLock); + else + heapRel = NULL; + + indexRel = index_open(indexoid, ShareUpdateExclusiveLock); + + /* Must be a BRIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != BRIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a BRIN index", + RelationGetRelationName(indexRel)))); + + /* User must own the index (comparable to privileges needed for VACUUM) */ + if (!pg_class_ownercheck(indexoid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, + RelationGetRelationName(indexRel)); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. Recheck. 
+ */ + if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index %s", + RelationGetRelationName(indexRel)))); + + /* the revmap does the hard work */ + do + { + done = brinRevmapDesummarizeRange(indexRel, heapBlk); + } + while (!done); + + relation_close(indexRel, ShareUpdateExclusiveLock); + relation_close(heapRel, ShareUpdateExclusiveLock); + + PG_RETURN_VOID(); +} + +/* * Build a BrinDesc used to create or scan a BRIN index */ BrinDesc * @@ -856,9 +1006,7 @@ brin_build_desc(Relation rel) cxt = AllocSetContextCreate(CurrentMemoryContext, "brin desc cxt", - ALLOCSET_SMALL_INITSIZE, - ALLOCSET_SMALL_MINSIZE, - ALLOCSET_SMALL_MAXSIZE); + ALLOCSET_SMALL_SIZES); oldcxt = MemoryContextSwitchTo(cxt); tupdesc = RelationGetDescr(rel); @@ -909,6 +1057,27 @@ brin_free_desc(BrinDesc *bdesc) } /* + * Fetch index's statistical data into *stats + */ +void +brinGetStats(Relation index, BrinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + BrinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + stats->pagesPerRange = metadata->pagesPerRange; + stats->revmapNumPages = metadata->lastRevmapPage - 1; + + UnlockReleaseBuffer(metabuffer); +} + +/* * Initialize a BrinBuildState appropriate to create tuples on the given index. */ static BrinBuildState * @@ -1048,7 +1217,7 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, /* the placeholder tuple must exist */ if (phtup == NULL) elog(ERROR, "missing placeholder tuple"); - phtup = brin_copy_tuple(phtup, phsz); + phtup = brin_copy_tuple(phtup, phsz, NULL, NULL); LockBuffer(phbuf, BUFFER_LOCK_UNLOCK); /* merge it into the tuple from the heap scan */ @@ -1059,17 +1228,17 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, } /* - * Scan a complete BRIN index, and summarize each page range that's not already - * summarized. The index and heap must have been locked by caller in at - * least ShareUpdateExclusiveLock mode. + * Summarize page ranges that are not already summarized. If pageRange is + * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the + * page range containing the given heap page number is scanned. * * For each new index tuple inserted, *numSummarized (if not NULL) is * incremented; for each existing tuple, *numExisting (if not NULL) is * incremented. 
*/ static void -brinsummarize(Relation index, Relation heapRel, double *numSummarized, - double *numExisting) +brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, + double *numSummarized, double *numExisting) { BrinRevmap *revmap; BrinBuildState *state = NULL; @@ -1078,15 +1247,40 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized, BlockNumber heapBlk; BlockNumber pagesPerRange; Buffer buf; + BlockNumber startBlk; + BlockNumber endBlk; + + /* determine range of pages to process; nothing to do for an empty table */ + heapNumBlocks = RelationGetNumberOfBlocks(heapRel); + if (heapNumBlocks == 0) + return; revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); + if (pageRange == BRIN_ALL_BLOCKRANGES) + { + startBlk = 0; + endBlk = heapNumBlocks; + } + else + { + startBlk = (pageRange / pagesPerRange) * pagesPerRange; + /* Nothing to do if start point is beyond end of table */ + if (startBlk > heapNumBlocks) + { + brinRevmapTerminate(revmap); + return; + } + endBlk = startBlk + pagesPerRange; + if (endBlk > heapNumBlocks) + endBlk = heapNumBlocks; + } + /* * Scan the revmap to find unsummarized items. */ buf = InvalidBuffer; - heapNumBlocks = RelationGetNumberOfBlocks(heapRel); - for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange) + for (heapBlk = startBlk; heapBlk < endBlk; heapBlk += pagesPerRange) { BrinTuple *tup; OffsetNumber off; @@ -1169,11 +1363,9 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) /* Use our own memory context to avoid retail pfree */ cxt = AllocSetContextCreate(CurrentMemoryContext, "brin union", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(cxt); - db = brin_deform_tuple(bdesc, b); + db = brin_deform_tuple(bdesc, b, NULL); MemoryContextSwitchTo(oldcxt); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) diff --git a/src/backend/access/brin/brin_inclusion.c b/src/backend/access/brin/brin_inclusion.c index 0ae7a72996..bc16dd7981 100644 --- a/src/backend/access/brin/brin_inclusion.c +++ b/src/backend/access/brin/brin_inclusion.c @@ -16,7 +16,7 @@ * writing is the INET type, where IPv6 values cannot be merged with IPv4 * values. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -30,6 +30,7 @@ #include "access/skey.h" #include "catalog/pg_amop.h" #include "catalog/pg_type.h" +#include "utils/builtins.h" #include "utils/datum.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -76,10 +77,6 @@ typedef struct InclusionOpaque FmgrInfo strategy_procinfos[RTMaxStrategyNumber]; } InclusionOpaque; -Datum brin_inclusion_opcinfo(PG_FUNCTION_ARGS); -Datum brin_inclusion_add_value(PG_FUNCTION_ARGS); -Datum brin_inclusion_consistent(PG_FUNCTION_ARGS); -Datum brin_inclusion_union(PG_FUNCTION_ARGS); static FmgrInfo *inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum); static FmgrInfo *inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, @@ -431,7 +428,7 @@ brin_inclusion_consistent(PG_FUNCTION_ARGS) * It is straightforward to support the equality strategies with * the contains operator. 
Generally, inequality strategies do not * make much sense for the types which will be used with the - * inclusion BRIN family of opclasses, but is is possible to + * inclusion BRIN family of opclasses, but is possible to * implement them with logical negation of the left-of and * right-of operators. * diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c index b7c76e6eda..8f7a0c75b8 100644 --- a/src/backend/access/brin/brin_minmax.c +++ b/src/backend/access/brin/brin_minmax.c @@ -2,7 +2,7 @@ * brin_minmax.c * Implementation of Min/Max opclass for BRIN * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -16,6 +16,7 @@ #include "access/stratnum.h" #include "catalog/pg_type.h" #include "catalog/pg_amop.h" +#include "utils/builtins.h" #include "utils/datum.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -28,10 +29,6 @@ typedef struct MinmaxOpaque FmgrInfo strategy_procinfos[BTMaxStrategyNumber]; } MinmaxOpaque; -Datum brin_minmax_opcinfo(PG_FUNCTION_ARGS); -Datum brin_minmax_add_value(PG_FUNCTION_ARGS); -Datum brin_minmax_consistent(PG_FUNCTION_ARGS); -Datum brin_minmax_union(PG_FUNCTION_ARGS); static FmgrInfo *minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype, uint16 strategynum); diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 6ebfedd6a9..1725591b05 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -2,7 +2,7 @@ * brin_pageops.c * Page-handling routines for BRIN indexes * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -178,10 +178,8 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, } START_CRIT_SECTION(); - PageIndexDeleteNoCompact(oldpage, &oldoff, 1); - if (PageAddItemExtended(oldpage, (Item) newtup, newsz, oldoff, - PAI_OVERWRITE | PAI_ALLOW_FAR_OFFSET) == InvalidOffsetNumber) - elog(ERROR, "failed to add BRIN tuple"); + if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz)) + elog(ERROR, "failed to replace BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ @@ -247,7 +245,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, if (extended) brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); - PageIndexDeleteNoCompact(oldpage, &oldoff, 1); + PageIndexTupleDeleteNoCompact(oldpage, oldoff); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) @@ -289,7 +287,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ - XLogRegisterBuffer(1, revmapbuf, REGBUF_STANDARD); + XLogRegisterBuffer(1, revmapbuf, 0); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); @@ -550,6 +548,8 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, OffsetNumber off; OffsetNumber maxoff; Page page; + BrinTuple *btup = NULL; + Size btupsz = 0; page = BufferGetPage(buf); @@ -569,7 +569,7 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, { sz = ItemIdGetLength(lp); tup = (BrinTuple *) PageGetItem(page, lp); - tup = brin_copy_tuple(tup, sz); + tup = 
brin_copy_tuple(tup, sz, btup, &btupsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index 853181b3fa..fc8b10ab39 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -12,7 +12,7 @@ * the metapage. When the revmap needs to be expanded, all tuples on the * regular BRIN page at that block (if any) are moved out of the way. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -168,9 +168,12 @@ brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange, iptr = (ItemPointerData *) contents->rm_tids; iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk); - ItemPointerSet(iptr, - ItemPointerGetBlockNumber(&tid), - ItemPointerGetOffsetNumber(&tid)); + if (ItemPointerIsValid(&tid)) + ItemPointerSet(iptr, + ItemPointerGetBlockNumber(&tid), + ItemPointerGetOffsetNumber(&tid)); + else + ItemPointerSetInvalid(iptr); } /* @@ -205,7 +208,11 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, /* normalize the heap block number to be the first page in the range */ heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange; - /* Compute the revmap page number we need */ + /* + * Compute the revmap page number we need. If Invalid is returned (i.e., + * the revmap page hasn't been created yet), the requested page range is + * not summarized. + */ mapBlk = revmap_get_blkno(revmap, heapBlk); if (mapBlk == InvalidBlockNumber) { @@ -301,6 +308,138 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, } /* + * Delete an index tuple, marking a page range as unsummarized. + * + * Index must be locked in ShareUpdateExclusiveLock mode. + * + * Return FALSE if caller should retry. 
+ */ +bool +brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk) +{ + BrinRevmap *revmap; + BlockNumber pagesPerRange; + RevmapContents *contents; + ItemPointerData *iptr; + ItemPointerData invalidIptr; + BlockNumber revmapBlk; + Buffer revmapBuf; + Buffer regBuf; + Page revmapPg; + Page regPg; + OffsetNumber revmapOffset; + OffsetNumber regOffset; + ItemId lp; + BrinTuple *tup; + + revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL); + + revmapBlk = revmap_get_blkno(revmap, heapBlk); + if (!BlockNumberIsValid(revmapBlk)) + { + /* revmap page doesn't exist: range not summarized, we're done */ + brinRevmapTerminate(revmap); + return true; + } + + /* Lock the revmap page, obtain the index tuple pointer from it */ + revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk); + revmapPg = BufferGetPage(revmapBuf); + revmapOffset = HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); + + contents = (RevmapContents *) PageGetContents(revmapPg); + iptr = contents->rm_tids; + iptr += revmapOffset; + + if (!ItemPointerIsValid(iptr)) + { + /* no index tuple: range not summarized, we're done */ + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + return true; + } + + regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr)); + LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE); + regPg = BufferGetPage(regBuf); + + /* if this is no longer a regular page, tell caller to start over */ + if (!BRIN_IS_REGULAR_PAGE(regPg)) + { + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + LockBuffer(regBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + return false; + } + + regOffset = ItemPointerGetOffsetNumber(iptr); + if (regOffset > PageGetMaxOffsetNumber(regPg)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("corrupted BRIN index: inconsistent range map"))); + + lp = PageGetItemId(regPg, regOffset); + if (!ItemIdIsUsed(lp)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("corrupted BRIN index: inconsistent range map"))); + tup = (BrinTuple *) PageGetItem(regPg, lp); + /* XXX apply sanity checks? Might as well delete a bogus tuple ... */ + + /* + * We're only removing data, not reading it, so there's no need to + * TestForOldSnapshot here. + */ + + /* + * Because of SUE lock, this function shouldn't run concurrently with + * summarization. Placeholder tuples can only exist as leftovers from + * crashed summarization, so if we detect any, we complain but proceed. + */ + if (BrinTupleIsPlaceholder(tup)) + ereport(WARNING, + (errmsg("leftover placeholder tuple detected in BRIN index \"%s\", deleting", + RelationGetRelationName(idxrel)))); + + START_CRIT_SECTION(); + + ItemPointerSetInvalid(&invalidIptr); + brinSetHeapBlockItemptr(revmapBuf, revmap->rm_pagesPerRange, heapBlk, + invalidIptr); + PageIndexTupleDeleteNoCompact(regPg, regOffset); + /* XXX record free space in FSM? 
*/ + + MarkBufferDirty(regBuf); + MarkBufferDirty(revmapBuf); + + if (RelationNeedsWAL(idxrel)) + { + xl_brin_desummarize xlrec; + XLogRecPtr recptr; + + xlrec.pagesPerRange = revmap->rm_pagesPerRange; + xlrec.heapBlk = heapBlk; + xlrec.regOffset = regOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinDesummarize); + XLogRegisterBuffer(0, revmapBuf, 0); + XLogRegisterBuffer(1, regBuf, REGBUF_STANDARD); + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_DESUMMARIZE); + PageSetLSN(revmapPg, recptr); + PageSetLSN(regPg, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(regBuf); + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + + return true; +} + +/* * Given a heap block number, find the corresponding physical revmap block * number and return it. If the revmap page hasn't been allocated yet, return * InvalidBlockNumber. diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c index 64b8264959..e2e1d23377 100644 --- a/src/backend/access/brin/brin_tuple.c +++ b/src/backend/access/brin/brin_tuple.c @@ -23,7 +23,7 @@ * Note the size of the null bitmask may not be the same as that of the * datum array. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -311,17 +311,26 @@ brin_free_tuple(BrinTuple *tuple) } /* - * Create a palloc'd copy of a BrinTuple. + * Given a brin tuple of size len, create a copy of it. If 'dest' is not + * NULL, its size is destsz, and can be used as output buffer; if the tuple + * to be copied does not fit, it is enlarged by repalloc, and the size is + * updated to match. This avoids palloc/free cycles when many brin tuples + * are being processed in loops. */ BrinTuple * -brin_copy_tuple(BrinTuple *tuple, Size len) +brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz) { - BrinTuple *newtup; + if (!destsz || *destsz == 0) + dest = palloc(len); + else if (len > *destsz) + { + dest = repalloc(dest, len); + *destsz = len; + } - newtup = palloc(len); - memcpy(newtup, tuple, len); + memcpy(dest, tuple, len); - return newtup; + return dest; } /* @@ -348,56 +357,69 @@ BrinMemTuple * brin_new_memtuple(BrinDesc *brdesc) { BrinMemTuple *dtup; - char *currdatum; long basesize; - int i; basesize = MAXALIGN(sizeof(BrinMemTuple) + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); - currdatum = (char *) dtup + basesize; - for (i = 0; i < brdesc->bd_tupdesc->natts; i++) - { - dtup->bt_columns[i].bv_attno = i + 1; - dtup->bt_columns[i].bv_allnulls = true; - dtup->bt_columns[i].bv_hasnulls = false; - dtup->bt_columns[i].bv_values = (Datum *) currdatum; - currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; - } + + dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored); + dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext, "brin dtuple", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); + + brin_memtuple_initialize(dtup, brdesc); + return dtup; } /* - * Reset a BrinMemTuple to initial state + * Reset a BrinMemTuple to initial state. We return the same tuple, for + * notational convenience. 
*/ -void +BrinMemTuple * brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) { int i; + char *currdatum; MemoryContextReset(dtuple->bt_context); + + currdatum = (char *) dtuple + + MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); for (i = 0; i < brdesc->bd_tupdesc->natts; i++) { dtuple->bt_columns[i].bv_allnulls = true; dtuple->bt_columns[i].bv_hasnulls = false; + + dtuple->bt_columns[i].bv_attno = i + 1; + dtuple->bt_columns[i].bv_allnulls = true; + dtuple->bt_columns[i].bv_hasnulls = false; + dtuple->bt_columns[i].bv_values = (Datum *) currdatum; + currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; } + + return dtuple; } /* * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of * brin_form_tuple. * + * As an optimization, the caller can pass a previously allocated 'dMemtuple'. + * This avoids having to allocate it here, which can be useful when this + * function is called many times in a loop. It is caller's responsibility + * that the given BrinMemTuple matches what we need here. + * * Note we don't need the "on disk tupdesc" here; we rely on our own routine to * deconstruct the tuple from the on-disk format. */ BrinMemTuple * -brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple) +brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple) { BrinMemTuple *dtup; Datum *values; @@ -409,15 +431,16 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple) int valueno; MemoryContext oldcxt; - dtup = brin_new_memtuple(brdesc); + dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) : + brin_new_memtuple(brdesc); if (BrinTupleIsPlaceholder(tuple)) dtup->bt_placeholder = true; dtup->bt_blkno = tuple->bt_blkno; - values = palloc(sizeof(Datum) * brdesc->bd_totalstored); - allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); - hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + values = dtup->bt_values; + allnulls = dtup->bt_allnulls; + hasnulls = dtup->bt_hasnulls; tp = (char *) tuple + BrinTupleDataOffset(tuple); @@ -460,10 +483,6 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple) MemoryContextSwitchTo(oldcxt); - pfree(values); - pfree(allnulls); - pfree(hasnulls); - return dtup; } diff --git a/src/backend/access/brin/brin_validate.c b/src/backend/access/brin/brin_validate.c index 1f1011e0ac..dc23e00e89 100644 --- a/src/backend/access/brin/brin_validate.c +++ b/src/backend/access/brin/brin_validate.c @@ -3,7 +3,7 @@ * brin_validate.c * Opclass validator for BRIN. 
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -23,6 +23,7 @@ #include "catalog/pg_type.h" #include "utils/builtins.h" #include "utils/syscache.h" +#include "utils/regproc.h" /* diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 27ba0a97f8..dff7198a39 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -2,7 +2,7 @@ * brin_xlog.c * XLog replay routines for BRIN indexes * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -13,6 +13,7 @@ #include "access/brin_page.h" #include "access/brin_pageops.h" #include "access/brin_xlog.h" +#include "access/bufmask.h" #include "access/xlogutils.h" @@ -148,10 +149,8 @@ brin_xlog_update(XLogReaderState *record) page = (Page) BufferGetPage(buffer); offnum = xlrec->oldOffnum; - if (PageGetMaxOffsetNumber(page) + 1 < offnum) - elog(PANIC, "brin_xlog_update: invalid max offset number"); - PageIndexDeleteNoCompact(page, &offnum, 1); + PageIndexTupleDeleteNoCompact(page, offnum); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -189,14 +188,9 @@ brin_xlog_samepage_update(XLogReaderState *record) page = (Page) BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) + 1 < offnum) - elog(PANIC, "brin_xlog_samepage_update: invalid max offset number"); - PageIndexDeleteNoCompact(page, &offnum, 1); - offnum = PageAddItemExtended(page, (Item) brintuple, tuplen, offnum, - PAI_OVERWRITE | PAI_ALLOW_FAR_OFFSET); - if (offnum == InvalidOffsetNumber) - elog(PANIC, "brin_xlog_samepage_update: failed to add tuple"); + if (!PageIndexTupleOverwrite(page, offnum, (Item) brintuple, tuplen)) + elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -260,6 +254,46 @@ brin_xlog_revmap_extend(XLogReaderState *record) UnlockReleaseBuffer(metabuf); } +static void +brin_xlog_desummarize_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_desummarize *xlrec; + Buffer buffer; + XLogRedoAction action; + + xlrec = (xl_brin_desummarize *) XLogRecGetData(record); + + /* Update the revmap */ + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + ItemPointerData iptr; + + ItemPointerSetInvalid(&iptr); + brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, iptr); + + PageSetLSN(BufferGetPage(buffer), lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* remove the leftover entry from the regular page */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + Page regPg = BufferGetPage(buffer); + + PageIndexTupleDeleteNoCompact(regPg, xlrec->regOffset); + + PageSetLSN(regPg, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + void brin_redo(XLogReaderState *record) { @@ -282,7 +316,29 @@ brin_redo(XLogReaderState *record) case XLOG_BRIN_REVMAP_EXTEND: brin_xlog_revmap_extend(record); break; + case XLOG_BRIN_DESUMMARIZE: + brin_xlog_desummarize_page(record); + break; default: elog(PANIC, "brin_redo: unknown op code %u", info); } } + 
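The desummarize redo path above pairs with the SQL-callable functions added to brin.c earlier in this diff; a hypothetical session exercising them against an existing BRIN index (index name assumed):

  SELECT brin_desummarize_range('events_brin_idx'::regclass, 0);
  SELECT brin_summarize_range('events_brin_idx'::regclass, 0);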
+/* + * Mask a BRIN page before doing consistency checks. + */ +void +brin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + + if (BRIN_IS_REGULAR_PAGE(page)) + { + /* Regular brin pages contain unused space which needs to be masked. */ + mask_unused_space(page); + } +} diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile index 1fa6de0823..fb27944b89 100644 --- a/src/backend/access/common/Makefile +++ b/src/backend/access/common/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/access/common top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heaptuple.o indextuple.o printtup.o reloptions.o scankey.o \ - tupconvert.o tupdesc.o +OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \ + reloptions.o scankey.o tupconvert.o tupdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c new file mode 100644 index 0000000000..10253d3354 --- /dev/null +++ b/src/backend/access/common/bufmask.c @@ -0,0 +1,128 @@ +/*------------------------------------------------------------------------- + * + * bufmask.c + * Routines for buffer masking. Used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. + * + * Portions Copyright (c) 2016-2017, PostgreSQL Global Development Group + * + * Contains common routines required for masking a page. + * + * IDENTIFICATION + * src/backend/storage/buffer/bufmask.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/bufmask.h" + +/* + * mask_page_lsn + * + * In consistency checks, the LSN of the two pages compared will likely be + * different because of concurrent operations when the WAL is generated + * and the state of the page when WAL is applied. + */ +void +mask_page_lsn(Page page) +{ + PageHeader phdr = (PageHeader) page; + + PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER); +} + +/* + * mask_page_hint_bits + * + * Mask hint bits in PageHeader. We want to ignore differences in hint bits, + * since they can be set without emitting any WAL. + */ +void +mask_page_hint_bits(Page page) +{ + PageHeader phdr = (PageHeader) page; + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = MASK_MARKER; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */ + PageClearFull(page); + PageClearHasFreeLinePointers(page); + + /* + * During replay, if the page LSN has advanced past our XLOG record's LSN, + * we don't mark the page all-visible. See heap_xlog_visible() for + * details. + */ + PageClearAllVisible(page); +} + +/* + * mask_unused_space + * + * Mask the unused space of a page between pd_lower and pd_upper. + */ +void +mask_unused_space(Page page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n", + pd_lower, pd_upper, pd_special); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} + +/* + * mask_lp_flags + * + * In some index AMs, line pointer flags can be modified in master without + * emitting any WAL record. 
+ */ +void +mask_lp_flags(Page page) +{ + OffsetNumber offnum, + maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemId)) + itemId->lp_flags = LP_UNUSED; + } +} + +/* + * mask_page_content + * + * In some index AMs, the contents of deleted pages need to be almost + * completely ignored. + */ +void +mask_page_content(Page page) +{ + /* Mask Page Content */ + memset(page + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - SizeOfPageHeaderData); + + /* Mask pd_lower and pd_upper */ + memset(&((PageHeader) page)->pd_lower, MASK_MARKER, + sizeof(uint16)); + memset(&((PageHeader) page)->pd_upper, MASK_MARKER, + sizeof(uint16)); +} diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 15a18a51cc..970e3aa6c9 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -46,7 +46,7 @@ * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -875,6 +875,72 @@ heap_modify_tuple(HeapTuple tuple, } /* + * heap_modify_tuple_by_cols + * form a new tuple from an old tuple and a set of replacement values. + * + * This is like heap_modify_tuple, except that instead of specifying which + * column(s) to replace by a boolean map, an array of target column numbers + * is used. This is often more convenient when a fixed number of columns + * are to be replaced. The replCols, replValues, and replIsnull arrays must + * be of length nCols. Target column numbers are indexed from 1. + * + * The result is allocated in the current memory context. + */ +HeapTuple +heap_modify_tuple_by_cols(HeapTuple tuple, + TupleDesc tupleDesc, + int nCols, + int *replCols, + Datum *replValues, + bool *replIsnull) +{ + int numberOfAttributes = tupleDesc->natts; + Datum *values; + bool *isnull; + HeapTuple newTuple; + int i; + + /* + * allocate and fill values and isnull arrays from the tuple, then replace + * selected columns from the input arrays. + */ + values = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); + isnull = (bool *) palloc(numberOfAttributes * sizeof(bool)); + + heap_deform_tuple(tuple, tupleDesc, values, isnull); + + for (i = 0; i < nCols; i++) + { + int attnum = replCols[i]; + + if (attnum <= 0 || attnum > numberOfAttributes) + elog(ERROR, "invalid column number %d", attnum); + values[attnum - 1] = replValues[i]; + isnull[attnum - 1] = replIsnull[i]; + } + + /* + * create a new tuple from the values and isnull arrays + */ + newTuple = heap_form_tuple(tupleDesc, values, isnull); + + pfree(values); + pfree(isnull); + + /* + * copy the identification info of the old tuple: t_ctid, t_self, and OID + * (if any) + */ + newTuple->t_data->t_ctid = tuple->t_data->t_ctid; + newTuple->t_self = tuple->t_self; + newTuple->t_tableOid = tuple->t_tableOid; + if (tupleDesc->tdhasoid) + HeapTupleSetOid(newTuple, HeapTupleGetOid(tuple)); + + return newTuple; +} + +/* * heap_deform_tuple * Given a tuple, extract data into values/isnull arrays; this is * the inverse of heap_form_tuple. 
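A minimal sketch of calling the new heap_modify_tuple_by_cols, using the signature added above; the tuple, descriptor, and the choice of column 2 are assumptions for illustration:

  {
      int       replCols[1] = {2};                   /* 1-based target column */
      Datum     replValues[1] = {Int32GetDatum(42)};
      bool      replIsnull[1] = {false};
      HeapTuple newtup;

      /* build a copy of oldtup with column 2 replaced by 42 */
      newtup = heap_modify_tuple_by_cols(oldtup, tupdesc, 1,
                                         replCols, replValues, replIsnull);
  }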
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c index 274a6c2e70..2846ec8b34 100644 --- a/src/backend/access/common/indextuple.c +++ b/src/backend/access/common/indextuple.c @@ -4,7 +4,7 @@ * This file contains index tuple accessor and mutator routines, * as well as various tuple utilities. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c new file mode 100644 index 0000000000..851c3bf4de --- /dev/null +++ b/src/backend/access/common/printsimple.c @@ -0,0 +1,131 @@ +/*------------------------------------------------------------------------- + * + * printsimple.c + * Routines to print out tuples containing only a limited range of + * builtin types without catalog access. This is intended for + * backends that don't have catalog access because they are not bound + * to a specific database, such as some walsender processes. It + * doesn't handle standalone backends or protocol versions other than + * 3.0, because we don't need such handling for current applications. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/common/printsimple.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/printsimple.h" +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "libpq/pqformat.h" +#include "utils/builtins.h" + +/* + * At startup time, send a RowDescription message. + */ +void +printsimple_startup(DestReceiver *self, int operation, TupleDesc tupdesc) +{ + StringInfoData buf; + int i; + + pq_beginmessage(&buf, 'T'); /* RowDescription */ + pq_sendint(&buf, tupdesc->natts, 2); + + for (i = 0; i < tupdesc->natts; ++i) + { + Form_pg_attribute attr = tupdesc->attrs[i]; + + pq_sendstring(&buf, NameStr(attr->attname)); + pq_sendint(&buf, 0, 4); /* table oid */ + pq_sendint(&buf, 0, 2); /* attnum */ + pq_sendint(&buf, (int) attr->atttypid, 4); + pq_sendint(&buf, attr->attlen, 2); + pq_sendint(&buf, attr->atttypmod, 4); + pq_sendint(&buf, 0, 2); /* format code */ + } + + pq_endmessage(&buf); +} + +/* + * For each tuple, send a DataRow message. + */ +bool +printsimple(TupleTableSlot *slot, DestReceiver *self) +{ + TupleDesc tupdesc = slot->tts_tupleDescriptor; + StringInfoData buf; + int i; + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + /* Prepare and send message */ + pq_beginmessage(&buf, 'D'); + pq_sendint(&buf, tupdesc->natts, 2); + + for (i = 0; i < tupdesc->natts; ++i) + { + Form_pg_attribute attr = tupdesc->attrs[i]; + Datum value; + + if (slot->tts_isnull[i]) + { + pq_sendint(&buf, -1, 4); + continue; + } + + value = slot->tts_values[i]; + + /* + * We can't call the regular type output functions here because we + * might not have catalog access. Instead, we must hard-wire + * knowledge of the required types. 
+ */ + switch (attr->atttypid) + { + case TEXTOID: + { + text *t = DatumGetTextPP(value); + + pq_sendcountedtext(&buf, + VARDATA_ANY(t), + VARSIZE_ANY_EXHDR(t), + false); + } + break; + + case INT4OID: + { + int32 num = DatumGetInt32(value); + char str[12]; /* sign, 10 digits and '\0' */ + + pg_ltoa(num, str); + pq_sendcountedtext(&buf, str, strlen(str), false); + } + break; + + case INT8OID: + { + int64 num = DatumGetInt64(value); + char str[23]; /* sign, 21 digits and '\0' */ + + pg_lltoa(num, str); + pq_sendcountedtext(&buf, str, strlen(str), false); + } + break; + + default: + elog(ERROR, "unsupported type OID: %u", attr->atttypid); + } + } + + pq_endmessage(&buf); + + return true; +} diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index a44be6f96f..78704dafd9 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -6,7 +6,7 @@ * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -139,9 +139,7 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo) */ myState->tmpcontext = AllocSetContextCreate(CurrentMemoryContext, "printtup", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3) { @@ -249,9 +247,7 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats) atttypid = getBaseTypeAndTypmod(atttypid, &atttypmod); pq_sendint(&buf, (int) atttypid, sizeof(atttypid)); pq_sendint(&buf, attrs[i]->attlen, sizeof(attrs[i]->attlen)); - /* typmod appears in protocol 2.0 and up */ - if (proto >= 2) - pq_sendint(&buf, atttypmod, sizeof(atttypmod)); + pq_sendint(&buf, atttypmod, sizeof(atttypmod)); /* format info appears in protocol 3.0 and up */ if (proto >= 3) { diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index ba1f3aafed..6d1f22f049 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -3,7 +3,7 @@ * reloptions.c * Core support for relation options (pg_class.reloptions) * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -15,6 +15,8 @@ #include "postgres.h" +#include <float.h> + #include "access/gist_private.h" #include "access/hash.h" #include "access/htup_details.h" @@ -46,16 +48,61 @@ * (iii) add it to the appropriate options struct (perhaps StdRdOptions) * (iv) add it to the appropriate handling routine (perhaps * default_reloptions) - * (v) don't forget to document the option + * (v) make sure the lock level is set correctly for that operation + * (vi) don't forget to document the option * * Note that we don't handle "oids" in relOpts because it is handled by * interpretOidsOption(). + * + * The default choice for any new option should be AccessExclusiveLock. + * In some cases the lock level can be reduced from there, but the lock + * level chosen should always conflict with itself to ensure that multiple + * changes aren't lost when we attempt concurrent changes. 
+ * The choice of lock level depends completely upon how that parameter
+ * is used within the server, not upon how and when you'd like to change it.
+ * Safety first. Existing choices are documented here, and elsewhere in
+ * backend code where the parameters are used.
+ *
+ * In general, anything that affects the results obtained from a SELECT must be
+ * protected by AccessExclusiveLock.
+ *
+ * Autovacuum-related parameters can be set at ShareUpdateExclusiveLock
+ * since they are only used by the AV procs and don't change anything
+ * currently executing.
+ *
+ * Fillfactor can be set because it applies only to subsequent changes made to
+ * data blocks, as documented in hio.c.
+ *
+ * n_distinct options can be set at ShareUpdateExclusiveLock because they
+ * are only used during ANALYZE, which uses a ShareUpdateExclusiveLock,
+ * so the ANALYZE will not be affected by in-flight changes. Changing those
+ * values has no effect until the next ANALYZE, so no stronger lock is needed.
+ *
+ * Planner-related parameters can be set with ShareUpdateExclusiveLock because
+ * they only affect planning and not the correctness of the execution. Plans
+ * cannot be changed in mid-flight, so changes here could not easily result in
+ * new improved plans in any case. So we allow existing queries to continue
+ * and existing plans to survive, a small price to pay for allowing better
+ * plans to be introduced concurrently without interfering with users.
+ *
+ * Setting parallel_workers is safe, since it acts the same as
+ * max_parallel_workers_per_gather, which is a USERSET parameter that doesn't
+ * affect existing plans or queries.
 */
 
 static relopt_bool boolRelOpts[] =
 {
 	{
 		{
+			"autosummarize",
+			"Enables automatic summarization on this BRIN index",
+			RELOPT_KIND_BRIN,
+			AccessExclusiveLock
+		},
+		false
+	},
+	{
+		{
 			"autovacuum_enabled",
 			"Enables autovacuum in this relation",
 			RELOPT_KIND_HEAP | RELOPT_KIND_TOAST,
@@ -265,7 +312,7 @@ static relopt_int intRelOpts[] =
 		"effective_io_concurrency",
 		"Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
 		RELOPT_KIND_TABLESPACE,
-		AccessExclusiveLock
+		ShareUpdateExclusiveLock
 		},
 #ifdef USE_PREFETCH
 		-1, 0, MAX_IO_CONCURRENCY
@@ -278,7 +325,7 @@ static relopt_int intRelOpts[] =
 		"parallel_workers",
 		"Number of parallel processes that can be used per executor node for this relation.",
 		RELOPT_KIND_HEAP,
-		AccessExclusiveLock
+		ShareUpdateExclusiveLock
 		},
 		-1, 0, 1024
 	},
@@ -312,7 +359,7 @@ static relopt_real realRelOpts[] =
 		"seq_page_cost",
 		"Sets the planner's estimate of the cost of a sequentially fetched disk page.",
 		RELOPT_KIND_TABLESPACE,
-		AccessExclusiveLock
+		ShareUpdateExclusiveLock
 		},
 		-1, 0.0, DBL_MAX
 	},
@@ -321,7 +368,7 @@
 		"random_page_cost",
 		"Sets the planner's estimate of the cost of a nonsequentially fetched disk page.",
 		RELOPT_KIND_TABLESPACE,
-		AccessExclusiveLock
+		ShareUpdateExclusiveLock
 		},
 		-1, 0.0, DBL_MAX
 	},
@@ -330,7 +377,7 @@
 		"n_distinct",
 		"Sets the planner's estimate of the number of distinct values appearing in a column (excluding child relations).",
 		RELOPT_KIND_ATTRIBUTE,
-		AccessExclusiveLock
+		ShareUpdateExclusiveLock
 		},
 		0, -1.0, DBL_MAX
 	},
@@ -339,7 +386,7 @@
 		"n_distinct_inherited",
 		"Sets the planner's estimate of the number of distinct values appearing in a column (including child relations).",
 		RELOPT_KIND_ATTRIBUTE,
-		AccessExclusiveLock
+		ShareUpdateExclusiveLock
 		},
 		0, -1.0, DBL_MAX
 	},
@@
-722,9 +769,8 @@ transformRelOptions(Datum oldOptions, List *defList, char *namspace, for (i = 0; i < noldoptions; i++) { - text *oldoption = DatumGetTextP(oldoptions[i]); - char *text_str = VARDATA(oldoption); - int text_len = VARSIZE(oldoption) - VARHDRSZ; + char *text_str = VARDATA(oldoptions[i]); + int text_len = VARSIZE(oldoptions[i]) - VARHDRSZ; /* Search for a match in defList */ foreach(cell, defList) @@ -888,7 +934,7 @@ untransformRelOptions(Datum options) *p++ = '\0'; val = (Node *) makeString(pstrdup(p)); } - result = lappend(result, makeDefElem(pstrdup(s), val)); + result = lappend(result, makeDefElem(pstrdup(s), val, -1)); } return result; @@ -930,6 +976,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: + case RELKIND_PARTITIONED_TABLE: options = heap_reloptions(classForm->relkind, datum, false); break; case RELKIND_VIEW: @@ -962,7 +1009,8 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, * array; this is so that the caller can easily locate the default values. * * If there are no options of the given kind, numrelopts is set to 0 and NULL - * is returned. + * is returned (unless options are illegally supplied despite none being + * defined, in which case an error occurs). * * Note: values of type int, bool and real are allocated as part of the * returned array. Values of type string are allocated separately and must @@ -972,7 +1020,7 @@ relopt_value * parseRelOptions(Datum options, bool validate, relopt_kind kind, int *numrelopts) { - relopt_value *reloptions; + relopt_value *reloptions = NULL; int numoptions = 0; int i; int j; @@ -986,21 +1034,18 @@ parseRelOptions(Datum options, bool validate, relopt_kind kind, if (relOpts[i]->kinds & kind) numoptions++; - if (numoptions == 0) + if (numoptions > 0) { - *numrelopts = 0; - return NULL; - } - - reloptions = palloc(numoptions * sizeof(relopt_value)); + reloptions = palloc(numoptions * sizeof(relopt_value)); - for (i = 0, j = 0; relOpts[i]; i++) - { - if (relOpts[i]->kinds & kind) + for (i = 0, j = 0; relOpts[i]; i++) { - reloptions[j].gen = relOpts[i]; - reloptions[j].isset = false; - j++; + if (relOpts[i]->kinds & kind) + { + reloptions[j].gen = relOpts[i]; + reloptions[j].isset = false; + j++; + } } } @@ -1016,9 +1061,8 @@ parseRelOptions(Datum options, bool validate, relopt_kind kind, for (i = 0; i < noptions; i++) { - text *optiontext = DatumGetTextP(optiondatums[i]); - char *text_str = VARDATA(optiontext); - int text_len = VARSIZE(optiontext) - VARHDRSZ; + char *text_str = VARDATA(optiondatums[i]); + int text_len = VARSIZE(optiondatums[i]) - VARHDRSZ; int j; /* Search for a match in reloptions */ @@ -1382,6 +1426,9 @@ heap_reloptions(char relkind, Datum reloptions, bool validate) case RELKIND_RELATION: case RELKIND_MATVIEW: return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP); + case RELKIND_PARTITIONED_TABLE: + return default_reloptions(reloptions, validate, + RELOPT_KIND_PARTITIONED); default: /* other relkinds are not supported */ return NULL; diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c index 35391c2c60..13edca1f94 100644 --- a/src/backend/access/common/scankey.c +++ b/src/backend/access/common/scankey.c @@ -3,7 +3,7 @@ * scankey.c * scan key support code * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff 
--git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c index 4787d4ca98..392a49b522 100644 --- a/src/backend/access/common/tupconvert.c +++ b/src/backend/access/common/tupconvert.c @@ -9,7 +9,7 @@ * executor's "junkfilter" routines, but these functions work on bare * HeapTuples rather than TupleTableSlots. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -138,12 +138,14 @@ convert_tuples_by_position(TupleDesc indesc, nincols, noutcols))); /* - * Check to see if the map is one-to-one and the tuple types are the same. - * (We check the latter because if they're not, we want to do conversion - * to inject the right OID into the tuple datum.) + * Check to see if the map is one-to-one, in which case we need not do a + * tuple conversion. We must also insist that both tupdescs either + * specify or don't specify an OID column, else we need a conversion to + * add/remove space for that. (For some callers, presence or absence of + * an OID column perhaps would not really matter, but let's be safe.) */ if (indesc->natts == outdesc->natts && - indesc->tdtypeid == outdesc->tdtypeid) + indesc->tdhasoid == outdesc->tdhasoid) { for (i = 0; i < n; i++) { @@ -206,63 +208,22 @@ convert_tuples_by_name(TupleDesc indesc, { TupleConversionMap *map; AttrNumber *attrMap; - int n; + int n = outdesc->natts; int i; bool same; /* Verify compatibility and prepare attribute-number map */ - n = outdesc->natts; - attrMap = (AttrNumber *) palloc0(n * sizeof(AttrNumber)); - for (i = 0; i < n; i++) - { - Form_pg_attribute att = outdesc->attrs[i]; - char *attname; - Oid atttypid; - int32 atttypmod; - int j; - - if (att->attisdropped) - continue; /* attrMap[i] is already 0 */ - attname = NameStr(att->attname); - atttypid = att->atttypid; - atttypmod = att->atttypmod; - for (j = 0; j < indesc->natts; j++) - { - att = indesc->attrs[j]; - if (att->attisdropped) - continue; - if (strcmp(attname, NameStr(att->attname)) == 0) - { - /* Found it, check type */ - if (atttypid != att->atttypid || atttypmod != att->atttypmod) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg_internal("%s", _(msg)), - errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.", - attname, - format_type_be(outdesc->tdtypeid), - format_type_be(indesc->tdtypeid)))); - attrMap[i] = (AttrNumber) (j + 1); - break; - } - } - if (attrMap[i] == 0) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg_internal("%s", _(msg)), - errdetail("Attribute \"%s\" of type %s does not exist in type %s.", - attname, - format_type_be(outdesc->tdtypeid), - format_type_be(indesc->tdtypeid)))); - } + attrMap = convert_tuples_by_name_map(indesc, outdesc, msg); /* - * Check to see if the map is one-to-one and the tuple types are the same. - * (We check the latter because if they're not, we want to do conversion - * to inject the right OID into the tuple datum.) + * Check to see if the map is one-to-one, in which case we need not do a + * tuple conversion. We must also insist that both tupdescs either + * specify or don't specify an OID column, else we need a conversion to + * add/remove space for that. (For some callers, presence or absence of + * an OID column perhaps would not really matter, but let's be safe.) 
*/ if (indesc->natts == outdesc->natts && - indesc->tdtypeid == outdesc->tdtypeid) + indesc->tdhasoid == outdesc->tdhasoid) { same = true; for (i = 0; i < n; i++) @@ -313,6 +274,69 @@ convert_tuples_by_name(TupleDesc indesc, } /* + * Return a palloc'd bare attribute map for tuple conversion, matching input + * and output columns by name. (Dropped columns are ignored in both input and + * output.) This is normally a subroutine for convert_tuples_by_name, but can + * be used standalone. + */ +AttrNumber * +convert_tuples_by_name_map(TupleDesc indesc, + TupleDesc outdesc, + const char *msg) +{ + AttrNumber *attrMap; + int n; + int i; + + n = outdesc->natts; + attrMap = (AttrNumber *) palloc0(n * sizeof(AttrNumber)); + for (i = 0; i < n; i++) + { + Form_pg_attribute att = outdesc->attrs[i]; + char *attname; + Oid atttypid; + int32 atttypmod; + int j; + + if (att->attisdropped) + continue; /* attrMap[i] is already 0 */ + attname = NameStr(att->attname); + atttypid = att->atttypid; + atttypmod = att->atttypmod; + for (j = 0; j < indesc->natts; j++) + { + att = indesc->attrs[j]; + if (att->attisdropped) + continue; + if (strcmp(attname, NameStr(att->attname)) == 0) + { + /* Found it, check type */ + if (atttypid != att->atttypid || atttypmod != att->atttypmod) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg_internal("%s", _(msg)), + errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.", + attname, + format_type_be(outdesc->tdtypeid), + format_type_be(indesc->tdtypeid)))); + attrMap[i] = (AttrNumber) (j + 1); + break; + } + } + if (attrMap[i] == 0) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg_internal("%s", _(msg)), + errdetail("Attribute \"%s\" of type %s does not exist in type %s.", + attname, + format_type_be(outdesc->tdtypeid), + format_type_be(indesc->tdtypeid)))); + } + + return attrMap; +} + +/* * Perform conversion of a tuple according to the map. 
*/ HeapTuple diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index b56d0e336f..9fd7b4e019 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -3,7 +3,7 @@ * tupdesc.c * POSTGRES tuple descriptor support code * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/pg_collation.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "parser/parse_type.h" @@ -149,6 +150,7 @@ CreateTupleDescCopy(TupleDesc tupdesc) memcpy(desc->attrs[i], tupdesc->attrs[i], ATTRIBUTE_FIXED_PART_SIZE); desc->attrs[i]->attnotnull = false; desc->attrs[i]->atthasdef = false; + desc->attrs[i]->attidentity = '\0'; } desc->tdtypeid = tupdesc->tdtypeid; @@ -256,6 +258,7 @@ TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno, /* since we're not copying constraints or defaults, clear these */ dst->attrs[dstAttno - 1]->attnotnull = false; dst->attrs[dstAttno - 1]->atthasdef = false; + dst->attrs[dstAttno - 1]->attidentity = '\0'; } /* @@ -400,6 +403,8 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2) return false; if (attr1->atthasdef != attr2->atthasdef) return false; + if (attr1->attidentity != attr2->attidentity) + return false; if (attr1->attisdropped != attr2->attisdropped) return false; if (attr1->attislocal != attr2->attislocal) @@ -533,6 +538,7 @@ TupleDescInitEntry(TupleDesc desc, att->attnotnull = false; att->atthasdef = false; + att->attidentity = '\0'; att->attisdropped = false; att->attislocal = true; att->attinhcount = 0; @@ -554,6 +560,93 @@ TupleDescInitEntry(TupleDesc desc, } /* + * TupleDescInitBuiltinEntry + * Initialize a tuple descriptor without catalog access. Only + * a limited range of builtin types are supported. + */ +void +TupleDescInitBuiltinEntry(TupleDesc desc, + AttrNumber attributeNumber, + const char *attributeName, + Oid oidtypeid, + int32 typmod, + int attdim) +{ + Form_pg_attribute att; + + /* sanity checks */ + AssertArg(PointerIsValid(desc)); + AssertArg(attributeNumber >= 1); + AssertArg(attributeNumber <= desc->natts); + + /* initialize the attribute fields */ + att = desc->attrs[attributeNumber - 1]; + att->attrelid = 0; /* dummy value */ + + /* unlike TupleDescInitEntry, we require an attribute name */ + Assert(attributeName != NULL); + namestrcpy(&(att->attname), attributeName); + + att->attstattarget = -1; + att->attcacheoff = -1; + att->atttypmod = typmod; + + att->attnum = attributeNumber; + att->attndims = attdim; + + att->attnotnull = false; + att->atthasdef = false; + att->attidentity = '\0'; + att->attisdropped = false; + att->attislocal = true; + att->attinhcount = 0; + /* attacl, attoptions and attfdwoptions are not present in tupledescs */ + + att->atttypid = oidtypeid; + + /* + * Our goal here is to support just enough types to let basic builtin + * commands work without catalog access - e.g. so that we can do certain + * things even in processes that are not connected to a database. 
+ */
+	switch (oidtypeid)
+	{
+		case TEXTOID:
+		case TEXTARRAYOID:
+			att->attlen = -1;
+			att->attbyval = false;
+			att->attalign = 'i';
+			att->attstorage = 'x';
+			att->attcollation = DEFAULT_COLLATION_OID;
+			break;
+
+		case BOOLOID:
+			att->attlen = 1;
+			att->attbyval = true;
+			att->attalign = 'c';
+			att->attstorage = 'p';
+			att->attcollation = InvalidOid;
+			break;
+
+		case INT4OID:
+			att->attlen = 4;
+			att->attbyval = true;
+			att->attalign = 'i';
+			att->attstorage = 'p';
+			att->attcollation = InvalidOid;
+			break;
+
+		case INT8OID:
+			att->attlen = 8;
+			att->attbyval = FLOAT8PASSBYVAL;
+			att->attalign = 'd';
+			att->attstorage = 'p';
+			att->attcollation = InvalidOid;
+			break;
+	}
+}
+
+/*
  * TupleDescInitEntryCollation
  *
  * Assign a nondefault collation to a previously initialized tuple descriptor
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index fade0cbb61..990b5ffa58 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -314,10 +314,17 @@ deleted.
 
 The previous paragraph's reasoning only applies to searches, and only to
 posting trees. To protect from inserters following a downlink to a deleted
 page, vacuum simply locks out all concurrent insertions to the posting tree,
-by holding a super-exclusive lock on the posting tree root. Inserters hold a
-pin on the root page, but searches do not, so while new searches cannot begin
-while root page is locked, any already-in-progress scans can continue
-concurrently with vacuum. In the entry tree, we never delete pages.
+by holding a super-exclusive lock on the parent page of the subtree that has
+deletable pages. Inserters hold a pin on the root page, but searches do not,
+so while new searches cannot begin while the root page is locked, any
+already-in-progress scans can continue concurrently with vacuum in the
+corresponding subtree of the posting tree. To avoid interfering with readers,
+vacuum takes exclusive locks in a depth-first scan, left-to-right by page
+tuples; the leftmost page is never deleted. Thus, before deleting any page,
+we have obtained exclusive locks on all pages to its left, excluding deadlock
+with any reader despite taking the parent lock first and the left lock last.
+We take the left lock not for concurrency reasons, but because we need to
+mark that page dirty. In the entry tree, we never delete pages.
 
 (This is quite different from the mechanism the btree indexam uses to make
 page-deletions safe; it stamps the deleted pages with an XID and keeps the
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c
index aaf72a3f9e..cc7435e030 100644
--- a/src/backend/access/gin/ginarrayproc.c
+++ b/src/backend/access/gin/ginarrayproc.c
@@ -4,7 +4,7 @@
  * support functions for GIN's indexing of any array
  *
  *
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index fa383719e6..b02cb8ae58 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -4,7 +4,7 @@
  * page utilities routines for the postgres inverted index access method.
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "utils/memutils.h" @@ -30,7 +31,7 @@ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, /* * Lock buffer by needed method for search. */ -static int +int ginTraverseLock(Buffer buffer, bool searchMode) { Page page; @@ -348,9 +349,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, */ tmpCxt = AllocSetContextCreate(CurrentMemoryContext, "ginPlaceToPage temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldCxt = MemoryContextSwitchTo(tmpCxt); if (GinPageIsData(page)) diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index d6422ea91e..f07c76b90b 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -4,7 +4,7 @@ * routines for fast build of inverted index * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -255,7 +255,7 @@ qsortCompareItemPointers(const void *a, const void *b) void ginBeginBAScan(BuildAccumulator *accum) { - rb_begin_iterate(accum->tree, LeftRightWalk); + rb_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); } /* @@ -271,7 +271,7 @@ ginGetBAEntry(BuildAccumulator *accum, GinEntryAccumulator *entry; ItemPointerData *list; - entry = (GinEntryAccumulator *) rb_iterate(accum->tree); + entry = (GinEntryAccumulator *) rb_iterate(&accum->tree_walk); if (entry == NULL) return NULL; /* no more entries */ diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 97c8bf78e7..ad62d4e0e9 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -4,7 +4,7 @@ * routines for handling GIN posting tree pages. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "lib/ilist.h" #include "miscadmin.h" @@ -86,7 +87,7 @@ typedef struct char action; ItemPointerData *modifieditems; - int nmodifieditems; + uint16 nmodifieditems; /* * The following fields represent the items in this segment. If 'items' is diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 8c0bfe9fde..8c9859ce8e 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -4,7 +4,7 @@ * routines for handling GIN entry tree pages. 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "utils/rel.h" diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 59a63f28d0..0d5bb70cc9 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -7,7 +7,7 @@ * transfer pending entries into the regular index structure. This * wins because bulk insertion is much more efficient than retail. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -19,6 +19,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "access/xlog.h" #include "commands/vacuum.h" @@ -30,6 +31,7 @@ #include "postmaster/autovacuum.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/builtins.h" /* GUC parameter */ int gin_pending_list_limit = 0; @@ -808,9 +810,7 @@ ginInsertCleanup(GinState *ginstate, bool full_clean, */ opCtx = AllocSetContextCreate(CurrentMemoryContext, "GIN insert cleanup temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldCtx = MemoryContextSwitchTo(opCtx); diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 9ed9fd2dc5..610d386ff8 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -4,7 +4,7 @@ * fetch tuples from a GIN scan. 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -120,7 +120,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, Form_pg_attribute attr; /* Initialize empty bitmap result */ - scanEntry->matchBitmap = tbm_create(work_mem * 1024L); + scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL); /* Null query cannot partial-match anything */ if (scanEntry->isPartialMatch && @@ -626,8 +626,9 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, } else { - entry->btree.itemptr = advancePast; - entry->btree.itemptr.ip_posid++; + ItemPointerSet(&entry->btree.itemptr, + GinItemPointerGetBlockNumber(&advancePast), + OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast))); } entry->btree.fullScan = false; stack = ginFindLeafPage(&entry->btree, true, snapshot); @@ -979,15 +980,17 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, if (GinItemPointerGetBlockNumber(&advancePast) < GinItemPointerGetBlockNumber(&minItem)) { - advancePast.ip_blkid = minItem.ip_blkid; - advancePast.ip_posid = 0; + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&minItem), + InvalidOffsetNumber); } } else { - Assert(minItem.ip_posid > 0); - advancePast = minItem; - advancePast.ip_posid--; + Assert(GinItemPointerGetOffsetNumber(&minItem) > 0); + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&minItem), + OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem))); } /* @@ -1245,15 +1248,17 @@ scanGetItem(IndexScanDesc scan, ItemPointerData advancePast, if (GinItemPointerGetBlockNumber(&advancePast) < GinItemPointerGetBlockNumber(&key->curItem)) { - advancePast.ip_blkid = key->curItem.ip_blkid; - advancePast.ip_posid = 0; + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&key->curItem), + InvalidOffsetNumber); } } else { - Assert(key->curItem.ip_posid > 0); - advancePast = key->curItem; - advancePast.ip_posid--; + Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0); + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&key->curItem), + OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem))); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 9f784bf48d..d90faae65d 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -4,7 +4,7 @@ * insert routines for the postgres inverted index access method. 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "catalog/index.h" #include "miscadmin.h" @@ -372,9 +373,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); /* * create a temporary memory context that is used for calling @@ -382,9 +381,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context for user-defined function", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); @@ -486,41 +483,48 @@ ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, bool gininsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { - GinState ginstate; + GinState *ginstate = (GinState *) indexInfo->ii_AmCache; MemoryContext oldCtx; MemoryContext insertCtx; int i; + /* Initialize GinState cache if first call in this statement */ + if (ginstate == NULL) + { + oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context); + ginstate = (GinState *) palloc(sizeof(GinState)); + initGinState(ginstate, index); + indexInfo->ii_AmCache = (void *) ginstate; + MemoryContextSwitchTo(oldCtx); + } + insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin insert temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldCtx = MemoryContextSwitchTo(insertCtx); - initGinState(&ginstate, index); - if (GinGetUseFastUpdate(index)) { GinTupleCollector collector; memset(&collector, 0, sizeof(GinTupleCollector)); - for (i = 0; i < ginstate.origTupdesc->natts; i++) - ginHeapTupleFastCollect(&ginstate, &collector, + for (i = 0; i < ginstate->origTupdesc->natts; i++) + ginHeapTupleFastCollect(ginstate, &collector, (OffsetNumber) (i + 1), values[i], isnull[i], ht_ctid); - ginHeapTupleFastInsert(&ginstate, &collector); + ginHeapTupleFastInsert(ginstate, &collector); } else { - for (i = 0; i < ginstate.origTupdesc->natts; i++) - ginHeapTupleInsert(&ginstate, (OffsetNumber) (i + 1), + for (i = 0; i < ginstate->origTupdesc->natts; i++) + ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1), values[i], isnull[i], ht_ctid); } diff --git a/src/backend/access/gin/ginlogic.c b/src/backend/access/gin/ginlogic.c index d3e84eee97..a940a9374a 100644 --- a/src/backend/access/gin/ginlogic.c +++ b/src/backend/access/gin/ginlogic.c @@ -24,7 +24,7 @@ * is used for.) 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c index 54d5f6f630..8d2d31ac72 100644 --- a/src/backend/access/gin/ginpostinglist.c +++ b/src/backend/access/gin/ginpostinglist.c @@ -4,7 +4,7 @@ * routines for dealing with posting lists. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -79,13 +79,11 @@ itemptr_to_uint64(const ItemPointer iptr) uint64 val; Assert(ItemPointerIsValid(iptr)); - Assert(iptr->ip_posid < (1 << MaxHeapTuplesPerPageBits)); + Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits)); - val = iptr->ip_blkid.bi_hi; - val <<= 16; - val |= iptr->ip_blkid.bi_lo; + val = GinItemPointerGetBlockNumber(iptr); val <<= MaxHeapTuplesPerPageBits; - val |= iptr->ip_posid; + val |= GinItemPointerGetOffsetNumber(iptr); return val; } @@ -93,11 +91,9 @@ itemptr_to_uint64(const ItemPointer iptr) static inline void uint64_to_itemptr(uint64 val, ItemPointer iptr) { - iptr->ip_posid = val & ((1 << MaxHeapTuplesPerPageBits) - 1); + GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1)); val = val >> MaxHeapTuplesPerPageBits; - iptr->ip_blkid.bi_lo = val & 0xFFFF; - val = val >> 16; - iptr->ip_blkid.bi_hi = val & 0xFFFF; + GinItemPointerSetBlockNumber(iptr, val); Assert(ItemPointerIsValid(iptr)); } diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index c449c1cbc0..c83375d6b4 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -4,7 +4,7 @@ * routines to manage scans of inverted index relations * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -38,14 +38,10 @@ ginbeginscan(Relation rel, int nkeys, int norderbys) so->nkeys = 0; so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin scan temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); so->keyCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin scan key context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); initGinState(&so->ginstate, scan->indexRelation); scan->opaque = so; @@ -151,7 +147,7 @@ ginFillScanKey(GinScanOpaque so, OffsetNumber attnum, key->nuserentries = nUserQueryValues; key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * nQueryValues); - key->entryRes = (bool *) palloc0(sizeof(bool) * nQueryValues); + key->entryRes = (GinTernaryValue *) palloc0(sizeof(GinTernaryValue) * nQueryValues); key->query = query; key->queryValues = queryValues; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index d9146488c4..d03d59da6a 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -4,7 +4,7 @@ * Utility routines for the Postgres inverted index access method. 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/reloptions.h" #include "access/xloginsert.h" #include "catalog/pg_collation.h" @@ -22,7 +23,9 @@ #include "miscadmin.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/index_selfuncs.h" +#include "utils/typcache.h" /* @@ -47,6 +50,7 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->amstorage = true; amroutine->amclusterable = false; amroutine->ampredlocks = false; + amroutine->amcanparallel = false; amroutine->amkeytype = InvalidOid; amroutine->ambuild = ginbuild; @@ -66,6 +70,9 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->amendscan = ginendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; PG_RETURN_POINTER(amroutine); } @@ -105,9 +112,33 @@ initGinState(GinState *state, Relation index) origTupdesc->attrs[i]->attcollation); } - fmgr_info_copy(&(state->compareFn[i]), - index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), - CurrentMemoryContext); + /* + * If the compare proc isn't specified in the opclass definition, look + * up the index key type's default btree comparator. + */ + if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->compareFn[i]), + index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), + CurrentMemoryContext); + } + else + { + TypeCacheEntry *typentry; + + typentry = lookup_type_cache(origTupdesc->attrs[i]->atttypid, + TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(origTupdesc->attrs[i]->atttypid)))); + fmgr_info_copy(&(state->compareFn[i]), + &(typentry->cmp_proc_finfo), + CurrentMemoryContext); + } + + /* Opclass must always provide extract procs */ fmgr_info_copy(&(state->extractValueFn[i]), index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC), CurrentMemoryContext); diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index c258478f23..27e502a360 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -4,7 +4,7 @@ * delete & vacuum routines for the postgres GIN * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "commands/vacuum.h" #include "miscadmin.h" @@ -108,75 +109,17 @@ xlogVacuumPage(Relation index, Buffer buffer) PageSetLSN(page, recptr); } -static bool -ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer) -{ - Buffer buffer; - Page page; - bool hasVoidPage = FALSE; - MemoryContext oldCxt; - - buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, - RBM_NORMAL, gvs->strategy); - page = BufferGetPage(buffer); - - /* - * We should be sure that we don't concurrent with inserts, 
insert process
- * never release root page until end (but it can unlock it and lock
- * again). New scan can't start but previously started ones work
- * concurrently.
- */
-	if (isRoot)
-		LockBufferForCleanup(buffer);
-	else
-		LockBuffer(buffer, GIN_EXCLUSIVE);
-
-	Assert(GinPageIsData(page));
-
-	if (GinPageIsLeaf(page))
-	{
-		oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
-		ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
-		MemoryContextSwitchTo(oldCxt);
-		MemoryContextReset(gvs->tmpCxt);
-
-		/* if root is a leaf page, we don't desire further processing */
-		if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
-			hasVoidPage = TRUE;
-	}
-	else
-	{
-		OffsetNumber i;
-		bool		isChildHasVoid = FALSE;
-
-		for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
-		{
-			PostingItem *pitem = GinDataPageGetPostingItem(page, i);
-
-			if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL))
-				isChildHasVoid = TRUE;
-		}
-
-		if (isChildHasVoid)
-			hasVoidPage = TRUE;
-	}
+typedef struct DataPageDeleteStack
+{
+	struct DataPageDeleteStack *child;
+	struct DataPageDeleteStack *parent;
 
-	/*
-	 * if we have root and there are empty pages in tree, then we don't
-	 * release lock to go further processing and guarantee that tree is unused
-	 */
-	if (!(isRoot && hasVoidPage))
-	{
-		UnlockReleaseBuffer(buffer);
-	}
-	else
-	{
-		Assert(rootBuffer);
-		*rootBuffer = buffer;
-	}
+	BlockNumber blkno;			/* current block number */
+	BlockNumber leftBlkno;		/* rightmost non-deleted page to the left */
+	bool		isRoot;
+} DataPageDeleteStack;
 
-	return hasVoidPage;
-}
 
 /*
  * Delete a posting tree page.
@@ -193,8 +136,13 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 	BlockNumber rightlink;
 
 	/*
-	 * Lock the pages in the same order as an insertion would, to avoid
-	 * deadlocks: left, then right, then parent.
+	 * This function MUST be called only if some parent page holds an
+	 * exclusive cleanup lock. This guarantees that no insertions currently
+	 * happen in this subtree. The caller also holds an exclusive lock on
+	 * the deletable page itself, and has already acquired and released an
+	 * exclusive lock on the left page; then the parent and this page were
+	 * locked. We reacquire the left page's lock here only to mark that
+	 * page dirty after changing its right pointer.
 */
 	lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
 								 RBM_NORMAL, gvs->strategy);
@@ -204,10 +152,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 							 RBM_NORMAL, gvs->strategy);
 
 	LockBuffer(lBuffer, GIN_EXCLUSIVE);
-	LockBuffer(dBuffer, GIN_EXCLUSIVE);
-	if (!isParentRoot)			/* parent is already locked by
-								 * LockBufferForCleanup() */
-		LockBuffer(pBuffer, GIN_EXCLUSIVE);
 
 	START_CRIT_SECTION();
@@ -271,26 +215,15 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 		PageSetLSN(BufferGetPage(lBuffer), recptr);
 	}
 
-	if (!isParentRoot)
-		LockBuffer(pBuffer, GIN_UNLOCK);
 	ReleaseBuffer(pBuffer);
 	UnlockReleaseBuffer(lBuffer);
-	UnlockReleaseBuffer(dBuffer);
+	ReleaseBuffer(dBuffer);
 
 	END_CRIT_SECTION();
 
 	gvs->result->pages_deleted++;
 }
 
-typedef struct DataPageDeleteStack
-{
-	struct DataPageDeleteStack *child;
-	struct DataPageDeleteStack *parent;
-
-	BlockNumber blkno;			/* current block number */
-	BlockNumber leftBlkno;		/* rightest non-deleted page on left */
-	bool		isRoot;
-} DataPageDeleteStack;
-
 /*
  * scans posting tree and deletes empty pages
@@ -324,6 +257,10 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 
 	buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
 								RBM_NORMAL, gvs->strategy);
+
+	if (!isRoot)
+		LockBuffer(buffer, GIN_EXCLUSIVE);
+
 	page = BufferGetPage(buffer);
 
 	Assert(GinPageIsData(page));
@@ -358,6 +295,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 		}
 	}
 
+	if (!isRoot)
+		LockBuffer(buffer, GIN_UNLOCK);
+
 	ReleaseBuffer(buffer);
 
 	if (!meDelete)
@@ -366,37 +306,124 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 	return meDelete;
 }
 
-static void
-ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+
+/*
+ * Scan through the posting tree, deleting empty tuples from leaf pages.
+ * This function also collects empty subtrees (those with all leaves
+ * empty). For the parents of these subtrees a cleanup lock is taken and
+ * ginScanToDelete() is called; this is done for every inner page that
+ * points to an empty subtree.
+ */
+static bool
+ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
 {
-	Buffer		rootBuffer = InvalidBuffer;
-	DataPageDeleteStack root,
-			   *ptr,
-			   *tmp;
+	Buffer		buffer;
+	Page		page;
+	bool		hasVoidPage = FALSE;
+	MemoryContext oldCxt;
+
+	buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+								RBM_NORMAL, gvs->strategy);
+	page = BufferGetPage(buffer);
+
+	ginTraverseLock(buffer, false);
+
+	Assert(GinPageIsData(page));
 
-	if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE)
+	if (GinPageIsLeaf(page))
 	{
-		Assert(rootBuffer == InvalidBuffer);
-		return;
+		oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+		ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+		MemoryContextSwitchTo(oldCxt);
+		MemoryContextReset(gvs->tmpCxt);
+
+		/* if root is a leaf page, we don't need further processing */
+		if (GinDataLeafPageIsEmpty(page))
+			hasVoidPage = TRUE;
+
+		UnlockReleaseBuffer(buffer);
+
+		return hasVoidPage;
 	}
+	else
+	{
+		OffsetNumber i;
+		bool		hasEmptyChild = FALSE;
+		bool		hasNonEmptyChild = FALSE;
+		OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
+		BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));
+
+		/*
+		 * Read all children block numbers first; it is unclear whether this
+		 * is safe if there are many concurrent vacuums.
+		 */
-	memset(&root, 0, sizeof(DataPageDeleteStack));
-	root.leftBlkno = InvalidBlockNumber;
-	root.isRoot = TRUE;
+		for (i = FirstOffsetNumber; i <= maxoff; i++)
+		{
+			PostingItem *pitem = GinDataPageGetPostingItem(page, i);
 
-	vacuum_delay_point();
+			children[i] = PostingItemGetBlockNumber(pitem);
+		}
 
-	ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber);
+		UnlockReleaseBuffer(buffer);
 
-	ptr = root.child;
-	while (ptr)
-	{
-		tmp = ptr->child;
-		pfree(ptr);
-		ptr = tmp;
+		for (i = FirstOffsetNumber; i <= maxoff; i++)
+		{
+			if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE))
+				hasEmptyChild = TRUE;
+			else
+				hasNonEmptyChild = TRUE;
+		}
+
+		pfree(children);
+
+		vacuum_delay_point();
+
+		/*
+		 * The whole subtree is empty - just return TRUE so that the parent
+		 * does the cleanup, unless we are the ROOT, which has no parent.
+		 */
+
+		if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
+			return TRUE;
+
+		if (hasEmptyChild)
+		{
+			DataPageDeleteStack root,
+					   *ptr,
+					   *tmp;
+
+			buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+										RBM_NORMAL, gvs->strategy);
+			LockBufferForCleanup(buffer);
+
+			memset(&root, 0, sizeof(DataPageDeleteStack));
+			root.leftBlkno = InvalidBlockNumber;
+			root.isRoot = TRUE;
+
+			ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber);
+
+			ptr = root.child;
+
+			while (ptr)
+			{
+				tmp = ptr->child;
+				pfree(ptr);
+				ptr = tmp;
+			}
+
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/* Here we have deleted all empty subtrees */
+		return FALSE;
 	}
+}
 
-	UnlockReleaseBuffer(rootBuffer);
+static void
+ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+{
+	ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE);
 }
 
 /*
@@ -526,9 +553,7 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 	gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
 									   "Gin vacuum temporary context",
-									   ALLOCSET_DEFAULT_MINSIZE,
-									   ALLOCSET_DEFAULT_INITSIZE,
-									   ALLOCSET_DEFAULT_MAXSIZE);
+									   ALLOCSET_DEFAULT_SIZES);
 	gvs.index = index;
 	gvs.callback = callback;
 	gvs.callback_state = callback_state;
diff --git a/src/backend/access/gin/ginvalidate.c b/src/backend/access/gin/ginvalidate.c
index 032508387d..0d2847456e 100644
--- a/src/backend/access/gin/ginvalidate.c
+++ b/src/backend/access/gin/ginvalidate.c
@@ -3,7 +3,7 @@
 * ginvalidate.c
 *	  Opclass validator for GIN.
 *
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
@@ -24,6 +24,7 @@
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
+#include "utils/regproc.h"
 
 /*
@@ -237,7 +238,7 @@ ginvalidate(Oid opclassoid)
 		if (opclassgroup &&
 			(opclassgroup->functionset & (((uint64) 1) << i)) != 0)
 			continue;			/* got it */
-		if (i == GIN_COMPARE_PARTIAL_PROC)
+		if (i == GIN_COMPARE_PROC || i == GIN_COMPARE_PARTIAL_PROC)
 			continue;			/* optional method */
 		if (i == GIN_CONSISTENT_PROC || i == GIN_TRICONSISTENT_PROC)
 			continue;			/* don't need both, see check below loop */
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index b4d310f337..7ba04e324f 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -4,7 +4,7 @@
 * WAL replay logic for inverted index.
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -13,7 +13,9 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xlogutils.h" #include "utils/memutils.h" @@ -749,13 +751,43 @@ gin_xlog_startup(void) { opCtx = AllocSetContextCreate(CurrentMemoryContext, "GIN recovery temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); } void gin_xlog_cleanup(void) { MemoryContextDelete(opCtx); + opCtx = NULL; +} + +/* + * Mask a GIN page before running consistency checks on it. + */ +void +gin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + GinPageOpaque opaque; + + mask_page_lsn(page); + opaque = GinPageGetOpaque(page); + + mask_page_hint_bits(page); + + /* + * GIN metapage doesn't use pd_lower/pd_upper. Other page types do. Hence, + * we need to apply masking for those pages. + */ + if (opaque->flags != GIN_META) + { + /* + * For GIN_DELETED page, the page is initialized to empty. Hence, mask + * the page content. + */ + if (opaque->flags & GIN_DELETED) + mask_page_content(page); + else + mask_unused_space(page); + } } diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index dd4c9fa70a..02228662b8 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -28,7 +28,7 @@ The current implementation of GiST supports: The support for concurrency implemented in PostgreSQL was developed based on the paper "Access Methods for Next-Generation Database Systems" by -Marcel Kornaker: +Marcel Kornacker: http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 9a417ca2f4..6593771361 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -4,7 +4,7 @@ * interface routines for the postgres GiST index access method. 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -18,6 +18,8 @@ #include "access/gistscan.h" #include "catalog/pg_collation.h" #include "miscadmin.h" +#include "nodes/execnodes.h" +#include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -69,6 +71,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amstorage = true; amroutine->amclusterable = true; amroutine->ampredlocks = false; + amroutine->amcanparallel = false; amroutine->amkeytype = InvalidOid; amroutine->ambuild = gistbuild; @@ -88,6 +91,9 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amendscan = gistendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; PG_RETURN_POINTER(amroutine); } @@ -105,9 +111,7 @@ createTempGistContext(void) { return AllocSetContextCreate(CurrentMemoryContext, "GiST temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); } /* @@ -142,21 +146,23 @@ gistbuildempty(Relation index) bool gistinsert(Relation r, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { + GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; IndexTuple itup; - GISTSTATE *giststate; MemoryContext oldCxt; - giststate = initGISTstate(r); + /* Initialize GISTSTATE cache if first call in this statement */ + if (giststate == NULL) + { + oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context); + giststate = initGISTstate(r); + giststate->tempCxt = createTempGistContext(); + indexInfo->ii_AmCache = (void *) giststate; + MemoryContextSwitchTo(oldCxt); + } - /* - * We use the giststate's scan context as temp context too. This means - * that any memory leaked by the support functions is not reclaimed until - * end of insert. In most cases, we aren't going to call the support - * functions very many times before finishing the insert, so this seems - * cheaper than resetting a temp context for each function call. - */ oldCxt = MemoryContextSwitchTo(giststate->tempCxt); itup = gistFormTuple(giststate, r, @@ -167,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull, /* cleanup */ MemoryContextSwitchTo(oldCxt); - freeGISTstate(giststate); + MemoryContextReset(giststate->tempCxt); return false; } @@ -495,18 +501,36 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, else { /* - * Enough space. We also get here if ntuples==0. + * Enough space. We always get here if ntup==0. */ START_CRIT_SECTION(); /* - * While we delete only one tuple at once we could mix calls - * PageIndexTupleDelete() here and PageIndexMultiDelete() in - * gistRedoPageUpdateRecord() + * Delete old tuple if any, then insert new tuple(s) if any. If + * possible, use the fast path of PageIndexTupleOverwrite. 
*/ if (OffsetNumberIsValid(oldoffnum)) - PageIndexTupleDelete(page, oldoffnum); - gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + { + if (ntup == 1) + { + /* One-for-one replacement, so use PageIndexTupleOverwrite */ + if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup, + IndexTupleSize(*itup))) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(rel)); + } + else + { + /* Delete old, then append new tuple(s) to page */ + PageIndexTupleDelete(page, oldoffnum); + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + } + else + { + /* Just append new tuples at the end of the page */ + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } MarkBufferDirty(buffer); @@ -1411,9 +1435,7 @@ initGISTstate(Relation index) /* Create the memory context that will hold the GISTSTATE */ scanCxt = AllocSetContextCreate(CurrentMemoryContext, "GiST scan context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldCxt = MemoryContextSwitchTo(scanCxt); /* Create and fill in the GISTSTATE */ diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 4e43a6932a..f1f08bb3d8 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -4,7 +4,7 @@ * build algorithm for GiST indexes implementation. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -18,6 +18,7 @@ #include "access/genam.h" #include "access/gist_private.h" +#include "access/gistxlog.h" #include "access/xloginsert.h" #include "catalog/index.h" #include "miscadmin.h" diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 8e3fcfbdc1..ca4c32b3fe 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -4,7 +4,7 @@ * node buffer management functions for GiST buffering build algorithm. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 5ba7d0a793..5a4dea89ac 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -4,7 +4,7 @@ * fetch tuples from a GiST scan. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -375,6 +375,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, } so->nPageData = so->curPageData = 0; + scan->xs_hitup = NULL; /* might point into pageDataCxt */ if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); @@ -441,12 +442,13 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, so->pageData[so->nPageData].offnum = i; /* - * In an index-only scan, also fetch the data from the tuple. + * In an index-only scan, also fetch the data from the tuple. The + * reconstructed tuples are stored in pageDataCxt. 
*/ if (scan->xs_want_itup) { oldcxt = MemoryContextSwitchTo(so->pageDataCxt); - so->pageData[so->nPageData].ftup = + so->pageData[so->nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); } @@ -478,7 +480,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) - item->data.heap.ftup = gistFetchTuple(giststate, r, it); + item->data.heap.recontup = gistFetchTuple(giststate, r, it); } else { @@ -540,11 +542,11 @@ getNextNearest(IndexScanDesc scan) bool res = false; int i; - if (scan->xs_itup) + if (scan->xs_hitup) { /* free previously returned tuple */ - pfree(scan->xs_itup); - scan->xs_itup = NULL; + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; } do @@ -601,7 +603,7 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. */ if (scan->xs_want_itup) - scan->xs_itup = item->data.heap.ftup; + scan->xs_hitup = item->data.heap.recontup; res = true; } else @@ -641,6 +643,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) so->firstCall = false; so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); @@ -685,7 +688,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) /* in an index-only scan, also return the reconstructed tuple */ if (scan->xs_want_itup) - scan->xs_itup = so->pageData[so->curPageData].ftup; + scan->xs_hitup = so->pageData[so->curPageData].recontup; so->curPageData++; @@ -765,6 +768,7 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* Begin the scan by processing the root page */ so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index d47211afc0..15b89fd8ad 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -7,7 +7,7 @@ * This gives R-tree behavior, with Guttman's poly-time split algorithm. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -17,6 +17,7 @@ */ #include "postgres.h" +#include <float.h> #include <math.h> #include "access/gist.h" diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 6f07cd8d46..058544e2ae 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -4,7 +4,7 @@ * routines to manage scans on GiST index relations * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -125,7 +125,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, * which is created on the second call and reset on later calls. Thus, in * the common case where a scan is only rescan'd once, we just put the * queue in scanCxt and don't pay the overhead of making a second memory - * context. If we do rescan more than once, the first RBTree is just left + * context. If we do rescan more than once, the first queue is just left * for dead until end of scan; this small wastage seems worth the savings * in the common case. 
*/ @@ -140,9 +140,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, /* second time through */ so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST queue context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); first_time = false; } else @@ -157,7 +155,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, * tuple descriptor to represent the returned index tuples and create a * memory context to hold them during the scan. */ - if (scan->xs_want_itup && !scan->xs_itupdesc) + if (scan->xs_want_itup && !scan->xs_hitupdesc) { int natts; int attno; @@ -176,16 +174,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, scan->indexRelation->rd_opcintype[attno - 1], -1, 0); } - scan->xs_itupdesc = so->giststate->fetchTupdesc; + scan->xs_hitupdesc = so->giststate->fetchTupdesc; + /* Also create a memory context that will hold the returned tuples */ so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST page data context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); } - /* create new, empty RBTree for search queue */ + /* create new, empty pairing heap for search queue */ oldCxt = MemoryContextSwitchTo(so->queueCxt); so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); MemoryContextSwitchTo(oldCxt); @@ -316,6 +313,9 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, if (!first_time) pfree(fn_extras); } + + /* any previous xs_hitup will have been pfree'd in context resets above */ + scan->xs_hitup = NULL; } void diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c index d394969a57..cffc5ddc75 100644 --- a/src/backend/access/gist/gistsplit.c +++ b/src/backend/access/gist/gistsplit.c @@ -15,7 +15,7 @@ * gistSplitByKey() is the entry point to this file. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 26d4a64694..cbdaec9d2b 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -4,7 +4,7 @@ * utilities routines for the postgres GiST index access method. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include <float.h> #include <math.h> #include "access/gist_private.h" @@ -624,9 +625,9 @@ gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r) /* * Fetch all keys in tuple. - * returns new IndexTuple that contains GISTENTRY with fetched data + * Returns a new HeapTuple containing the originally-indexed data. 
*/ -IndexTuple +HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) { MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt); @@ -660,7 +661,7 @@ gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) } MemoryContextSwitchTo(oldcxt); - return index_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); + return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); } float @@ -852,7 +853,7 @@ gistproperty(Oid index_oid, int attno, bool *res, bool *isnull) { HeapTuple tuple; - Form_pg_index rd_index; + Form_pg_index rd_index PG_USED_FOR_ASSERTS_ONLY; Form_pg_opclass rd_opclass; Datum datum; bool disnull; diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 53e5cea580..77d9d12f0b 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -4,7 +4,7 @@ * vacuuming routines for the postgres GiST index access method. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c index ffd7fd631b..585c92be26 100644 --- a/src/backend/access/gist/gistvalidate.c +++ b/src/backend/access/gist/gistvalidate.c @@ -3,7 +3,7 @@ * gistvalidate.c * Opclass validator for GiST. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -23,6 +23,7 @@ #include "catalog/pg_type.h" #include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/regproc.h" #include "utils/syscache.h" diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 01c7ef7ea6..4f4fe8fab5 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -4,7 +4,7 @@ * WAL replay logic for GiST. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -13,7 +13,9 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/gist_private.h" +#include "access/gistxlog.h" #include "access/xloginsert.h" #include "access/xlogutils.h" #include "utils/memutils.h" @@ -80,9 +82,31 @@ gistRedoPageUpdateRecord(XLogReaderState *record) page = (Page) BufferGetPage(buffer); - /* Delete old tuples */ - if (xldata->ntodelete > 0) + if (xldata->ntodelete == 1 && xldata->ntoinsert == 1) { + /* + * When replacing one tuple with one other tuple, we must use + * PageIndexTupleOverwrite for consistency with gistplacetopage. 
+ */ + OffsetNumber offnum = *((OffsetNumber *) data); + IndexTuple itup; + Size itupsize; + + data += sizeof(OffsetNumber); + itup = (IndexTuple) data; + itupsize = IndexTupleSize(itup); + if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize)) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) itupsize); + data += itupsize; + /* should be nothing left after consuming 1 tuple */ + Assert(data - begin == datalen); + /* update insertion count for assert check below */ + ninserted++; + } + else if (xldata->ntodelete > 0) + { + /* Otherwise, delete old tuples if any */ OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; @@ -92,7 +116,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record) GistMarkTuplesDeleted(page); } - /* add tuples */ + /* Add new tuples if any */ if (data - begin < datalen) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : @@ -115,6 +139,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record) } } + /* Check that XLOG record contained expected number of tuples */ Assert(ninserted == xldata->ntoinsert); PageSetLSN(page, lsn); @@ -320,6 +345,48 @@ gist_xlog_cleanup(void) } /* + * Mask a Gist page before running consistency checks on it. + */ +void +gist_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + /* + * NSN is nothing but a special purpose LSN. Hence, mask it for the same + * reason as mask_page_lsn. + */ + GistPageSetNSN(page, (uint64) MASK_MARKER); + + /* + * We update F_FOLLOW_RIGHT flag on the left child after writing WAL + * record. Hence, mask this flag. See gistplacetopage() for details. + */ + GistMarkFollowRight(page); + + if (GistPageIsLeaf(page)) + { + /* + * In gist leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * gistkillitems() for details. + */ + mask_lp_flags(page); + } + + /* + * During gist redo, we never mark a page as garbage. Hence, mask it to + * ignore any differences. + */ + GistClearPageHasGarbage(page); +} + +/* * Write WAL record of a page split. */ XLogRecPtr diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile index 5d3bd94d3e..b154569b46 100644 --- a/src/backend/access/hash/Makefile +++ b/src/backend/access/hash/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/access/hash top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \ - hashsearch.o hashsort.o hashutil.o hashvalidate.o +OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o \ + hashsort.o hashutil.o hashvalidate.o hash_xlog.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index 0a7da89285..c8a0ec78a9 100644 --- a/src/backend/access/hash/README +++ b/src/backend/access/hash/README @@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to move primary bucket pages around after they are created. Primary bucket pages (henceforth just "bucket pages") are allocated in -power-of-2 groups, called "split points" in the code. Buckets 0 and 1 -are created when the index is initialized. At the first split, buckets 2 -and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated; -when bucket 8 is needed, buckets 8-15 are allocated; etc. 
All the bucket -pages of a power-of-2 group appear consecutively in the index. This -addressing scheme allows the physical location of a bucket page to be -computed from the bucket number relatively easily, using only a small -amount of control information. We take the log2() of the bucket number -to determine which split point S the bucket belongs to, and then simply -add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the -metapage) to compute the physical address. hashm_spares[S] can be -interpreted as the total number of overflow pages that have been allocated -before the bucket pages of splitpoint S. hashm_spares[0] is always 0, -so that buckets 0 and 1 (which belong to splitpoint 0) always appear at -block numbers 1 and 2, just after the meta page. We always have -hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the -former. The difference between the two represents the number of overflow -pages appearing between the bucket page groups of splitpoints N and N+1. - +power-of-2 groups, called "split points" in the code. That means at every new +splitpoint we double the existing number of buckets. Allocating a huge chunk +of bucket pages all at once isn't optimal, since it could take ages to consume +them. To avoid this exponential growth of index size, we use a trick: the +allocation of buckets at a splitpoint is broken up into 4 equal phases. If +(2 ^ x) buckets in total are to be allocated at a splitpoint (from now on +called a splitpoint group), then we allocate a quarter of them, (2 ^ (x - 2)), +in each phase of the group. The next quarter is allocated only once the +buckets of the previous phase have been consumed. +For the initial splitpoint groups (< 10) we allocate all of their buckets in a +single phase, since the numbers of buckets involved are small. For groups +>= 10 the allocation is distributed among four equal phases. At group 10 we +allocate (2 ^ 9) buckets in 4 phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, where the +numbers in curly braces indicate the number of buckets allocated within each +phase of splitpoint group 10. For splitpoint groups 11 and 12 the phases are +{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. +Thus at each splitpoint group we double the total number of buckets relative +to the previous group, but incrementally. The bucket pages +allocated within one phase of a splitpoint group appear consecutively in +the index. This addressing scheme allows the physical location of a bucket +page to be computed from the bucket number relatively easily, using only a +small amount of control information. As the function _hash_spareindex shows, +for a given bucket number we first compute the splitpoint group it belongs to +and then the phase within that group; adding the two gives the global +splitpoint phase number S to which the bucket belongs. We then simply add +"hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the +metapage) to the bucket number to compute its physical address. +hashm_spares[S] can be interpreted as the total number of overflow pages that +have been allocated before the bucket pages of splitpoint phase S. +hashm_spares[0] is always 0, so that buckets 0 and 1 +always appear at block numbers 1 and 2, just after the meta page. We always +have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the +former.
The difference between the two represents the number of overflow pages +appearing between the bucket page groups of splitpoint phases N and N+1. (Note: the above describes what happens when filling an initially minimally -sized hash index. In practice, we try to estimate the required index size -and allocate a suitable number of splitpoints immediately, to avoid +sized hash index. In practice, we try to estimate the required index size and +allocate a suitable number of splitpoint phases immediately, to avoid expensive re-splitting during initial index build.) When S splitpoints exist altogether, the array entries hashm_spares[0] through hashm_spares[S] are valid; hashm_spares[S] records the current total number of overflow pages. New overflow pages are created as needed at the end of the index, and recorded by incrementing hashm_spares[S]. -When it is time to create a new splitpoint's worth of bucket pages, we +When it is time to create a new splitpoint phase's worth of bucket pages, we copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is stored in the hashm_ovflpoint field of the meta page). This has the effect of reserving the correct number of bucket pages at the end of the @@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during an index extension we crash after allocating filesystem space and before updating the metapage. Note that on filesystems that allow "holes" in files, it's entirely likely that pages before the logical EOF are not yet -allocated: when we allocate a new splitpoint's worth of bucket pages, we +allocated: when we allocate a new splitpoint phase's worth of bucket pages, we physically zero the last such page to force the EOF up, and the first such page will be used immediately, but the intervening pages are not written until needed. @@ -126,53 +142,98 @@ the initially created buckets. Lock Definitions ---------------- -We use both lmgr locks ("heavyweight" locks) and buffer context locks -(LWLocks) to control access to a hash index. lmgr locks are needed for -long-term locking since there is a (small) risk of deadlock, which we must -be able to detect. Buffer context locks are used for short-term access -control to individual pages of the index. - -LockPage(rel, page), where page is the page number of a hash bucket page, -represents the right to split or compact an individual bucket. A process -splitting a bucket must exclusive-lock both old and new halves of the -bucket until it is done. A process doing VACUUM must exclusive-lock the -bucket it is currently purging tuples from. Processes doing scans or -insertions must share-lock the bucket they are scanning or inserting into. -(It is okay to allow concurrent scans and insertions.) - -The lmgr lock IDs corresponding to overflow pages are currently unused. -These are available for possible future refinements. LockPage(rel, 0) -is also currently undefined (it was previously used to represent the right -to modify the hash-code-to-bucket mapping, but it is no longer needed for -that purpose). - -Note that these lock definitions are conceptually distinct from any sort -of lock on the pages whose numbers they share. A process must also obtain -read or write buffer lock on the metapage or bucket page before accessing -said page. - -Processes performing hash index scans must hold share lock on the bucket -they are scanning throughout the scan. This seems to be essential, since -there is no reasonable way for a scan to cope with its bucket being split -underneath it.
This creates a possibility of deadlock external to the -hash index code, since a process holding one of these locks could block -waiting for an unrelated lock held by another process. If that process -then does something that requires exclusive lock on the bucket, we have -deadlock. Therefore the bucket locks must be lmgr locks so that deadlock -can be detected and recovered from. - -Processes must obtain read (share) buffer context lock on any hash index -page while reading it, and write (exclusive) lock while modifying it. -To prevent deadlock we enforce these coding rules: no buffer lock may be -held long term (across index AM calls), nor may any buffer lock be held -while waiting for an lmgr lock, nor may more than one buffer lock -be held at a time by any one process. (The third restriction is probably -stronger than necessary, but it makes the proof of no deadlock obvious.) +Concurrency control for hash indexes is provided using buffer content +locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL, +cleanup lock means that we hold an exclusive lock on the buffer and have +observed at some point after acquiring the lock that we hold the only pin +on that buffer. For hash indexes, a cleanup lock on a primary bucket page +represents the right to perform an arbitrary reorganization of the entire +bucket. Therefore, scans retain a pin on the primary bucket page for the +bucket they are currently scanning. Splitting a bucket requires a cleanup +lock on both the old and new primary bucket pages. VACUUM therefore takes +a cleanup lock on every bucket page in order to remove tuples. It can also +remove tuples copied to a new bucket by any previous split operation, because +the cleanup lock taken on the primary bucket page guarantees that no scans +which started prior to the most recent split can still be in progress. After +cleaning each page individually, it attempts to take a cleanup lock on the +primary bucket page in order to "squeeze" the bucket down to the minimum +possible number of pages. + +To avoid deadlocks, we must be consistent about the lock order in which we +lock the buckets for operations that require locks on two different buckets. +We choose to always lock the lower-numbered bucket first. The metapage is +only ever locked after all bucket locks have been taken. + + +Metapage Caching +---------------- +Both scanning the index and inserting tuples require locating the bucket +where a given tuple ought to be located. To do this, we need the bucket +count, highmask, and lowmask from the metapage; however, it's undesirable +for performance reasons to have to lock and pin the metapage for +every such operation. Instead, we retain a cached copy of the metapage +in each backend's relcache entry. This will produce the correct +bucket mapping as long as the target bucket hasn't been split since the +last cache refresh. + +To guard against the possibility that such a split has occurred, the +primary page of each bucket chain stores the number of buckets that +existed as of the time the bucket was last split, or if never split as +of the time it was created, in the space normally used for the +previous block number (that is, hasho_prevblkno). This doesn't cost +anything because the primary bucket page is always the first page in +the chain, and the previous block number is therefore always, in +reality, InvalidBlockNumber.
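As a minimal sketch of how this might look in C (simplified in the style of the hash AM; the surrounding page and metapage variables are assumed to be set up as during a real split, so this is illustrative, not the actual server code):

    /*
     * Sketch: a primary bucket page has no real previous block, so the
     * hasho_prevblkno field is free to record the bucket count as of the
     * most recent split (here, the metapage's current maximum bucket).
     */
    HashPageOpaque pageopaque;

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    pageopaque->hasho_prevblkno = metap->hashm_maxbucket;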
+ +After computing the ostensibly-correct bucket number based on our cached +copy of the metapage, we lock the corresponding primary bucket page and +check whether the bucket count stored in hasho_prevblkno is greater than +the number of buckets stored in our cached copy of the metapage. If +so, the bucket has certainly been split, because the count must originally +have been less than the number of buckets that existed at that time and +can't have increased except due to a split. If not, the bucket can't have +been split, because a split would have created a new bucket with a higher +bucket number than any we'd seen previously. In the latter case, we've +locked the correct bucket and can proceed; in the former case, we must +release the lock on this bucket, lock the metapage, update our cache, +unlock the metapage, and retry. + +Needing to retry occasionally might seem expensive, but the number of times +any given bucket can be split is limited to a few dozen no matter how +many times the hash index is accessed, because the total number of +buckets is limited to less than 2^32. On the other hand, the number of +times we access a bucket is unbounded and will be several orders of +magnitude larger even in unsympathetic cases. + +(The metapage cache is new in v10. Older hash indexes had the primary +bucket page's hasho_prevblkno initialized to InvalidBlockNumber.) Pseudocode Algorithms --------------------- +Various flags that are used in hash index operations are described below: + +The bucket-being-split and bucket-being-populated flags indicate that a split +operation is in progress for a bucket. During a split operation, the +bucket-being-split flag is set on the old bucket and the bucket-being-populated +flag is set on the new bucket. These flags are cleared once the split operation +is finished. + +The split-cleanup flag indicates that a bucket which has been recently split +still contains tuples that were also copied to the new bucket; it essentially +marks the split as incomplete. Once we're certain that no scans which +started before the new bucket was fully populated are still in progress, we +can remove the copies from the old bucket and clear the flag. We insist that +this flag must be clear before splitting a bucket; thus, a bucket can't be +split again until the previous split is totally complete. + +The moved-by-split flag on a tuple indicates that the tuple was moved from the +old to the new bucket. Concurrent scans will skip such tuples until the split +operation is finished. Once the tuple is marked as moved-by-split, it will +remain so forever, but that does no harm. We have intentionally not cleared +it, as clearing it would generate additional I/O which is not necessary. + The operations we need to support are: readers scanning the index for entries of a particular hash code (which by definition are all in the same bucket); insertion of a new tuple into the correct bucket; enlarging the @@ -187,67 +248,75 @@ track of available overflow pages.
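Before walking through the algorithms, a hedged sketch of the cached-metapage bucket lookup described above may help. The helpers and flags named here exist in the hash AM headers, but the control flow is condensed and the cache variable names are illustrative:

    /*
     * Map the hash key to a bucket using the cached metapage, lock that
     * bucket's primary page, and validate the guess against the bucket
     * count remembered in hasho_prevblkno.
     */
    bucket = _hash_hashkey2bucket(hashkey,
                                  cached_metap->hashm_maxbucket,
                                  cached_metap->hashm_highmask,
                                  cached_metap->hashm_lowmask);
    buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(cached_metap, bucket),
                       HASH_READ, LH_BUCKET_PAGE);
    opaque = (HashPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));
    if (opaque->hasho_prevblkno > cached_metap->hashm_maxbucket)
    {
        /*
         * The bucket has split since the cache was taken: release this
         * bucket, lock the metapage, refresh the cache, and retry.
         */
    }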
The reader algorithm is: - pin meta page and take buffer content lock in shared mode - loop: - compute bucket number for target hash key - release meta page buffer content lock - if (correct bucket page is already locked) - break - release any existing bucket page lock (if a concurrent split happened) - take heavyweight bucket lock - retake meta page buffer content lock in shared mode + lock the primary bucket page of the target bucket + if the target bucket is still being populated by a split: + release the buffer content lock on current bucket page + pin and acquire the buffer content lock on old bucket in shared mode + release the buffer content lock on old bucket, but not pin + retake the buffer content lock on new bucket + arrange to scan the old bucket normally and the new bucket for + tuples which are not moved-by-split -- then, per read request: - release pin on metapage - read current page of bucket and take shared buffer content lock - step to next page if necessary (no chaining of locks) + reacquire content lock on current page + step to next page if necessary (no chaining of content locks, but keep + the pin on the primary bucket throughout the scan; we also maintain + a pin on the page currently being scanned) get tuple - release buffer content lock and pin on current page + release content lock -- at scan shutdown: - release bucket share-lock - -We can't hold the metapage lock while acquiring a lock on the target bucket, -because that might result in an undetected deadlock (lwlocks do not participate -in deadlock detection). Instead, we relock the metapage after acquiring the -bucket page lock and check whether the bucket has been split. If not, we're -done. If so, we release our previously-acquired lock and repeat the process -using the new bucket number. Holding the bucket sharelock for -the remainder of the scan prevents the reader's current-tuple pointer from -being invalidated by splits or compactions. Notice that the reader's lock -does not prevent other buckets from being split or compacted. + release all pins still held + +Holding the buffer pin on the primary bucket page for the whole scan prevents +the reader's current-tuple pointer from being invalidated by splits or +compactions. (Of course, other buckets can still be split or compacted.) To keep concurrency reasonably good, we require readers to cope with concurrent insertions, which means that they have to be able to re-find -their current scan position after re-acquiring the page sharelock. Since -deletion is not possible while a reader holds the bucket sharelock, and -we assume that heap tuple TIDs are unique, this can be implemented by +their current scan position after re-acquiring the buffer content lock on +the page. Since deletion is not possible while a reader holds the pin on the +bucket, and we assume that heap tuple TIDs are unique, this can be implemented by searching for the same heap tuple TID previously returned. Insertion does not move index entries across pages, so the previously-returned index entry should always be on the same page, at the same or higher offset number, as it was before. +To allow for scans during a bucket split: if, at the start of the scan, the +bucket is marked as bucket-being-populated, the scan returns all the tuples in +that bucket except those that are marked as moved-by-split. Once it finishes +the scan of all the tuples in the current bucket, it scans the old bucket from +which this bucket was formed by the split.
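The moved-by-split rule in the reader algorithm can be illustrated with a short sketch (the flag bit and scan-opaque fields are the real ones from the hash AM; the enclosing scan loop is omitted, so this is a fragment, not the actual scan code):

    /*
     * While the bucket being scanned is still being populated by a split,
     * skip tuples that were copied in from the old bucket; the subsequent
     * scan of the old bucket itself will return them.
     */
    if (so->hashso_buc_populated && !so->hashso_buc_split &&
        (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
        continue;               /* try the next offset instead */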
+ The insertion algorithm is rather similar: - pin meta page and take buffer content lock in shared mode - loop: - compute bucket number for target hash key - release meta page buffer content lock - if (correct bucket page is already locked) - break - release any existing bucket page lock (if a concurrent split happened) - take heavyweight bucket lock in shared mode - retake meta page buffer content lock in shared mode --- (so far same as reader) - release pin on metapage - pin current page of bucket and take exclusive buffer content lock - if full, release, read/exclusive-lock next page; repeat as needed + lock the primary bucket page of the target bucket +-- (so far same as reader, except for acquisition of buffer content lock in + exclusive mode on primary bucket page) + if the bucket-being-split flag is set for a bucket and the pin count on it + is one, then finish the split + release the buffer content lock on current bucket + get the "new" bucket which was being populated by the split + scan the new bucket and form the hash table of TIDs + conditionally get the cleanup lock on old and new buckets + if we get the lock on both buckets + finish the split using the split algorithm mentioned below + release the pin on old bucket and restart the insert from the beginning + if the current page is full, first check if this page contains any dead + tuples; if yes, remove the dead tuples and re-check for free space. If + there is still not enough space, release the lock but not the pin, and + read/exclusive-lock the next page; repeat as needed >> see below if no space in any page of bucket + take buffer content lock in exclusive mode on metapage insert tuple at appropriate place in page - mark current page dirty and release buffer content lock and pin - release heavyweight share-lock - pin meta page and take buffer content lock in shared mode + mark current page dirty increment tuple count, decide if split needed - mark meta page dirty and release buffer content lock and pin - done if no split needed, else enter Split algorithm below + mark meta page dirty + write WAL for insertion of tuple + release the buffer content lock on metapage + release buffer content lock on current page + if current page is not a bucket page, release the pin on bucket page + if split is needed, enter Split algorithm below + release the pin on metapage To speed searches, the index entries within any individual index page are kept sorted by hash code; the insertion code must take care to insert new @@ -256,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this as explained above. We only need the short-term buffer locks to ensure that readers do not see a partially-updated page. -It is clearly impossible for readers and inserters to deadlock, and in -fact this algorithm allows them a very high degree of concurrency. -(The exclusive metapage lock taken to update the tuple count is stronger -than necessary, since readers do not care about the tuple count, but the -lock is held for such a short time that this is probably not an issue.) +To avoid deadlock between readers and inserters, whenever there is a need +to lock multiple buckets, we always take locks in the order suggested in Lock +Definitions above. This algorithm allows them a very high degree of +concurrency.
(The exclusive metapage lock taken to update the tuple count +is stronger than necessary, since readers do not care about the tuple count, +but the lock is held for such a short time that this is probably not an +issue.) When an inserter cannot find space in any existing page of a bucket, it must obtain an overflow page and add that page to the bucket's chain. @@ -271,46 +342,47 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets). The algorithm attempts, but does not necessarily succeed, to split one existing bucket in two, thereby lowering the fill ratio: - pin meta page and take buffer content lock in exclusive mode - check split still needed - if split not needed anymore, drop buffer content lock and pin and exit - decide which bucket to split - Attempt to X-lock old bucket number (definitely could fail) - Attempt to X-lock new bucket number (shouldn't fail, but...) - if above fail, drop locks and pin and exit - update meta page to reflect new number of buckets - mark meta page dirty and release buffer content lock and pin - -- now, accesses to all other buckets can proceed. - Perform actual split of bucket, moving tuples as needed - >> see below about acquiring needed extra space - Release X-locks of old and new buckets - -Note the metapage lock is not held while the actual tuple rearrangement is -performed, so accesses to other buckets can proceed in parallel; in fact, -it's possible for multiple bucket splits to proceed in parallel. - -Split's attempt to X-lock the old bucket number could fail if another -process holds S-lock on it. We do not want to wait if that happens, first -because we don't want to wait while holding the metapage exclusive-lock, -and second because it could very easily result in deadlock. (The other -process might be out of the hash AM altogether, and could do something -that blocks on another lock this process holds; so even if the hash -algorithm itself is deadlock-free, a user-induced deadlock could occur.) -So, this is a conditional LockAcquire operation, and if it fails we just -abandon the attempt to split. This is all right since the index is -overfull but perfectly functional. Every subsequent inserter will try to -split, and eventually one will succeed. 
If multiple inserters failed to -split, the index might still be overfull, but eventually, the index will + pin meta page and take buffer content lock in exclusive mode + check split still needed + if split not needed anymore, drop buffer content lock and pin and exit + decide which bucket to split + try to take a cleanup lock on that bucket; if fail, give up + if that bucket is still being split or has split-cleanup work: + try to finish the split and the cleanup work + if that succeeds, start over; if it fails, give up + mark the old and new buckets indicating split is in progress + mark both old and new buckets as dirty + write WAL for allocation of new page for split + copy the tuples that belong to the new bucket from the old bucket, + marking them as moved-by-split + write WAL record for moving tuples to new page once the new page is full + or all the pages of the old bucket have been processed + release lock but not pin for primary bucket page of old bucket, + read/shared-lock next page; repeat as needed + clear the bucket-being-split and bucket-being-populated flags + mark the old bucket indicating split-cleanup + write WAL for changing the flags on both old and new buckets + +The split operation's attempt to acquire a cleanup lock on the old bucket +could fail if another process holds any lock or pin on it. We do not want to +wait if that happens, because we don't want to wait while holding the metapage +exclusive-lock. So, this is a conditional LWLockAcquire operation, and if +it fails we just abandon the attempt to split. This is all right since the +index is overfull but perfectly functional. Every subsequent inserter will +try to split, and eventually one will succeed. If multiple inserters failed +to split, the index might still be overfull, but eventually, the index will not be overfull and split attempts will stop. (We could make a successful splitter loop to see if the index is still overfull, but it seems better to distribute the split overhead across successive insertions.) -A problem is that if a split fails partway through (eg due to insufficient -disk space) the index is left corrupt. The probability of that could be -made quite low if we grab a free page or two before we update the meta -page, but the only real solution is to treat a split as a WAL-loggable, -must-complete action. I'm not planning to teach hash about WAL in this -go-round. +If a split fails partway through (e.g. due to insufficient disk space or an +interrupt), the index will not be corrupted. Instead, we'll retry the split +every time a tuple is inserted into the old bucket prior to inserting the new +tuple; eventually, we should succeed. The fact that a split is left +unfinished doesn't prevent subsequent buckets from being split, but we won't +try to split the bucket again until the prior split is finished. In other +words, a bucket can be in the middle of being split for some time, but it can't +be in the middle of two splits at the same time.
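The conditional lock attempt described above might be sketched as follows. ConditionalLockBufferForCleanup is the real buffer-manager primitive; the surrounding variables and the enclosing function are illustrative:

    /*
     * Try for a cleanup lock on the old bucket's primary page; if any
     * other backend holds a conflicting lock or pin, abandon the split
     * rather than wait while holding the metapage lock.
     */
    obuf = ReadBuffer(rel, start_oblkno);
    if (!ConditionalLockBufferForCleanup(obuf))
    {
        ReleaseBuffer(obuf);
        return;                 /* index stays valid, merely overfull */
    }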
The fourth operation is garbage collection (bulk deletion): @@ -319,31 +391,46 @@ The fourth operation is garbage collection (bulk deletion): fetch current max bucket number release meta page buffer content lock and pin while next bucket <= max bucket do - Acquire X lock on target bucket - Scan and remove tuples, compact free space as needed - Release X lock + acquire cleanup lock on primary bucket page + loop: + scan and remove tuples + mark the target page dirty + write WAL for deleting tuples from target page + if this is the last bucket page, break out of loop + pin and x-lock next page + release prior lock and pin (except keep pin on primary bucket page) + if the page we have locked is not the primary bucket page: + release lock and take exclusive lock on primary bucket page + if there are no other pins on the primary bucket page: + squeeze the bucket to remove free space + release the pin on primary bucket page next bucket ++ end loop pin metapage and take buffer content lock in exclusive mode check if number of buckets changed if so, release content lock and pin and return to for-each-bucket loop else update metapage tuple count - mark meta page dirty and release buffer content lock and pin - -Note that this is designed to allow concurrent splits. If a split occurs, -tuples relocated into the new bucket will be visited twice by the scan, -but that does no harm. (We must however be careful about the statistics + mark meta page dirty and write WAL for update of metapage + release buffer content lock and pin + +Note that this is designed to allow concurrent splits and scans. If a split +occurs, tuples relocated into the new bucket will be visited twice by the +scan, but that does no harm. Because we release the lock on the bucket page +during the cleanup scan of a bucket, a concurrent scan may start on the +bucket, but it is guaranteed always to be behind the cleanup. Scans must be +kept behind cleanup, else vacuum could move index entries that are required to +complete the scan to a position the scan has already passed. Since a scan +that returns multiple tuples from the same bucket page always expects the next +valid TID to be greater than or equal to the current TID, it might then miss +tuples. This holds true for backward scans as well (backward scans also first +traverse each bucket starting from the first bucket to the last overflow page +in the chain). We must be careful about the statistics reported by the VACUUM operation. What we can do is count the number of -tuples scanned, and believe this in preference to the stored tuple count -if the stored tuple count and number of buckets did *not* change at any -time during the scan. This provides a way of correcting the stored tuple -count if it gets out of sync for some reason. But if a split or insertion -does occur concurrently, the scan count is untrustworthy; instead, -subtract the number of tuples deleted from the stored tuple count and -use that.) - -The exclusive lock request could deadlock in some strange scenarios, but -we can just error out without any great harm being done. +tuples scanned, and believe this in preference to the stored tuple count if +the stored tuple count and number of buckets did *not* change at any time +during the scan. This provides a way of correcting the stored tuple count if +it gets out of sync for some reason. But if a split or insertion does occur +concurrently, the scan count is untrustworthy; instead, subtract the number of +tuples deleted from the stored tuple count and use that.
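The statistics rule above can be condensed into a sketch (variable names are illustrative, not the actual hashbulkdelete code):

    /*
     * Trust our own scan count only if neither the bucket count nor the
     * stored tuple count changed while we were scanning; otherwise adjust
     * the stored count by the number of tuples we deleted.
     */
    if (cur_maxbucket == orig_maxbucket && cur_ntuples == orig_ntuples)
        num_index_tuples = tuples_seen_by_scan;
    else
        num_index_tuples = cur_ntuples - tuples_removed;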
Free Space Management @@ -366,18 +453,16 @@ Obtaining an overflow page: search for a free page (zero bit in bitmap) if found: set bit in bitmap - mark bitmap page dirty and release content lock + mark bitmap page dirty take metapage buffer content lock in exclusive mode if first-free-bit value did not change, update it and mark meta page dirty - release meta page buffer content lock - return page number else (not found): release bitmap page buffer content lock loop back to try next bitmap page, if any -- here when we have checked all bitmap pages; we hold meta excl. lock extend index to add another overflow page; update meta information - mark meta page dirty and release buffer content lock + mark meta page dirty return page number It is slightly annoying to release and reacquire the metapage lock @@ -397,12 +482,17 @@ like this: -- having determined that no space is free in the target bucket: remember last page of bucket, drop write lock on it - call free-page-acquire routine re-write-lock last page of bucket if it is not last anymore, step to the last page - update (former) last page to point to new page + execute free-page-acquire (obtaining an overflow page) mechanism + described above + update (former) last page to point to the new page and mark buffer dirty write-lock and initialize new page, with back link to former last page - write and release former last page + write WAL for addition of overflow page + release the locks on meta page and bitmap page acquired in + free-page-acquire algorithm + release the lock on former last page + release the lock on new overflow page insert tuple into new page -- etc. @@ -417,13 +507,11 @@ free page; there can be no other process holding lock on it. Bucket splitting uses a similar algorithm if it has to extend the new bucket, but it need not worry about concurrent extension since it has -exclusive lock on the new bucket. +buffer content lock in exclusive mode on the new bucket. -Freeing an overflow page is done by garbage collection and by bucket -splitting (the old bucket may contain no-longer-needed overflow pages). -In both cases, the process holds exclusive lock on the containing bucket, -so need not worry about other accessors of pages in the bucket. The -algorithm is: +Freeing an overflow page requires the process to hold buffer content lock in +exclusive mode on the containing bucket, so need not worry about other +accessors of pages in the bucket. The algorithm is: delink overflow page from bucket chain (this requires read/update/write/release of fore and aft siblings) @@ -431,12 +519,14 @@ algorithm is: determine which bitmap page contains the free space bit for page release meta page buffer content lock pin bitmap page and take buffer content lock in exclusive mode - update bitmap bit - mark bitmap page dirty and release buffer content lock and pin - if page number is less than what we saw as first-free-bit in meta: retake meta page buffer content lock in exclusive mode + move (insert) tuples that belong to the overflow page being freed + update bitmap bit + mark bitmap page dirty if page number is still less than first-free-bit, update first-free-bit field and mark meta page dirty + write WAL for delinking overflow page operation + release buffer content lock and pin release meta page buffer content lock and pin We have to do it this way because we must clear the bitmap bit before @@ -447,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. 
What must be avoided is having first-free-bit greater than the actual first free bit, because then that free page would never be found by searchers. -All the freespace operations should be called while holding no buffer -locks. Since they need no lmgr locks, deadlock is not possible. +The reason for moving tuples from the overflow page while delinking the +latter is to make that one atomic operation. Not doing so could lead to +spurious reads on a standby. Basically, the user might see the same tuple +twice. + + +WAL Considerations +------------------ + +The hash index operations like create index, insert, delete, bucket split, +allocate overflow page, and squeeze in themselves don't guarantee hash index +consistency after a crash. To provide robustness, we write WAL for each of +these operations. + +CREATE INDEX writes multiple WAL records. First, we write a record to cover +the initialization of the metapage, followed by one for each new bucket +created, followed by one for the initial bitmap page. It's not important for +index creation to appear atomic, because the index isn't yet visible to any +other transaction, and the creating transaction will roll back in the event of +a crash. It would be difficult to cover the whole operation with a single +write-ahead log record anyway, because we can log only a fixed number of +pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery. + +Ordinary item insertions (that don't force a page split or need a new overflow +page) are single WAL entries. They touch a single bucket page and the +metapage. The metapage is updated during replay just as it is updated during +the original operation. + +If an insertion causes the addition of an overflow page, there will be one +WAL entry for the new overflow page and a second entry for the insert itself. + +If an insertion causes a bucket split, there will be one WAL entry for the +insert itself, followed by a WAL entry for allocating a new bucket, followed +by a WAL entry for each overflow bucket page in the new bucket to which tuples +are moved from the old bucket, followed by a WAL entry to indicate that the +split is complete for both old and new buckets. A split operation which +requires overflow pages to complete will also need to write a WAL record for +each new allocation of an overflow page. + +As splitting involves multiple atomic actions, it's possible that the system +crashes while moving tuples from the bucket pages of the old bucket to the new +bucket. In such a case, after recovery, the old and new buckets will be +marked with the bucket-being-split and bucket-being-populated flags +respectively, which indicates that a split is in progress for those buckets. +The reader algorithm works correctly, as it will scan both the old and new +buckets when the split is in progress, as explained in the reader algorithm +section above. + +We finish the split at the next insert or split operation on the old bucket, +as explained in the insert and split algorithms above. It could be done during +searches, too, but it seems best not to put any extra updates in what would +otherwise be a read-only operation (updating is not possible in hot standby +mode anyway). It would seem natural to complete the split in VACUUM, but since +splitting a bucket might require allocating a new page, it might fail if you +run out of disk space. That would be bad during VACUUM - the reason for +running VACUUM in the first place might be that you run out of disk space, +and now VACUUM won't finish because you're out of disk space.
In contrast, +an insertion can require enlarging the physical file anyway. + +Deletion of tuples from a bucket is performed for two reasons: to remove dead +tuples, and to remove tuples that were moved by a bucket split. A WAL entry +is made for each bucket page from which tuples are removed, and then another +WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples +are removed, a separate WAL entry is made to update the metapage. + +As deletion involves multiple atomic operations, it is quite possible that the +system crashes (a) after removing tuples from some of the bucket pages, (b) +before clearing the garbage flag, or (c) before updating the metapage. If the +system crashes before completing (b), the bucket will be cleaned again during +the next vacuum or insert after recovery, which can have some performance +impact, but it will work fine. If the system crashes before completing (c), +after recovery there could be some additional splits until the next vacuum +updates the metapage, but the other operations like insert, delete and scan +will work correctly. We could fix this problem by actually updating the +metapage based on the delete operation during replay, but it's not clear +whether it's worth the complication. + +A squeeze operation moves tuples from one of the bucket pages later in the +chain to one earlier in the chain, and writes a WAL record when either the +page to which it is writing tuples is filled or the page from which it is +removing tuples becomes empty. + +As a squeeze operation involves multiple atomic operations, it is quite +possible that the system crashes before completing the operation on the +entire bucket. After recovery, the operations will work correctly, but the +index will remain bloated, and this can impact the performance of read and +insert operations until the next vacuum squeezes the bucket completely. Other Notes ----------- -All the shenanigans with locking prevent a split occurring while *another* -process is stopped in a given bucket. They do not ensure that one of -our *own* backend's scans is not stopped in the bucket, because lmgr -doesn't consider a process's own locks to conflict. So the Split -algorithm must check for that case separately before deciding it can go -ahead with the split. VACUUM does not have this problem since nothing -else can be happening within the vacuuming backend. - -Should we instead try to fix the state of any conflicting local scan? -Seems mighty ugly --- got to move the held bucket S-lock as well as lots -of other messiness. For now, just punt and don't split. +Cleanup locks prevent a split from occurring while *another* process is +stopped in a given bucket. They also ensure that one of our *own* backend's +scans is not stopped in the bucket. diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 07496f8156..8a3297924f 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -3,7 +3,7 @@ * hash.c * Implementation of Margo Seltzer's Hashing package for postgres.
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -19,13 +19,16 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "access/relscan.h" #include "catalog/index.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "optimizer/plancat.h" +#include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "miscadmin.h" /* Working state for hashbuild and its callback */ @@ -33,6 +36,7 @@ typedef struct { HSpool *spool; /* NULL if not using spooling */ double indtuples; /* # tuples accepted into index */ + Relation heapRel; /* heap relation descriptor */ } HashBuildState; static void hashbuildCallback(Relation index, @@ -65,6 +69,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amstorage = false; amroutine->amclusterable = false; amroutine->ampredlocks = false; + amroutine->amcanparallel = false; amroutine->amkeytype = INT4OID; amroutine->ambuild = hashbuild; @@ -84,6 +89,9 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amendscan = hashendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; PG_RETURN_POINTER(amroutine); } @@ -114,7 +122,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); /* Initialize the hash index metadata page and initial buckets */ - num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); + num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then @@ -147,6 +155,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* prepare to build the index */ buildstate.indtuples = 0; + buildstate.heapRel = heap; /* do the heap scan */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, @@ -155,7 +164,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) if (buildstate.spool) { /* sort the tuples and insert them into the index */ - _h_indexbuild(buildstate.spool); + _h_indexbuild(buildstate.spool, buildstate.heapRel); _h_spooldestroy(buildstate.spool); } @@ -176,7 +185,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) void hashbuildempty(Relation index) { - _hash_metapinit(index, 0, INIT_FORKNUM); + _hash_init(index, 0, INIT_FORKNUM); } /* @@ -211,7 +220,7 @@ hashbuildCallback(Relation index, itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull); itup->t_tid = htup->t_self; - _hash_doinsert(index, itup); + _hash_doinsert(index, itup, buildstate->heapRel); pfree(itup); } @@ -227,7 +236,8 @@ hashbuildCallback(Relation index, bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { Datum index_values[1]; bool index_isnull[1]; @@ -243,7 +253,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull, itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); itup->t_tid = *ht_ctid; - _hash_doinsert(rel, itup); + _hash_doinsert(rel, itup, heapRel); pfree(itup); @@ -273,7 +283,7 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) * Reacquire the read lock here. 
 */ if (BufferIsValid(so->hashso_curbuf)) - _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); /* * If we've already initialized this scan, we can just advance it in the @@ -286,16 +296,21 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) /* * An insertion into the current index page could have happened while * we didn't have read lock on it. Re-find our position by looking - * for the TID we previously returned. (Because we hold share lock on - * the bucket, no deletions or splits could have occurred; therefore - * we can expect that the TID still exists in the current index page, - * at an offset >= where we were.) + * for the TID we previously returned. (Because we hold a pin on the + * primary bucket page, no deletions or splits could have occurred; + * therefore we can expect that the TID still exists in the current + * index page, at an offset >= where we were.) */ OffsetNumber maxoffnum; buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); + + /* + * We don't need to test for an old snapshot here, as the current + * buffer is pinned, so vacuum can't clean the page. + */ maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; @@ -318,14 +333,24 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) if (scan->kill_prior_tuple) { /* - * Yes, so mark it by setting the LP_DEAD state in the item flags. + * Yes, so remember it for later. (We'll deal with all such tuples + * at once right after leaving the index page or at end of scan.) + * If the caller reverses the indexscan direction, it is quite + * possible that the same item might get entered multiple times. + * But we don't detect that; instead, we just forget any excess + * entries. */ - ItemIdMarkDead(PageGetItemId(page, offnum)); + if (so->killedItems == NULL) + so->killedItems = palloc(MaxIndexTuplesPerPage * + sizeof(HashScanPosItem)); - /* - * Since this can be redone later if needed, mark as a hint.
- */ - MarkBufferDirtyHint(buf, true); + if (so->numKilled < MaxIndexTuplesPerPage) + { + so->killedItems[so->numKilled].heapTid = so->hashso_heappos; + so->killedItems[so->numKilled].indexOffset = + ItemPointerGetOffsetNumber(&(so->hashso_curpos)); + so->numKilled++; + } } /* @@ -353,7 +378,7 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) /* Release read lock on current buffer, but keep it pinned */ if (BufferIsValid(so->hashso_curbuf)) - _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); /* Return current heap TID on success */ scan->xs_ctup.t_self = so->hashso_heappos; @@ -423,17 +448,20 @@ hashbeginscan(Relation rel, int nkeys, int norderbys) scan = RelationGetIndexScan(rel, nkeys, norderbys); so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); - so->hashso_bucket_valid = false; - so->hashso_bucket_blkno = 0; so->hashso_curbuf = InvalidBuffer; + so->hashso_bucket_buf = InvalidBuffer; + so->hashso_split_bucket_buf = InvalidBuffer; /* set position invalid (this will cause _hash_first call) */ ItemPointerSetInvalid(&(so->hashso_curpos)); ItemPointerSetInvalid(&(so->hashso_heappos)); - scan->opaque = so; + so->hashso_buc_populated = false; + so->hashso_buc_split = false; - /* register scan in case we change pages it's using */ - _hash_regscan(scan); + so->killedItems = NULL; + so->numKilled = 0; + + scan->opaque = so; return scan; } @@ -448,15 +476,18 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - /* release any pin we still hold */ - if (BufferIsValid(so->hashso_curbuf)) - _hash_dropbuf(rel, so->hashso_curbuf); - so->hashso_curbuf = InvalidBuffer; + /* + * Before leaving current page, deal with any killed items. Also, ensure + * that we acquire lock on current page before calling _hash_kill_items. + */ + if (so->numKilled > 0) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + _hash_kill_items(scan); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + } - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; + _hash_dropscanbuf(rel, so); /* set position invalid (this will cause _hash_first call) */ ItemPointerSetInvalid(&(so->hashso_curpos)); @@ -468,8 +499,10 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, memmove(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); - so->hashso_bucket_valid = false; } + + so->hashso_buc_populated = false; + so->hashso_buc_split = false; } /* @@ -481,19 +514,21 @@ hashendscan(IndexScanDesc scan) HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - /* don't need scan registered anymore */ - _hash_dropscan(scan); - - /* release any pin we still hold */ - if (BufferIsValid(so->hashso_curbuf)) - _hash_dropbuf(rel, so->hashso_curbuf); - so->hashso_curbuf = InvalidBuffer; + /* + * Before leaving current page, deal with any killed items. Also, ensure + * that we acquire lock on current page before calling _hash_kill_items. 
+ */ + if (so->numKilled > 0) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + _hash_kill_items(scan); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + } - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; + _hash_dropscanbuf(rel, so); + if (so->killedItems != NULL) + pfree(so->killedItems); pfree(so); scan->opaque = NULL; } @@ -503,6 +538,9 @@ hashendscan(IndexScanDesc scan) * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * + * This function also deletes the tuples that were moved by a split to + * another bucket. + * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * @@ -516,27 +554,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; - Buffer metabuf; + Buffer metabuf = InvalidBuffer; HashMetaPage metap; - HashMetaPageData local_metapage; + HashMetaPage cachedmetap; tuples_removed = 0; num_index_tuples = 0; /* - * Read the metapage to fetch original bucket and tuple counts. Also, we - * keep a copy of the last-seen metapage so that we can use its - * hashm_spares[] values to compute bucket page addresses. This is a bit - * hokey but perfectly safe, since the interesting entries in the spares - * array cannot change under us; and it beats rereading the metapage for - * each bucket. + * We need a copy of the metapage so that we can use its hashm_spares[] + * values to compute bucket page addresses, but a cached copy should be + * good enough. (If not, we'll detect that further down and refresh the + * cache as necessary.) */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = HashPageGetMeta(BufferGetPage(metabuf)); - orig_maxbucket = metap->hashm_maxbucket; - orig_ntuples = metap->hashm_ntuples; - memcpy(&local_metapage, metap, sizeof(local_metapage)); - _hash_relbuf(rel, metabuf); + cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); + Assert(cachedmetap != NULL); + + orig_maxbucket = cachedmetap->hashm_maxbucket; + orig_ntuples = cachedmetap->hashm_ntuples; /* Scan the buckets that we know exist */ cur_bucket = 0; @@ -547,102 +582,89 @@ loop_top: { BlockNumber bucket_blkno; BlockNumber blkno; - bool bucket_dirty = false; + Buffer bucket_buf; + Buffer buf; + HashPageOpaque bucket_opaque; + Page page; + bool split_cleanup = false; /* Get address of bucket's start page */ - bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); + bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); - /* Exclusive-lock the bucket so we can shrink it */ - _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); + blkno = bucket_blkno; - /* Shouldn't have any active scans locally, either */ - if (_hash_has_active_scan(rel, cur_bucket)) - elog(ERROR, "hash index has active scan during VACUUM"); + /* + * We need to acquire a cleanup lock on the primary bucket page to wait + * out concurrent scans before deleting the dead tuples.
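+ * (LockBufferForCleanup grants the exclusive lock only once we hold the + * sole pin on the buffer, so any scan still paused on this page must + * move on first.)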
+ */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); + LockBufferForCleanup(buf); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); - /* Scan each page in bucket */ - blkno = bucket_blkno; - while (BlockNumberIsValid(blkno)) - { - Buffer buf; - Page page; - HashPageOpaque opaque; - OffsetNumber offno; - OffsetNumber maxoffno; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; - - vacuum_delay_point(); - - buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, - LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, - info->strategy); - page = BufferGetPage(buf); - opaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(opaque->hasho_bucket == cur_bucket); - - /* Scan each tuple in page */ - maxoffno = PageGetMaxOffsetNumber(page); - for (offno = FirstOffsetNumber; - offno <= maxoffno; - offno = OffsetNumberNext(offno)) - { - IndexTuple itup; - ItemPointer htup; + page = BufferGetPage(buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); - itup = (IndexTuple) PageGetItem(page, - PageGetItemId(page, offno)); - htup = &(itup->t_tid); - if (callback(htup, callback_state)) - { - /* mark the item for deletion */ - deletable[ndeletable++] = offno; - tuples_removed += 1; - } - else - num_index_tuples += 1; - } + /* + * If the bucket contains tuples that were moved by a split, then we + * need to delete those tuples. We can't delete them while a split of + * the bucket is still in progress, as they are needed by scans. + */ + if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && + H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) + { + split_cleanup = true; /* + * This bucket might have been split since we last held a lock on + * the metapage. If so, hashm_maxbucket, hashm_highmask and + * hashm_lowmask might be old enough to cause us to fail to remove + * tuples left behind by the most recent split. To prevent that, + * now that the primary page of the target bucket has been locked + * (and thus can't be further split), check whether we need to + * update our cached metapage data.
*/ - blkno = opaque->hasho_nextblkno; - - if (ndeletable > 0) + Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); + if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) { - PageIndexMultiDelete(page, deletable, ndeletable); - _hash_wrtbuf(rel, buf); - bucket_dirty = true; + cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(cachedmetap != NULL); } - else - _hash_relbuf(rel, buf); } - /* If we deleted anything, try to compact free space */ - if (bucket_dirty) - _hash_squeezebucket(rel, cur_bucket, bucket_blkno, - info->strategy); + bucket_buf = buf; + + hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, + cachedmetap->hashm_maxbucket, + cachedmetap->hashm_highmask, + cachedmetap->hashm_lowmask, &tuples_removed, + &num_index_tuples, split_cleanup, + callback, callback_state); - /* Release bucket lock */ - _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); + _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } + if (BufferIsInvalid(metabuf)) + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); + /* Write-lock metapage and check for split since we started */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ - cur_maxbucket = metap->hashm_maxbucket; - memcpy(&local_metapage, metap, sizeof(local_metapage)); - _hash_relbuf(rel, metabuf); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(cachedmetap != NULL); + cur_maxbucket = cachedmetap->hashm_maxbucket; goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ + START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) @@ -668,7 +690,28 @@ loop_top: num_index_tuples = metap->hashm_ntuples; } - _hash_wrtbuf(rel, metabuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_update_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.ntuples = metap->hashm_ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); + + XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + _hash_relbuf(rel, metabuf); /* return statistics */ if (stats == NULL) @@ -704,9 +747,262 @@ hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) return stats; } - +/* + * Helper function to perform deletion of index entries from a bucket. + * + * This function expects that the caller has acquired a cleanup lock on the + * primary bucket page, and will return with a write lock again held on the + * primary bucket page. The lock won't necessarily be held continuously, + * though, because we'll release it when visiting overflow pages. + * + * It would be very bad if this function cleaned a page while some other + * backend was in the midst of scanning it, because hashgettuple assumes + * that the next valid TID will be greater than or equal to the current + * valid TID. There can't be any concurrent scans in progress when we first + * enter this function because of the cleanup lock we hold on the primary + * bucket page, but as soon as we release that lock, there might be. 
We + * handle that by conspiring to prevent those scans from passing our cleanup + * scan. To do that, we lock the next page in the bucket chain before + * releasing the lock on the previous page. (This type of lock chaining is + * not ideal, so we might want to look for a better solution at some point.) + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. + */ void -hash_redo(XLogReaderState *record) +hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, + BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, + uint32 maxbucket, uint32 highmask, uint32 lowmask, + double *tuples_removed, double *num_index_tuples, + bool split_cleanup, + IndexBulkDeleteCallback callback, void *callback_state) { - elog(PANIC, "hash_redo: unimplemented"); + BlockNumber blkno; + Buffer buf; + Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; + bool bucket_dirty = false; + + blkno = bucket_blkno; + buf = bucket_buf; + + if (split_cleanup) + new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, + lowmask, maxbucket); + + /* Scan each page in bucket */ + for (;;) + { + HashPageOpaque opaque; + OffsetNumber offno; + OffsetNumber maxoffno; + Buffer next_buf; + Page page; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + bool retain_pin = false; + bool clear_dead_marking = false; + + vacuum_delay_point(); + + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* Scan each tuple in page */ + maxoffno = PageGetMaxOffsetNumber(page); + for (offno = FirstOffsetNumber; + offno <= maxoffno; + offno = OffsetNumberNext(offno)) + { + ItemPointer htup; + IndexTuple itup; + Bucket bucket; + bool kill_tuple = false; + + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, offno)); + htup = &(itup->t_tid); + + /* + * To remove the dead tuples, we strictly want to rely on the + * results of the callback function. Refer to btvacuumpage for the + * detailed reason. + */ + if (callback && callback(htup, callback_state)) + { + kill_tuple = true; + if (tuples_removed) + *tuples_removed += 1; + } + else if (split_cleanup) + { + /* delete the tuples that were moved by a split. */ + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + maxbucket, + highmask, + lowmask); + /* mark the item for deletion */ + if (bucket != cur_bucket) + { + /* + * We expect tuples to belong to either the current bucket + * or new_bucket. This is ensured because we don't allow + * further splits from a bucket that contains garbage. See + * comments in _hash_expandtable. + */ + Assert(bucket == new_bucket); + kill_tuple = true; + } + } + + if (kill_tuple) + { + /* mark the item for deletion */ + deletable[ndeletable++] = offno; + } + else + { + /* we're keeping it, so count it */ + if (num_index_tuples) + *num_index_tuples += 1; + } + } + + /* retain the pin on primary bucket page till end of bucket scan */ + if (blkno == bucket_blkno) + retain_pin = true; + else + retain_pin = false; + + blkno = opaque->hasho_nextblkno; + + /* + * Apply deletions, advance to next page and write page if needed. + */ + if (ndeletable > 0) + { + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + bucket_dirty = true; + + /* + * Mark the page as clean if vacuum removed the DEAD tuples from + * the index page. We do this by clearing the + * LH_PAGE_HAS_DEAD_TUPLES flag.
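+ * The flag clearing rides along in the XLOG_HASH_DELETE record via + * clear_dead_marking, so replay keeps the flag consistent with the + * deletions.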
+ */ + if (tuples_removed && *tuples_removed > 0 && + H_HAS_DEAD_TUPLES(opaque)) + { + opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + clear_dead_marking = true; + } + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_delete xlrec; + XLogRecPtr recptr; + + xlrec.clear_dead_marking = clear_dead_marking; + xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashDelete); + + /* + * The bucket buffer needs to be registered to ensure that we can + * acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_primary_bucket_page) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); + PageSetLSN(BufferGetPage(buf), recptr); + } + + END_CRIT_SECTION(); + } + + /* Bail out if there are no more pages to scan. */ + if (!BlockNumberIsValid(blkno)) + break; + + next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + /* + * Release the lock on the previous page only after acquiring the lock + * on the next page. + */ + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + buf = next_buf; + } + + /* + * Lock the bucket page to clear the garbage flag and squeeze the bucket. + * If the current buffer is the same as the bucket buffer, then we already + * have a lock on the bucket page. + */ + if (buf != bucket_buf) + { + _hash_relbuf(rel, buf); + LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Clear the garbage flag from the bucket after deleting the tuples that + * were moved by a split. We purposely clear the flag before squeezing the + * bucket, so that after a restart, vacuum won't again try to delete the + * moved-by-split tuples. + */ + if (split_cleanup) + { + HashPageOpaque bucket_opaque; + Page page; + + page = BufferGetPage(bucket_buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + MarkBufferDirty(bucket_buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + /* + * If we have deleted anything, try to compact free space. For squeezing + * the bucket, we must have a cleanup lock; otherwise it can affect the + * ordering of tuples seen by a scan that started before it. + */ + if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) + _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, + bstrategy); + else + LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); } diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c new file mode 100644 index 0000000000..0ea11b2e74 --- /dev/null +++ b/src/backend/access/hash/hash_xlog.c @@ -0,0 +1,1270 @@ +/*------------------------------------------------------------------------- + * + * hash_xlog.c + * WAL replay logic for hash index.
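+ * (Each WAL record type emitted by the hash AM has a matching replay + * routine below, dispatched from hash_redo() at the bottom of this file.)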
+ * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/hash/hash_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam_xlog.h" +#include "access/bufmask.h" +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/xlogutils.h" +#include "access/xlog.h" +#include "access/transam.h" +#include "storage/procarray.h" +#include "miscadmin.h" + +/* + * replay a hash index meta page + */ +static void +hash_xlog_init_meta_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Page page; + Buffer metabuf; + + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); + + /* create the index' metapage */ + metabuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(metabuf)); + _hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid, + xlrec->ffactor, true); + page = (Page) BufferGetPage(metabuf); + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + /* all done */ + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index bitmap page + */ +static void +hash_xlog_init_bitmap_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer bitmapbuf; + Buffer metabuf; + Page page; + HashMetaPage metap; + uint32 num_buckets; + + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); + + /* + * Initialize bitmap page + */ + bitmapbuf = XLogInitBufferForRedo(record, 0); + _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); + PageSetLSN(BufferGetPage(bitmapbuf), lsn); + MarkBufferDirty(bitmapbuf); + UnlockReleaseBuffer(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the bitmap page. But during replay it's not + * necessary to hold that lock, since nobody can see it yet; the + * creating transaction hasn't yet committed. + */ + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + num_buckets = metap->hashm_maxbucket + 1; + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + metap->hashm_nmaps++; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index insert without split + */ +static void +hash_xlog_insert(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + page = BufferGetPage(buffer); + + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "hash_xlog_insert: failed to add item"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the page we inserted into. 
But during replay it's + * not necessary to hold that lock, since no other index updates can + * be happening concurrently. + */ + page = BufferGetPage(buffer); + metap = HashPageGetMeta(page); + metap->hashm_ntuples += 1; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay addition of overflow page for hash index + */ +static void +hash_xlog_add_ovfl_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); + Buffer leftbuf; + Buffer ovflbuf; + Buffer metabuf; + BlockNumber leftblk; + BlockNumber rightblk; + BlockNumber newmapblk = InvalidBlockNumber; + Page ovflpage; + HashPageOpaque ovflopaque; + uint32 *num_bucket; + char *data; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + bool new_bmpage = false; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); + XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); + + ovflbuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(ovflbuf)); + + data = XLogRecGetBlockData(record, 0, &datalen); + num_bucket = (uint32 *) data; + Assert(datalen == sizeof(uint32)); + _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, + true); + /* update backlink */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = leftblk; + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) + { + Page leftpage; + HashPageOpaque leftopaque; + + leftpage = BufferGetPage(leftbuf); + leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); + leftopaque->hasho_nextblkno = rightblk; + + PageSetLSN(leftpage, lsn); + MarkBufferDirty(leftbuf); + } + + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + UnlockReleaseBuffer(ovflbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the overflow pages. But during replay it's not + * necessary to hold those locks, since no other index updates can be + * happening concurrently. 
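+ * (Only the startup process applies WAL records, so no other index + * updates can be in progress during replay.)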
+ */ + if (XLogRecHasBlockRef(record, 2)) + { + Buffer mapbuffer; + + if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuffer); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 2, &datalen); + bitmap_page_bit = (uint32 *) data; + + SETBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuffer); + } + if (BufferIsValid(mapbuffer)) + UnlockReleaseBuffer(mapbuffer); + } + + if (XLogRecHasBlockRef(record, 3)) + { + Buffer newmapbuf; + + newmapbuf = XLogInitBufferForRedo(record, 3); + + _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); + + new_bmpage = true; + newmapblk = BufferGetBlockNumber(newmapbuf); + + MarkBufferDirty(newmapbuf); + PageSetLSN(BufferGetPage(newmapbuf), lsn); + + UnlockReleaseBuffer(newmapbuf); + } + + if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + uint32 *firstfree_ovflpage; + + data = XLogRecGetBlockData(record, 4, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + if (!xlrec->bmpage_found) + { + metap->hashm_spares[metap->hashm_ovflpoint]++; + + if (new_bmpage) + { + Assert(BlockNumberIsValid(newmapblk)); + + metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; + metap->hashm_nmaps++; + metap->hashm_spares[metap->hashm_ovflpoint]++; + } + } + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay allocation of page for split operation + */ +static void +hash_xlog_split_allocate_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + Buffer metabuf; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + char *data; + XLogRedoAction action; + + /* + * To be consistent with normal operation, here we take cleanup locks on + * both the old and new buckets even though there can't be any concurrent + * inserts. + */ + + /* replay the record for old bucket */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the special space is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + oldopaque->hasho_prevblkno = xlrec->new_bucket; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + + /* replay the record for new bucket */ + newbuf = XLogInitBufferForRedo(record, 1); + _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, + xlrec->new_bucket_flag, true); + if (!IsBufferCleanupOK(newbuf)) + elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); + MarkBufferDirty(newbuf); + PageSetLSN(BufferGetPage(newbuf), lsn); + + /* + * We could release the lock on the old bucket earlier, but we keep it + * until here to be consistent with normal operation.
+ */ + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); + + /* + * Note: in normal operation, we'd update the meta page while still + * holding lock on the old and new bucket pages. But during replay it's + * not necessary to hold those locks, since no other bucket splits can be + * happening concurrently. + */ + + /* replay the record for metapage changes */ + if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) + { + Page page; + HashMetaPage metap; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_maxbucket = xlrec->new_bucket; + + data = XLogRecGetBlockData(record, 2, &datalen); + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) + { + uint32 lowmask; + uint32 *highmask; + + /* extract low and high masks. */ + memcpy(&lowmask, data, sizeof(uint32)); + highmask = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_lowmask = lowmask; + metap->hashm_highmask = *highmask; + + data += sizeof(uint32) * 2; + } + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) + { + uint32 ovflpoint; + uint32 *ovflpages; + + /* extract information of overflow pages. */ + memcpy(&ovflpoint, data, sizeof(uint32)); + ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_spares[ovflpoint] = *ovflpages; + metap->hashm_ovflpoint = ovflpoint; + } + + MarkBufferDirty(metabuf); + PageSetLSN(BufferGetPage(metabuf), lsn); + } + + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay of split operation + */ +static void +hash_xlog_split_page(XLogReaderState *record) +{ + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED) + elog(ERROR, "Hash split record did not contain a full-page image"); + + UnlockReleaseBuffer(buf); +} + +/* + * replay completion of split operation + */ +static void +hash_xlog_split_complete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + XLogRedoAction action; + + /* replay the record for old bucket */ + action = XLogReadBufferForRedo(record, 0, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + + /* replay the record for new bucket */ + action = XLogReadBufferForRedo(record, 1, &newbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. 
+ */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page newpage; + HashPageOpaque nopaque; + + newpage = BufferGetPage(newbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->hasho_flag = xlrec->new_bucket_flag; + + PageSetLSN(newpage, lsn); + MarkBufferDirty(newbuf); + } + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); +} + +/* + * replay move of page contents for squeeze operation of hash index + */ +static void +hash_xlog_move_page_contents(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf = InvalidBuffer; + Buffer deletebuf = InvalidBuffer; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for deleting entries from overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) + { + Page page; + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 2, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + + /* + * Replay is complete, now we can release the buffers. We release locks at + * end of replay operation to ensure that we hold lock on primary bucket + * page till end of operation. 
We could optimize by releasing the lock on + * the write buffer as soon as the operation on it is complete, if it is + * not the same as the primary bucket page, but that doesn't seem to be + * worth complicating the code. + */ + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay squeeze page operation of hash index + */ +static void +hash_xlog_squeeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf; + Buffer ovflbuf; + Buffer prevbuf = InvalidBuffer; + Buffer mapbuf; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + /* + * If the page to which we are adding tuples is the page previous to + * the freed overflow page, then update its nextblkno.
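+ * (is_prev_bucket_same_wrt indicates that the write page doubles as the + * page preceding the freed overflow page, in which case no separate + * buffer is registered for it; see the block 3 handling below.)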
+ */ + if (xldata->is_prev_bucket_same_wrt) + { + HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); + + writeopaque->hasho_nextblkno = xldata->nextblkno; + } + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for initializing overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) + { + Page ovflpage; + HashPageOpaque ovflopaque; + + ovflpage = BufferGetPage(ovflbuf); + + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + } + if (BufferIsValid(ovflbuf)) + UnlockReleaseBuffer(ovflbuf); + + /* replay the record for page previous to the freed overflow page */ + if (!xldata->is_prev_bucket_same_wrt && + XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + prevopaque->hasho_nextblkno = xldata->nextblkno; + + PageSetLSN(prevpage, lsn); + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(prevbuf)) + UnlockReleaseBuffer(prevbuf); + + /* replay the record for page next to the freed overflow page */ + if (XLogRecHasBlockRef(record, 4)) + { + Buffer nextbuf; + + if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + nextopaque->hasho_prevblkno = xldata->prevblkno; + + PageSetLSN(nextpage, lsn); + MarkBufferDirty(nextbuf); + } + if (BufferIsValid(nextbuf)) + UnlockReleaseBuffer(nextbuf); + } + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the primary bucket page and overflow pages. But + * during replay it's not necessary to hold those locks, since no other + * index updates can be happening concurrently. 
+ */ + /* replay the record for bitmap page */ + if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuf); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + Size datalen; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 5, &datalen); + bitmap_page_bit = (uint32 *) data; + + CLRBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuf); + } + if (BufferIsValid(mapbuf)) + UnlockReleaseBuffer(mapbuf); + + /* replay the record for meta page */ + if (XLogRecHasBlockRef(record, 6)) + { + Buffer metabuf; + + if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + char *data; + uint32 *firstfree_ovflpage; + Size datalen; + + data = XLogRecGetBlockData(record, 6, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); + } +} + +/* + * replay delete operation of hash index + */ +static void +hash_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer deletebuf; + Page page; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_primary_bucket_page) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &deletebuf); + } + + /* replay the record for deleting entries in bucket page */ + if (action == BLK_NEEDS_REDO) + { + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 1, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + /* + * Mark the page as not containing any LP_DEAD items only if + * clear_dead_marking flag is set to true. See comments in + * hashbucketcleanup() for details. + */ + if (xldata->clear_dead_marking) + { + HashPageOpaque pageopaque; + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay split cleanup flag operation for primary bucket page. 
+ */ +static void +hash_xlog_split_cleanup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + HashPageOpaque bucket_opaque; + + page = (Page) BufferGetPage(buffer); + + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay for update meta page + */ +static void +hash_xlog_update_meta_page(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record); + Buffer metabuf; + Page page; + + if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) + { + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + metap->hashm_ntuples = xldata->ntuples; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * Get the latestRemovedXid from the heap pages pointed at by the index + * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid, + * on which this function is based. + */ +static TransactionId +hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record) +{ + xl_hash_vacuum_one_page *xlrec; + OffsetNumber *unused; + Buffer ibuffer, + hbuffer; + Page ipage, + hpage; + RelFileNode rnode; + BlockNumber blkno; + ItemId iitemid, + hitemid; + IndexTuple itup; + HeapTupleHeader htuphdr; + BlockNumber hblkno; + OffsetNumber hoffnum; + TransactionId latestRemovedXid = InvalidTransactionId; + int i; + + xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record); + + /* + * If there's nothing running on the standby we don't need to derive a + * full latestRemovedXid value, so use a fast path out of here. This + * returns InvalidTransactionId, and so will conflict with all HS + * transactions; but since we just worked out that that's zero people, + * it's OK. + * + * XXX There is a race condition here, which is that a new backend might + * start just after we look. If so, it cannot need to conflict, but this + * coding will result in throwing a conflict anyway. + */ + if (CountDBBackends(InvalidOid) == 0) + return latestRemovedXid; + + /* + * Check if WAL replay has reached a consistent database state. If not, we + * must PANIC. See the definition of + * btree_xlog_delete_get_latestRemovedXid for more details. + */ + if (!reachedConsistency) + elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data"); + + /* + * Get index page. If the DB is consistent, this should not fail, nor + * should any of the heap page fetches below. If one does, we return + * InvalidTransactionId to cancel all HS transactions. That's probably + * overkill, but it's safe, and certainly better than panicking here. + */ + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); + + if (!BufferIsValid(ibuffer)) + return InvalidTransactionId; + LockBuffer(ibuffer, HASH_READ); + ipage = (Page) BufferGetPage(ibuffer); + + /* + * Loop through the deleted index items to obtain the TransactionId from + * the heap items they point to. 
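+ * (For each index tuple we fetch the referenced heap page and let + * HeapTupleHeaderAdvanceLatestRemovedXid() fold that tuple's xids into + * the running latestRemovedXid.)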
+ */ + unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage); + + for (i = 0; i < xlrec->ntuples; i++) + { + /* + * Identify the index tuple about to be deleted. + */ + iitemid = PageGetItemId(ipage, unused[i]); + itup = (IndexTuple) PageGetItem(ipage, iitemid); + + /* + * Locate the heap page that the index tuple points at + */ + hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, + hblkno, RBM_NORMAL); + + if (!BufferIsValid(hbuffer)) + { + UnlockReleaseBuffer(ibuffer); + return InvalidTransactionId; + } + LockBuffer(hbuffer, HASH_READ); + hpage = (Page) BufferGetPage(hbuffer); + + /* + * Look up the heap tuple header that the index tuple points at by + * using the heap node supplied with the xlrec. We can't use + * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. + * Note that we are not looking at tuple data here, just headers. + */ + hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid)); + hitemid = PageGetItemId(hpage, hoffnum); + + /* + * Follow any redirections until we find something useful. + */ + while (ItemIdIsRedirected(hitemid)) + { + hoffnum = ItemIdGetRedirect(hitemid); + hitemid = PageGetItemId(hpage, hoffnum); + CHECK_FOR_INTERRUPTS(); + } + + /* + * If the heap item has storage, then read the header and use that to + * set latestRemovedXid. + * + * Some LP_DEAD items may not be accessible, so we ignore them. + */ + if (ItemIdHasStorage(hitemid)) + { + htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); + HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); + } + else if (ItemIdIsDead(hitemid)) + { + /* + * Conjecture: if hitemid is dead then it had xids before the xids + * marked on LP_NORMAL items. So we just ignore this item and move + * onto the next, for the purposes of calculating + * latestRemovedxids. + */ + } + else + Assert(!ItemIdIsUsed(hitemid)); + + UnlockReleaseBuffer(hbuffer); + } + + UnlockReleaseBuffer(ibuffer); + + /* + * If all heap tuples were LP_DEAD then we will be returning + * InvalidTransactionId here, which avoids conflicts. This matches + * existing logic which assumes that LP_DEAD tuples must already be older + * than the latestRemovedXid on the cleanup record that set them as + * LP_DEAD, hence must already have generated a conflict. + */ + return latestRemovedXid; +} + +/* + * replay delete operation in hash index to remove + * tuples marked as DEAD during index tuple insertion. + */ +static void +hash_xlog_vacuum_one_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_vacuum_one_page *xldata; + Buffer buffer; + Buffer metabuf; + Page page; + XLogRedoAction action; + HashPageOpaque pageopaque; + + xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record); + + /* + * If we have any conflict processing to do, it must happen before we + * update the page. + * + * Hash index records that are marked as LP_DEAD and being removed during + * hash index tuple insertion can conflict with standby queries. You might + * think that vacuum records would conflict as well, but we've handled + * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid + * cleaned by the vacuum of the heap and so we can resolve any conflicts + * just once when that arrives. After that we know that no conflicts + * exist from individual hash index vacuum records on that index. 
+ */ + if (InHotStandby) + { + TransactionId latestRemovedXid = + hash_xlog_vacuum_get_latestRemovedXid(record); + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); + } + + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); + + if (action == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage) + { + OffsetNumber *unused; + + unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage); + + PageIndexMultiDelete(page, unused, xldata->ntuples); + } + + /* + * Mark the page as not containing any LP_DEAD items. See comments in + * _hash_vacuum_one_page() for details. + */ + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) + { + Page metapage; + HashMetaPage metap; + + metapage = BufferGetPage(metabuf); + metap = HashPageGetMeta(metapage); + + metap->hashm_ntuples -= xldata->ntuples; + + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +void +hash_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + hash_xlog_init_meta_page(record); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + hash_xlog_init_bitmap_page(record); + break; + case XLOG_HASH_INSERT: + hash_xlog_insert(record); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + hash_xlog_add_ovfl_page(record); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + hash_xlog_split_allocate_page(record); + break; + case XLOG_HASH_SPLIT_PAGE: + hash_xlog_split_page(record); + break; + case XLOG_HASH_SPLIT_COMPLETE: + hash_xlog_split_complete(record); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + hash_xlog_move_page_contents(record); + break; + case XLOG_HASH_SQUEEZE_PAGE: + hash_xlog_squeeze_page(record); + break; + case XLOG_HASH_DELETE: + hash_xlog_delete(record); + break; + case XLOG_HASH_SPLIT_CLEANUP: + hash_xlog_split_cleanup(record); + break; + case XLOG_HASH_UPDATE_META_PAGE: + hash_xlog_update_meta_page(record); + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + hash_xlog_vacuum_one_page(record); + break; + default: + elog(PANIC, "hash_redo: unknown op code %u", info); + } +} + +/* + * Mask a hash page before performing consistency checks on it. + */ +void +hash_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + HashPageOpaque opaque; + int pagetype; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + pagetype = opaque->hasho_flag & LH_PAGE_TYPE; + if (pagetype == LH_UNUSED_PAGE) + { + /* + * Mask everything on a UNUSED page. + */ + mask_page_content(page); + } + else if (pagetype == LH_BUCKET_PAGE || + pagetype == LH_OVERFLOW_PAGE) + { + /* + * In hash bucket and overflow pages, it is possible to modify the + * LP_FLAGS without emitting any WAL record. Hence, mask the line + * pointer flags. See hashgettuple(), _hash_kill_items() for details. + */ + mask_lp_flags(page); + } + + /* + * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain + * unlogged. So, mask it. See _hash_kill_items() for details. 
+ */ + opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; +} diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index bb9adad82e..4089fd6d8a 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -3,7 +3,7 @@ * hashfunc.c * Support functions for hash access method. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -27,6 +27,7 @@ #include "postgres.h" #include "access/hash.h" +#include "utils/builtins.h" #ifdef PGXC #include "catalog/pg_type.h" @@ -36,6 +37,16 @@ #include "utils/nabstime.h" #endif +/* + * Datatype-specific hash functions. + * + * These support both hash indexes and hash joins. + * + * NOTE: some of these are also used by catcache operations, without + * any direct connection to hash indexes. Also, the common hash_any + * routine is also used by dynahash tables. + */ + /* Note: this is used for both "char" and boolean datatypes */ Datum hashchar(PG_FUNCTION_ARGS) @@ -138,22 +149,11 @@ hashoidvector(PG_FUNCTION_ARGS) } Datum -hashint2vector(PG_FUNCTION_ARGS) -{ - int2vector *key = (int2vector *) PG_GETARG_POINTER(0); - - return hash_any((unsigned char *) key->values, key->dim1 * sizeof(int16)); -} - -Datum hashname(PG_FUNCTION_ARGS) { char *key = NameStr(*PG_GETARG_NAME(0)); - int keylen = strlen(key); - - Assert(keylen < NAMEDATALEN); /* else it's not truncated correctly */ - return hash_any((unsigned char *) key, keylen); + return hash_any((unsigned char *) key, strlen(key)); } Datum @@ -581,8 +581,6 @@ compute_hash(Oid type, Datum value, char locator) return DirectFunctionCall1(hashchar, value); case NAMEOID: return DirectFunctionCall1(hashname, value); - case INT2VECTOROID: - return DirectFunctionCall1(hashint2vector, value); case VARCHAROID: case TEXTOID: @@ -677,8 +675,6 @@ get_compute_hash_function(Oid type, char locator) return "hashchar"; case NAMEOID: return "hashname"; - case INT2VECTOROID: - return "hashint2vector"; case VARCHAROID: case TEXTOID: return "hashtext"; diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index acd2e64763..01c8d8006c 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -3,7 +3,7 @@ * hashinsert.c * Item insertion in hash tables for Postgres. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -16,8 +16,15 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/heapam.h" +#include "miscadmin.h" #include "utils/rel.h" +#include "storage/lwlock.h" +#include "storage/buf_internals.h" +static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, + RelFileNode hnode); /* * _hash_doinsert() -- Handle insertion of a single index tuple. @@ -26,20 +33,21 @@ * and hashinsert. By here, itup is completely filled in. 
*/ void -_hash_doinsert(Relation rel, IndexTuple itup) +_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel) { - Buffer buf; + Buffer buf = InvalidBuffer; + Buffer bucket_buf; Buffer metabuf; HashMetaPage metap; - BlockNumber blkno; - BlockNumber oldblkno = InvalidBlockNumber; - bool retry = false; + HashMetaPage usedmetap = NULL; + Page metapage; Page page; HashPageOpaque pageopaque; Size itemsz; bool do_expand; uint32 hashkey; Bucket bucket; + OffsetNumber itup_off; /* * Get the hash key for the item (it's stored in the index tuple itself). @@ -51,9 +59,15 @@ _hash_doinsert(Relation rel, IndexTuple itup) itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = HashPageGetMeta(BufferGetPage(metabuf)); +restart_insert: + + /* + * Read the metapage. We don't lock it yet; HashMaxItemSize() will + * examine pd_pagesize_version, but that can't change so we can examine it + * without a lock. + */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); + metapage = BufferGetPage(metabuf); /* * Check whether the item can fit on a hash page at all. (Eventually, we @@ -62,74 +76,90 @@ _hash_doinsert(Relation rel, IndexTuple itup) * * XXX this is useless code if we are only storing hash keys. */ - if (itemsz > HashMaxItemSize((Page) metap)) + if (itemsz > HashMaxItemSize(metapage)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds hash maximum %zu", - itemsz, HashMaxItemSize((Page) metap)), + itemsz, HashMaxItemSize(metapage)), errhint("Values larger than a buffer page cannot be indexed."))); + /* Lock the primary bucket page for the target bucket. */ + buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, + &usedmetap); + Assert(usedmetap != NULL); + + /* remember the primary bucket buffer to release the pin on it at end. */ + bucket_buf = buf; + + page = BufferGetPage(buf); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = pageopaque->hasho_bucket; + /* - * Loop until we get a lock on the correct target bucket. + * If this bucket is in the process of being split, try to finish the + * split before inserting, because that might create room for the + * insertion to proceed without allocating an additional overflow page. + * It's only interesting to finish the split if we're trying to insert + * into the bucket from which we're removing tuples (the "old" bucket), + * not if we're trying to insert into the bucket into which tuples are + * being moved (the "new" bucket). */ - for (;;) + if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) { - /* - * Compute the target bucket number, and convert to block number. - */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); - - blkno = BUCKET_TO_BLKNO(metap, bucket); + /* release the lock on bucket buffer, before completing the split. */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + _hash_finish_split(rel, metabuf, buf, bucket, + usedmetap->hashm_maxbucket, + usedmetap->hashm_highmask, + usedmetap->hashm_lowmask); + + /* release the pin on old and meta buffer. retry for insert. */ + _hash_dropbuf(rel, buf); + _hash_dropbuf(rel, metabuf); + goto restart_insert; + } - /* Release metapage lock, but keep pin. 
*/ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + /* Do the insertion */ + while (PageGetFreeSpace(page) < itemsz) + { + BlockNumber nextblkno; /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old lock - * and lock what now appears to be the correct bucket. + * Check if current page has any DEAD tuples. If yes, delete these + * tuples and see if we can get a space for the new item to be + * inserted before moving to the next page in the bucket chain. */ - if (retry) + if (H_HAS_DEAD_TUPLES(pageopaque)) { - if (oldblkno == blkno) - break; - _hash_droplock(rel, oldblkno, HASH_SHARE); - } - _hash_getlock(rel, blkno, HASH_SHARE); - /* - * Reacquire metapage lock and check that no bucket split has taken - * place while we were awaiting the bucket lock. - */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); - oldblkno = blkno; - retry = true; - } + if (IsBufferCleanupOK(buf)) + { + _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node); - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); - page = BufferGetPage(buf); - pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(pageopaque->hasho_bucket == bucket); + if (PageGetFreeSpace(page) >= itemsz) + break; /* OK, now we have enough space */ + } + } - /* Do the insertion */ - while (PageGetFreeSpace(page) < itemsz) - { /* * no space on this page; check for an overflow page */ - BlockNumber nextblkno = pageopaque->hasho_nextblkno; + nextblkno = pageopaque->hasho_nextblkno; if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll - * find out next pass through the loop test above. + * find out next pass through the loop test above. we always + * release both the lock and pin if this is an overflow page, but + * only the lock if this is the primary bucket page, since the pin + * on the primary bucket must be retained throughout the scan. */ - _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_relbuf(rel, buf); + else + LockBuffer(buf, BUFFER_LOCK_UNLOCK); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); page = BufferGetPage(buf); } @@ -141,43 +171,77 @@ _hash_doinsert(Relation rel, IndexTuple itup) */ /* release our write lock without modifying buffer */ - _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* chain to a new overflow page */ - buf = _hash_addovflpage(rel, metabuf, buf); + buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false); page = BufferGetPage(buf); /* should fit now, given test above */ Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } - /* found page with enough space, so add the item here */ - (void) _hash_pgaddtup(rel, buf, itemsz, itup); - - /* write and release the modified page */ - _hash_wrtbuf(rel, buf); - - /* We can drop the bucket lock now */ - _hash_droplock(rel, blkno, HASH_SHARE); - /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + /* Do the update. 
No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* found page with enough space, so add the item here */ + itup_off = _hash_pgaddtup(rel, buf, itemsz, itup); + MarkBufferDirty(buf); + + /* metapage operations */ + metap = HashPageGetMeta(metapage); metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ do_expand = metap->hashm_ntuples > (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); - /* Write out the metapage and drop lock, but keep pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_insert xlrec; + XLogRecPtr recptr; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInsert); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock on metapage, but keep pin */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Release the modified page and ensure to release the pin on primary + * page. + */ + _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_dropbuf(rel, bucket_buf); /* Attempt to split if a split is needed */ if (do_expand) @@ -219,3 +283,143 @@ _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup) return itup_off; } + +/* + * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the + * index. + * + * This routine has same requirements for locking and tuple ordering as + * _hash_pgaddtup(). + * + * Returns the offset number array at which the tuples were inserted. + */ +void +_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, + OffsetNumber *itup_offsets, uint16 nitups) +{ + OffsetNumber itup_off; + Page page; + uint32 hashkey; + int i; + + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + + for (i = 0; i < nitups; i++) + { + Size itemsize; + + itemsize = IndexTupleDSize(*itups[i]); + itemsize = MAXALIGN(itemsize); + + /* Find where to insert the tuple (preserving page's hashkey ordering) */ + hashkey = _hash_get_indextuple_hashkey(itups[i]); + itup_off = _hash_binsearch(page, hashkey); + + itup_offsets[i] = itup_off; + + if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) + == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", + RelationGetRelationName(rel)); + } +} + +/* + * _hash_vacuum_one_page - vacuum just one index page. + * + * Try to remove LP_DEAD items from the given page. We must acquire cleanup + * lock on the page being modified before calling this function. 
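+ *
+ * The expected calling pattern is visible in _hash_doinsert() above:
+ * attempt the cleanup lock opportunistically and vacuum only if it is
+ * granted, e.g.
+ *
+ *     if (IsBufferCleanupOK(buf))
+ *         _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);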
+ */ + +static void +_hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, + RelFileNode hnode) +{ + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum, + maxoff; + Page page = BufferGetPage(buf); + HashPageOpaque pageopaque; + HashMetaPage metap; + + /* Scan each tuple in page to see if it is marked as LP_DEAD */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + /* + * Write-lock the meta page so that we can decrement tuple count. + */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint + * anyway. + */ + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + metap = HashPageGetMeta(BufferGetPage(metabuf)); + metap->hashm_ntuples -= ndeletable; + + MarkBufferDirty(buf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_vacuum_one_page xlrec; + XLogRecPtr recptr; + + xlrec.hnode = hnode; + xlrec.ntuples = ndeletable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage); + + /* + * We need the target-offsets array whether or not we store the + * whole buffer, to allow us to find the latestRemovedXid on a + * standby server. + */ + XLogRegisterData((char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* + * Releasing write lock on meta page as we have updated the tuple + * count. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + } +} diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index db3e268a76..b5133e3945 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -3,7 +3,7 @@ * hashovfl.c * Overflow page management code for the Postgres hash access method * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -18,10 +18,11 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" #include "utils/rel.h" -static Buffer _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32 map); @@ -48,14 +49,16 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) * Convert to absolute page number by adding the number of bucket pages * that exist before this split point. 
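 *
 * Worked example (values assumed purely for illustration): if the bit
 * falls in a split point i for which _hash_get_totalbuckets(i) == 1024,
 * then ovflbitnum 3 yields block number 1024 + 3 = 1027; the old coding
 * would have computed (1 << i) + 3 instead.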
*/ - return (BlockNumber) ((1 << i) + ovflbitnum); + return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum); } /* + * _hash_ovflblkno_to_bitno + * * Convert overflow page block number to bit number for free-page bitmap. */ -static uint32 -blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) +uint32 +_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) { uint32 splitnum = metap->hashm_ovflpoint; uint32 i; @@ -64,14 +67,24 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) /* Determine the split number containing this page */ for (i = 1; i <= splitnum; i++) { - if (ovflblkno <= (BlockNumber) (1 << i)) + if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i)) break; /* oops */ - bitnum = ovflblkno - (1 << i); - if (bitnum <= metap->hashm_spares[i]) + bitnum = ovflblkno - _hash_get_totalbuckets(i); + + /* + * bitnum has to be greater than the number of overflow pages added in + * previous split points. The overflow pages at this splitnum (i), if + * any, start from (_hash_get_totalbuckets(i) + + * metap->hashm_spares[i - 1] + 1). + */ + if (bitnum > metap->hashm_spares[i - 1] && + bitnum <= metap->hashm_spares[i]) return bitnum - 1; /* -1 to convert 1-based to 0-based */ } - elog(ERROR, "invalid overflow block number %u", ovflblkno); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid overflow block number %u", ovflblkno))); return 0; /* keep compiler quiet */ } @@ -82,38 +95,59 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) * * On entry, the caller must hold a pin but no lock on 'buf'. The pin is * dropped before exiting (we assume the caller is not interested in 'buf' - * anymore). The returned overflow page will be pinned and write-locked; - * it is guaranteed to be empty. + * anymore) if not asked to retain. The pin will be retained only for the + * primary bucket. The returned overflow page will be pinned and + * write-locked; it is guaranteed to be empty. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state. * - * The caller must hold at least share lock on the bucket, to ensure that - * no one else tries to compact the bucket meanwhile. This guarantees that - * 'buf' won't stop being part of the bucket while it's unlocked. - * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ Buffer -_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) +_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) { Buffer ovflbuf; Page page; Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; - - /* allocate and lock an empty overflow page */ - ovflbuf = _hash_getovflpage(rel, metabuf); + HashMetaPage metap; + Buffer mapbuf = InvalidBuffer; + Buffer newmapbuf = InvalidBuffer; + BlockNumber blkno; + uint32 orig_firstfree; + uint32 splitnum; + uint32 *freep = NULL; + uint32 max_ovflpg; + uint32 bit; + uint32 bitmap_page_bit; + uint32 first_page; + uint32 last_bit; + uint32 last_page; + uint32 i, + j; + bool page_found = false; /* - * Write-lock the tail page. It is okay to hold two buffer locks here - * since there cannot be anyone else contending for access to ovflbuf. + * Write-lock the tail page. 
Here, we need to maintain a locking order: + * first acquire the lock on the tail page of the bucket, then on the meta + * page so as to find and lock the bitmap page; once the bitmap page is + * found, the lock on the meta page is released, and finally the lock on + * the new overflow buffer is acquired. We need this locking order to + * avoid deadlock with backends that are doing inserts. + * + * Note: We could have avoided locking many buffers here if we made two + * WAL records for acquiring an overflow page (one to allocate an overflow + * page and another to add it to overflow bucket chain). However, doing + * so can leak an overflow page if the system crashes after allocation. + * Needless to say, it is better to have a single record from a + * performance point of view as well. */ - _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* probably redundant... */ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); @@ -131,59 +165,22 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) break; /* we assume we do not need to write the unmodified page */ - _hash_relbuf(rel, buf); + if (retain_pin) + { + /* pin will be retained only for the primary bucket page */ + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + _hash_relbuf(rel, buf); + + retain_pin = false; + buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } - /* now that we have correct backlink, initialize new overflow page */ - ovflpage = BufferGetPage(ovflbuf); - ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); - ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); - ovflopaque->hasho_nextblkno = InvalidBlockNumber; - ovflopaque->hasho_bucket = pageopaque->hasho_bucket; - ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; - ovflopaque->hasho_page_id = HASHO_PAGE_ID; - - MarkBufferDirty(ovflbuf); - - /* logically chain overflow page to previous page */ - pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); - _hash_wrtbuf(rel, buf); - - return ovflbuf; -} - -/* - * _hash_getovflpage() - * - * Find an available overflow page and return it. The returned buffer - * is pinned and write-locked, and has had _hash_pageinit() applied, - * but it is caller's responsibility to fill the special space. - * - * The caller must hold a pin, but no lock, on the metapage buffer. - * That buffer is left in the same state at exit. 
- */ -static Buffer -_hash_getovflpage(Relation rel, Buffer metabuf) -{ - HashMetaPage metap; - Buffer mapbuf = 0; - Buffer newbuf; - BlockNumber blkno; - uint32 orig_firstfree; - uint32 splitnum; - uint32 *freep = NULL; - uint32 max_ovflpg; - uint32 bit; - uint32 first_page; - uint32 last_bit; - uint32 last_page; - uint32 i, - j; - /* Get exclusive lock on the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); @@ -221,7 +218,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf) last_inpage = BMPGSZ_BIT(metap) - 1; /* Release exclusive lock on metapage while reading bitmap page */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); @@ -230,17 +227,37 @@ _hash_getovflpage(Relation rel, Buffer metabuf) for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) { if (freep[j] != ALL_SET) + { + page_found = true; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* convert bit to bit number within page */ + bit += _hash_firstfreebit(freep[j]); + bitmap_page_bit = bit; + + /* convert bit to absolute bit number */ + bit += (i << BMPG_SHIFT(metap)); + /* Calculate address of the recycled overflow page */ + blkno = bitno_to_blkno(metap, bit); + + /* Fetch and init the recycled page */ + ovflbuf = _hash_getinitbuf(rel, blkno); + goto found; + } } /* No free space here, try to advance to next map page */ _hash_relbuf(rel, mapbuf); + mapbuf = InvalidBuffer; i++; j = 0; /* scan from start of next map page */ bit = 0; /* Reacquire exclusive lock on the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); } /* @@ -258,8 +275,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * convenient to pre-mark them as "in use" too. */ bit = metap->hashm_spares[splitnum]; - _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM); - metap->hashm_spares[splitnum]++; + + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); } else { @@ -270,7 +294,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf) } /* Calculate address of the new overflow page */ - bit = metap->hashm_spares[splitnum]; + bit = BufferIsValid(newmapbuf) ? + metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; blkno = bitno_to_blkno(metap, bit); /* @@ -278,39 +303,52 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * relation length stays in sync with ours. XXX It's annoying to do this * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. + * + * It is okay to hold two buffer locks here (one on tail page of bucket + * and other on new overflow page) since there cannot be anyone else + * contending for access to ovflbuf. */ - newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); + ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); - metap->hashm_spares[splitnum]++; +found: /* - * Adjust hashm_firstfree to avoid redundant searches. 
But don't risk - * changing it if someone moved it while we were searching bitmap pages. + * Do the update. No ereport(ERROR) until changes are logged. We want to + * log the changes for bitmap page and overflow page together to avoid + * loss of pages in case the new page is added. */ - if (metap->hashm_firstfree == orig_firstfree) - metap->hashm_firstfree = bit + 1; - - /* Write updated metapage and release lock, but not pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + START_CRIT_SECTION(); - return newbuf; - -found: - /* convert bit to bit number within page */ - bit += _hash_firstfreebit(freep[j]); - - /* mark page "in use" in the bitmap */ - SETBIT(freep, bit); - _hash_wrtbuf(rel, mapbuf); + if (page_found) + { + Assert(BufferIsValid(mapbuf)); - /* Reacquire exclusive lock on the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + /* mark page "in use" in the bitmap */ + SETBIT(freep, bitmap_page_bit); + MarkBufferDirty(mapbuf); + } + else + { + /* update the count to indicate new overflow page is added */ + metap->hashm_spares[splitnum]++; - /* convert bit to absolute bit number */ - bit += (i << BMPG_SHIFT(metap)); + if (BufferIsValid(newmapbuf)) + { + _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(newmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); + metap->hashm_nmaps++; + metap->hashm_spares[splitnum]++; + MarkBufferDirty(metabuf); + } - /* Calculate address of the recycled overflow page */ - blkno = bitno_to_blkno(metap, bit); + /* + * for new overflow page, we don't need to explicitly set the bit in + * bitmap page, as by default that will be set to "in use". + */ + } /* * Adjust hashm_firstfree to avoid redundant searches. 
But don't risk @@ -319,18 +357,84 @@ found: if (metap->hashm_firstfree == orig_firstfree) { metap->hashm_firstfree = bit + 1; - - /* Write updated metapage and release lock, but not pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + MarkBufferDirty(metabuf); } - else + + /* initialize new overflow page */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = pageopaque->hasho_bucket; + ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + /* logically chain overflow page to previous page */ + pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { - /* We didn't change the metapage, so no need to write */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + XLogRecPtr recptr; + xl_hash_add_ovfl_page xlrec; + + xlrec.bmpage_found = page_found; + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); + + XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); + XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + + if (BufferIsValid(mapbuf)) + { + XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); + } + + if (BufferIsValid(newmapbuf)) + XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); + + XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); + + PageSetLSN(BufferGetPage(ovflbuf), recptr); + PageSetLSN(BufferGetPage(buf), recptr); + + if (BufferIsValid(mapbuf)) + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (BufferIsValid(newmapbuf)) + PageSetLSN(BufferGetPage(newmapbuf), recptr); + + PageSetLSN(BufferGetPage(metabuf), recptr); } - /* Fetch, init, and return the recycled page */ - return _hash_getinitbuf(rel, blkno); + END_CRIT_SECTION(); + + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + if (BufferIsValid(mapbuf)) + _hash_relbuf(rel, mapbuf); + + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + if (BufferIsValid(newmapbuf)) + _hash_relbuf(rel, newmapbuf); + + return ovflbuf; } /* @@ -363,18 +467,27 @@ _hash_firstfreebit(uint32 map) * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * + * Add the tuples (itups) to wbuf in this function. We could do that in the + * caller as well, but the advantage of doing it here is we can easily write + * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and + * removal of the overflow page have to be done as an atomic operation; + * otherwise, during replay on a standby, users might find duplicate records. + * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * - * NB: caller must not hold lock on metapage, nor on either page that's - * adjacent in the bucket chain. 
The caller had better hold exclusive lock - * on the bucket, too. + * NB: caller must not hold lock on metapage, nor on the page that's next + * to ovflbuf in the bucket chain. We don't acquire the lock on the page + * prior to ovflbuf in the chain if it is the same as wbuf, because the + * caller already holds a lock on it. */ BlockNumber -_hash_freeovflpage(Relation rel, Buffer ovflbuf, +_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy) { HashMetaPage metap; @@ -384,6 +497,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; + BlockNumber writeblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; @@ -392,6 +506,9 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; + Buffer prevbuf = InvalidBuffer; + Buffer nextbuf = InvalidBuffer; + bool update_metap = false; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); @@ -400,50 +517,32 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; + writeblkno = BufferGetBlockNumber(wbuf); bucket = ovflopaque->hasho_bucket; /* - * Zero the page for debugging's sake; then write and release it. (Note: - * if we failed to zero the page here, we'd have problems with the Assert - * in _hash_pageinit() when the page is reused.) - */ - MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); - _hash_wrtbuf(rel, ovflbuf); - - /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being - * deleted. No concurrency issues since we hold exclusive lock on the - * entire bucket. + * deleted. Concurrency issues are avoided by using lock chaining as + * described atop hashbucketcleanup. 
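+ *
+ * As an illustration (chain shape assumed): in a bucket chain
+ * B -> O1 -> O2 -> O3, freeing O2 updates O1->hasho_nextblkno and
+ * O3->hasho_prevblkno; O1 is either wbuf itself (already locked by the
+ * caller) or is locked below, and O3 is locked below, so only pages
+ * adjacent to the doomed page are ever touched.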
*/ if (BlockNumberIsValid(prevblkno)) { - Buffer prevbuf = _hash_getbuf_with_strategy(rel, - prevblkno, - HASH_WRITE, + if (prevblkno == writeblkno) + prevbuf = wbuf; + else + prevbuf = _hash_getbuf_with_strategy(rel, + prevblkno, + HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, - bstrategy); - Page prevpage = BufferGetPage(prevbuf); - HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); - - Assert(prevopaque->hasho_bucket == bucket); - prevopaque->hasho_nextblkno = nextblkno; - _hash_wrtbuf(rel, prevbuf); + bstrategy); } if (BlockNumberIsValid(nextblkno)) - { - Buffer nextbuf = _hash_getbuf_with_strategy(rel, - nextblkno, - HASH_WRITE, - LH_OVERFLOW_PAGE, - bstrategy); - Page nextpage = BufferGetPage(nextbuf); - HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); - - Assert(nextopaque->hasho_bucket == bucket); - nextopaque->hasho_prevblkno = prevblkno; - _hash_wrtbuf(rel, nextbuf); - } + nextbuf = _hash_getbuf_with_strategy(rel, + nextblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); /* Note: bstrategy is intentionally not used for metapage and bitmap */ @@ -452,7 +551,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ - ovflbitno = blkno_to_bitno(metap, ovflblkno); + ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); @@ -462,67 +561,193 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - /* Clear the bitmap bit to indicate that this overflow page is free */ + /* read the bitmap page to clear the bitmap bit */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); - CLRBIT(freep, bitmapbit); - _hash_wrtbuf(rel, mapbuf); /* Get write-lock on metapage to update firstfree */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* This operation needs to log multiple tuples, prepare WAL for that */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being careful to preserve + * hashkey ordering. (If we insert many tuples into the same "write" page + * it would be worth qsort'ing them). + */ + if (nitups > 0) + { + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + } + + /* + * Reinitialize the freed overflow page. Just zeroing the page won't + * work, because WAL replay routines expect pages to be initialized. See + * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are + * careful to make the special space valid here so that tools like + * pageinspect won't get confused. 
+ */ + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + if (BufferIsValid(prevbuf)) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(nextbuf)) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + Assert(nextopaque->hasho_bucket == bucket); + nextopaque->hasho_prevblkno = prevblkno; + MarkBufferDirty(nextbuf); + } + + /* Clear the bitmap bit to indicate that this overflow page is free */ + CLRBIT(freep, bitmapbit); + MarkBufferDirty(mapbuf); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; - _hash_wrtbuf(rel, metabuf); + update_metap = true; + MarkBufferDirty(metabuf); } - else + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { - /* no need to change metapage */ - _hash_relbuf(rel, metabuf); + xl_hash_squeeze_page xlrec; + XLogRecPtr recptr; + int i; + + xlrec.prevblkno = prevblkno; + xlrec.nextblkno = nextblkno; + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); + xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); + + /* + * bucket buffer needs to be registered to ensure that we can acquire + * a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + if (xlrec.ntups > 0) + { + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + } + + XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); + + /* + * If prevpage and the writepage (block in which we are moving tuples + * from overflow) are same, then no need to separately register + * prevpage. During replay, we can directly update the nextblock in + * writepage. 
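+ *
+ * Example (hypothetical chain): while squeezing B -> O1 -> O2, tuples
+ * move from O2 (the read page) into O1 (the write page) and O2 is
+ * freed; prevbuf then equals wbuf, so registering wbuf once as buffer 1
+ * also covers the hasho_nextblkno update made during replay.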
+ */ + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); + + if (BufferIsValid(nextbuf)) + XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); + + if (update_metap) + { + XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); + } + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(ovflbuf), recptr); + + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + PageSetLSN(BufferGetPage(prevbuf), recptr); + if (BufferIsValid(nextbuf)) + PageSetLSN(BufferGetPage(nextbuf), recptr); + + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (update_metap) + PageSetLSN(BufferGetPage(metabuf), recptr); } + END_CRIT_SECTION(); + + /* release previous bucket if it is not same as write bucket */ + if (BufferIsValid(prevbuf) && prevblkno != writeblkno) + _hash_relbuf(rel, prevbuf); + + if (BufferIsValid(ovflbuf)) + _hash_relbuf(rel, ovflbuf); + + if (BufferIsValid(nextbuf)) + _hash_relbuf(rel, nextbuf); + + _hash_relbuf(rel, mapbuf); + _hash_relbuf(rel, metabuf); + return nextblkno; } /* - * _hash_initbitmap() - * - * Initialize a new bitmap page. The metapage has a write-lock upon - * entering the function, and must be written by caller after return. + * _hash_initbitmapbuffer() * - * 'blkno' is the block number of the new bitmap page. - * - * All bits in the new bitmap page are set to "1", indicating "in use". + * Initialize a new bitmap page. All bits in the new bitmap page are set to + * "1", indicating "in use". */ void -_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, - ForkNumber forkNum) +_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) { - Buffer buf; Page pg; HashPageOpaque op; uint32 *freep; - /* - * It is okay to write-lock the new bitmap page while holding metapage - * write lock, because no one else could be contending for the new page. - * Also, the metapage lock makes it safe to extend the index using - * _hash_getnewbuf. - * - * There is some loss of concurrency in possibly doing I/O for the new - * page while holding the metapage lock, but this path is taken so seldom - * that it's not worth worrying about. - */ - buf = _hash_getnewbuf(rel, blkno, forkNum); pg = BufferGetPage(buf); + /* initialize the page */ + if (initpage) + _hash_pageinit(pg, BufferGetPageSize(buf)); + /* initialize the page's special space */ op = (HashPageOpaque) PageGetSpecialPointer(pg); op->hasho_prevblkno = InvalidBlockNumber; @@ -533,22 +758,14 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, /* set all of the bits to 1 */ freep = HashPageGetBitmap(pg); - MemSet(freep, 0xFF, BMPGSZ_BYTE(metap)); - - /* write out the new bitmap page (releasing write lock and pin) */ - _hash_wrtbuf(rel, buf); + MemSet(freep, 0xFF, bmsize); - /* add the new bitmap page to the metapage's list of bitmaps */ - /* metapage already has a write lock */ - if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", - RelationGetRelationName(rel)))); - - metap->hashm_mapp[metap->hashm_nmaps] = blkno; - - metap->hashm_nmaps++; + /* + * Set pd_lower just past the end of the bitmap page data. 
We could even + * set pd_lower equal to pd_upper, but this is more precise and makes the + * page look compressible to xlog.c. + */ + ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; } @@ -570,8 +787,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * - * Caller must hold exclusive lock on the target bucket. This allows - * us to safely lock multiple pages in the bucket. + * Caller must acquire cleanup lock on the primary page of the target + * bucket to exclude any scans that are in progress, which could easily + * be confused into returning the same tuple more than once or some tuples + * not at all by the rearrangement we are performing here. To prevent any + * concurrent scan from crossing the squeeze scan, we use lock chaining + * similar to hashbucketcleanup. Refer to the comments atop + * hashbucketcleanup. + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. @@ -580,6 +804,7 @@ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, + Buffer bucket_buf, BufferAccessStrategy bstrategy) { BlockNumber wblkno; @@ -590,26 +815,22 @@ _hash_squeezebucket(Relation rel, Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; - bool wbuf_dirty; /* - * start squeezing into the base bucket page. + * start squeezing into the primary bucket page. */ wblkno = bucket_blkno; - wbuf = _hash_getbuf_with_strategy(rel, - wblkno, - HASH_WRITE, - LH_BUCKET_PAGE, - bstrategy); + wbuf = bucket_buf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* - * if there aren't any overflow pages, there's nothing to squeeze. + * if there aren't any overflow pages, there's nothing to squeeze. caller + * is responsible for releasing the pin on primary bucket page. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { - _hash_relbuf(rel, wbuf); + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); return; } @@ -639,14 +860,21 @@ _hash_squeezebucket(Relation rel, /* * squeeze the tuples. */ - wbuf_dirty = false; for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; - + IndexTuple itups[MaxIndexTuplesPerPage]; + Size tups_size[MaxIndexTuplesPerPage]; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + uint16 ndeletable = 0; + uint16 nitups = 0; + Size all_tups_size = 0; + int i; + bool retain_pin = false; + +readpage: /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; @@ -656,6 +884,10 @@ _hash_squeezebucket(Relation rel, IndexTuple itup; Size itemsz; + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) + continue; + itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); @@ -663,56 +895,144 @@ _hash_squeezebucket(Relation rel, /* * Walk up the bucket chain, looking for a page big enough for - * this item. Exit if we reach the read page. + * this item and all other accumulated items. Exit if we reach + * the read page. 
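+ *
+ * For instance (sizes invented for illustration): with 600 bytes of
+ * accumulated tuples and a 100-byte itup, the write page must have room
+ * for 700 bytes plus nitups + 1 line pointers, which is exactly what
+ * PageGetFreeSpaceForMultipleTuples() below checks; otherwise we move
+ * the accumulated tuples and advance wbuf along the chain.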
*/ - while (PageGetFreeSpace(wpage) < itemsz) + while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) { + Buffer next_wbuf = InvalidBuffer; + bool tups_moved = false; + Assert(!PageIsEmpty(wpage)); + if (wblkno == bucket_blkno) + retain_pin = true; + wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); - if (wbuf_dirty) - _hash_wrtbuf(rel, wbuf); + /* don't need to move to next page if we reached the read page */ + if (wblkno != rblkno) + next_wbuf = _hash_getbuf_with_strategy(rel, + wblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + if (nitups > 0) + { + Assert(nitups == ndeletable); + + /* + * This operation needs to log multiple tuples, prepare + * WAL for that. + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(0, 3 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being + * careful to preserve hashkey ordering. (If we insert + * many tuples into the same "write" page it would be + * worth qsort'ing them). + */ + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + + /* Delete tuples we already moved off read page */ + PageIndexMultiDelete(rpage, deletable, ndeletable); + MarkBufferDirty(rbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_move_page_contents xlrec; + + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); + + /* + * bucket buffer needs to be registered to ensure that + * we can acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(rbuf), recptr); + } + + END_CRIT_SECTION(); + + tups_moved = true; + } + + /* + * release the lock on previous page after acquiring the lock + * on next page + */ + if (retain_pin) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { - if (ndeletable > 0) - { - /* Delete tuples we already moved off read page */ - PageIndexMultiDelete(rpage, deletable, ndeletable); - _hash_wrtbuf(rel, rbuf); - } - else - _hash_relbuf(rel, rbuf); + _hash_relbuf(rel, rbuf); return; } - wbuf = _hash_getbuf_with_strategy(rel, - wblkno, - HASH_WRITE, - LH_OVERFLOW_PAGE, - bstrategy); + wbuf = next_wbuf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); - wbuf_dirty = false; + retain_pin = false; + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + ndeletable = 0; + + /* + * after moving the tuples, rpage would have been compacted, + * so we need to rescan it. + */ + if (tups_moved) + goto readpage; } - /* - * we have found room so insert on the "write" page, being careful - * to preserve hashkey ordering. 
(If we insert many tuples into - * the same "write" page it would be worth qsort'ing instead of - * doing repeated _hash_pgaddtup.) - */ - (void) _hash_pgaddtup(rel, wbuf, itemsz, itup); - wbuf_dirty = true; - /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; + + /* + * we need a copy of index tuples as they can be freed as part of + * overflow page, however we need them to write a WAL record in + * _hash_freeovflpage. + */ + itups[nitups] = CopyIndexTuple(itup); + tups_size[nitups++] = itemsz; + all_tups_size += itemsz; } /* @@ -724,29 +1044,30 @@ _hash_squeezebucket(Relation rel, * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the - * removed page. However, in that case we are done anyway, so we can - * simply drop the write lock before calling _hash_freeovflpage. + * removed page. In that case, we don't need to lock it again. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); + /* free this overflow page (releases rbuf) */ + _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, + tups_size, nitups, bstrategy); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { - /* yes, so release wbuf lock first */ - if (wbuf_dirty) - _hash_wrtbuf(rel, wbuf); + /* retain the pin on primary bucket page till end of bucket scan */ + if (wblkno == bucket_blkno) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); - /* free this overflow page (releases rbuf) */ - _hash_freeovflpage(rel, rbuf, bstrategy); - /* done */ return; } - /* free this overflow page, then get the previous one */ - _hash_freeovflpage(rel, rbuf, bstrategy); - rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 178463fcb6..4544889294 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -3,7 +3,7 @@ * hashpage.c * Hash table page management code for the Postgres hash access method * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -29,6 +29,7 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/smgr.h" @@ -38,10 +39,12 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks); static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, - BlockNumber start_oblkno, + Buffer obuf, Buffer nbuf, + HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask); +static void log_split_page(Relation rel, Buffer buf); /* @@ -55,46 +58,6 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, /* - * _hash_getlock() -- Acquire an lmgr lock. - * - * 'whichlock' should the block number of a bucket's primary bucket page to - * acquire the per-bucket lock. (See README for details of the use of these - * locks.) - * - * 'access' must be HASH_SHARE or HASH_EXCLUSIVE. 
- */ -void -_hash_getlock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - LockPage(rel, whichlock, access); -} - -/* - * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free. - * - * Same as above except we return FALSE without blocking if lock isn't free. - */ -bool -_hash_try_getlock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - return ConditionalLockPage(rel, whichlock, access); - else - return true; -} - -/* - * _hash_droplock() -- Release an lmgr lock. - */ -void -_hash_droplock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - UnlockPage(rel, whichlock, access); -} - -/* * _hash_getbuf() -- Get a buffer by block number for read or write. * * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK. @@ -132,6 +95,35 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags) } /* + * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup. + * + * We read the page and try to acquire a cleanup lock. If we get it, + * we return the buffer; otherwise, we return InvalidBuffer. + */ +Buffer +_hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBuffer(rel, blkno); + + if (!ConditionalLockBufferForCleanup(buf)) + { + ReleaseBuffer(buf); + return InvalidBuffer; + } + + /* ref count and lock type are correct */ + + _hash_checkpage(rel, buf, flags); + + return buf; +} + +/* * _hash_getinitbuf() -- Get and initialize a buffer by block number. * * This must be used only to fetch pages that are known to be before @@ -167,6 +159,36 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno) } /* + * _hash_initbuf() -- Get and initialize a buffer by bucket number. + */ +void +_hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, + bool initpage) +{ + HashPageOpaque pageopaque; + Page page; + + page = BufferGetPage(buf); + + /* initialize the page */ + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* + * Set hasho_prevblkno with current hashm_maxbucket. This value will be + * used to validate cached HashMetaPageData. See + * _hash_getbucketbuf_from_hashkey(). + */ + pageopaque->hasho_prevblkno = max_bucket; + pageopaque->hasho_nextblkno = InvalidBlockNumber; + pageopaque->hasho_bucket = num_bucket; + pageopaque->hasho_flag = flag; + pageopaque->hasho_page_id = HASHO_PAGE_ID; +} + +/* * _hash_getnewbuf() -- Get a new page at the end of the index. * * This has the same API as _hash_getinitbuf, except that we are adding @@ -266,53 +288,39 @@ _hash_dropbuf(Relation rel, Buffer buf) } /* - * _hash_wrtbuf() -- write a hash page to disk. + * _hash_dropscanbuf() -- release buffers used in scan. * - * This routine releases the lock held on the buffer and our refcount - * for it. It is an error to call _hash_wrtbuf() without a write lock - * and a pin on the buffer. - * - * NOTE: this routine should go away when/if hash indexes are WAL-ified. - * The correct sequence of operations is to mark the buffer dirty, then - * write the WAL record, then release the lock and pin; so marking dirty - * can't be combined with releasing. + * This routine unpins the buffers used during scan on which we + * hold no lock. 
*/ void -_hash_wrtbuf(Relation rel, Buffer buf) +_hash_dropscanbuf(Relation rel, HashScanOpaque so) { - MarkBufferDirty(buf); - UnlockReleaseBuffer(buf); -} - -/* - * _hash_chgbufaccess() -- Change the lock type on a buffer, without - * dropping our pin on it. - * - * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK, - * the last indicating that no buffer-level lock is held or wanted. - * - * When from_access == HASH_WRITE, we assume the buffer is dirty and tell - * bufmgr it must be written out. If the caller wants to release a write - * lock on a page that's not been modified, it's okay to pass from_access - * as HASH_READ (a bit ugly, but handy in some places). - */ -void -_hash_chgbufaccess(Relation rel, - Buffer buf, - int from_access, - int to_access) -{ - if (from_access == HASH_WRITE) - MarkBufferDirty(buf); - if (from_access != HASH_NOLOCK) - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - if (to_access != HASH_NOLOCK) - LockBuffer(buf, to_access); + /* release pin we hold on primary bucket page */ + if (BufferIsValid(so->hashso_bucket_buf) && + so->hashso_bucket_buf != so->hashso_curbuf) + _hash_dropbuf(rel, so->hashso_bucket_buf); + so->hashso_bucket_buf = InvalidBuffer; + + /* release pin we hold on primary bucket page of bucket being split */ + if (BufferIsValid(so->hashso_split_bucket_buf) && + so->hashso_split_bucket_buf != so->hashso_curbuf) + _hash_dropbuf(rel, so->hashso_split_bucket_buf); + so->hashso_split_bucket_buf = InvalidBuffer; + + /* release any pin we still hold */ + if (BufferIsValid(so->hashso_curbuf)) + _hash_dropbuf(rel, so->hashso_curbuf); + so->hashso_curbuf = InvalidBuffer; + + /* reset split scan */ + so->hashso_buc_populated = false; + so->hashso_buc_split = false; } /* - * _hash_metapinit() -- Initialize the metadata page of a hash index, + * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate @@ -324,19 +332,18 @@ _hash_chgbufaccess(Relation rel, * multiple buffer locks is ignored. */ uint32 -_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) +_hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { - HashMetaPage metap; - HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; + Buffer bitmapbuf; Page pg; + HashMetaPage metap; + RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; - double dnumbuckets; uint32 num_buckets; - uint32 log2_num_buckets; uint32 i; /* safety check */ @@ -358,10 +365,151 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) if (ffactor < 10) ffactor = 10; + procid = index_getprocid(rel, 1, HASHPROC); + + /* + * We initialize the metapage, the first N bucket pages, and the first + * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() + * calls to occur. This ensures that the smgr level has the right idea of + * the physical index length. + * + * Critical section not required, because on error the creation of the + * whole relation will be rolled back. 
+ */ + metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); + _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); + MarkBufferDirty(metabuf); + + pg = BufferGetPage(metabuf); + metap = HashPageGetMeta(pg); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_init_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.num_tuples = num_tuples; + xlrec.procid = metap->hashm_procid; + xlrec.ffactor = metap->hashm_ffactor; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + num_buckets = metap->hashm_maxbucket + 1; + + /* + * Release buffer lock on the metapage while we initialize buckets. + * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS + * won't accomplish anything. It's a bad idea to hold buffer locks for + * long intervals in any case, since that can block the bgwriter. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Initialize and WAL Log the first N buckets + */ + for (i = 0; i < num_buckets; i++) + { + BlockNumber blkno; + + /* Allow interrupts, in case N is huge */ + CHECK_FOR_INTERRUPTS(); + + blkno = BUCKET_TO_BLKNO(metap, i); + buf = _hash_getnewbuf(rel, blkno, forkNum); + _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); + MarkBufferDirty(buf); + + log_newpage(&rel->rd_node, + forkNum, + blkno, + BufferGetPage(buf), + true); + _hash_relbuf(rel, buf); + } + + /* Now reacquire buffer lock on metapage */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Initialize bitmap page + */ + bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); + _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + + metap->hashm_nmaps++; + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_init_bitmap_page xlrec; + XLogRecPtr recptr; + + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); + XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); + + /* + * This is safe only because nobody else can be modifying the index at + * this stage; it's only visible to the transaction that is creating + * it. + */ + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); + + PageSetLSN(BufferGetPage(bitmapbuf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + /* all done */ + _hash_relbuf(rel, bitmapbuf); + _hash_relbuf(rel, metabuf); + + return num_buckets; +} + +/* + * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. + */ +void +_hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, + uint16 ffactor, bool initpage) +{ + HashMetaPage metap; + HashPageOpaque pageopaque; + Page page; + double dnumbuckets; + uint32 num_buckets; + uint32 spare_index; + uint32 i; + /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. 
We round up the result to the - * next power of 2, however, and always force at least 2 bucket pages. The + * total number of buckets that have to be allocated before using its + * _hashm_spare element. However, we always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ @@ -371,36 +519,30 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else - num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); + num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets)); - log2_num_buckets = _hash_log2(num_buckets); - Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); - Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); + spare_index = _hash_spareindex(num_buckets); + Assert(spare_index < HASH_MAX_SPLITPOINTS); - /* - * We initialize the metapage, the first N bucket pages, and the first - * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() - * calls to occur. This ensures that the smgr level has the right idea of - * the physical index length. - */ - metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); - pg = BufferGetPage(metabuf); + page = BufferGetPage(buf); + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); - pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; - metap = HashPageGetMeta(pg); + metap = HashPageGetMeta(page); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; - metap->hashm_bsize = HashGetMaxBitmapSize(pg); + metap->hashm_bsize = HashGetMaxBitmapSize(page); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { @@ -417,63 +559,35 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ - metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); + metap->hashm_procid = procid; /* * We initialize the index with N buckets, 0 .. N-1, occupying physical - * blocks 1 to N. The first freespace bitmap page is in block N+1. Since - * N is a power of 2, we can set the masks this way: + * blocks 1 to N. The first freespace bitmap page is in block N+1. */ - metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; - metap->hashm_highmask = (num_buckets << 1) - 1; + metap->hashm_maxbucket = num_buckets - 1; + + /* + * Set highmask to the next immediate ((2 ^ x) - 1) value, which should be + * sufficient to cover num_buckets.
+ */ + metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1; + metap->hashm_lowmask = (metap->hashm_highmask >> 1); MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ - metap->hashm_spares[log2_num_buckets] = 1; - metap->hashm_ovflpoint = log2_num_buckets; + metap->hashm_spares[spare_index] = 1; + metap->hashm_ovflpoint = spare_index; metap->hashm_firstfree = 0; /* - * Release buffer lock on the metapage while we initialize buckets. - * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS - * won't accomplish anything. It's a bad idea to hold buffer locks for - * long intervals in any case, since that can block the bgwriter. - */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); - - /* - * Initialize the first N buckets - */ - for (i = 0; i < num_buckets; i++) - { - /* Allow interrupts, in case N is huge */ - CHECK_FOR_INTERRUPTS(); - - buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); - pg = BufferGetPage(buf); - pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); - pageopaque->hasho_prevblkno = InvalidBlockNumber; - pageopaque->hasho_nextblkno = InvalidBlockNumber; - pageopaque->hasho_bucket = i; - pageopaque->hasho_flag = LH_BUCKET_PAGE; - pageopaque->hasho_page_id = HASHO_PAGE_ID; - _hash_wrtbuf(rel, buf); - } - - /* Now reacquire buffer lock on metapage */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); - - /* - * Initialize first bitmap page + * Set pd_lower just past the end of the metadata. This is to log full + * page image of metapage in xloginsert.c. */ - _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); - - /* all done */ - _hash_wrtbuf(rel, metabuf); - - return num_buckets; + ((PageHeader) page)->pd_lower = + ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; } /* @@ -482,16 +596,17 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) void _hash_pageinit(Page page, Size size) { - Assert(PageIsNew(page)); PageInit(page, size, sizeof(HashPageOpaqueData)); } /* * Attempt to expand the hash table by creating one new bucket. * - * This will silently do nothing if it cannot get the needed locks. + * This will silently do nothing if we don't get cleanup lock on old or + * new bucket. * - * The caller should hold no locks on the hash index. + * Complete the pending splits and remove the tuples from old bucket, + * if there are any left over from the previous split. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. @@ -506,15 +621,24 @@ _hash_expandtable(Relation rel, Buffer metabuf) BlockNumber start_oblkno; BlockNumber start_nblkno; Buffer buf_nblkno; + Buffer buf_oblkno; + Page opage; + Page npage; + HashPageOpaque oopaque; + HashPageOpaque nopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; + bool metap_update_masks = false; + bool metap_update_splitpoint = false; + +restart_expand: /* * Write-lock the meta page. It used to be necessary to acquire a * heavyweight lock to begin a split, but that is no longer required. */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); @@ -542,17 +666,22 @@ _hash_expandtable(Relation rel, Buffer metabuf) * than a disk block then this would be an independent constraint. 
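Because num_buckets is no longer necessarily a power of 2, the masks are taken from the next power of 2 above it. A worked example with a hypothetical bucket count, following the formulas above:

	uint32		num_buckets = 1536;			/* not a power of 2 */
	uint32		maxbucket = num_buckets - 1;	/* 1535 */

	/* _hash_log2(1537) == 11, since 2^11 = 2048 is the first power >= 1537 */
	uint32		highmask = (1 << _hash_log2(num_buckets + 1)) - 1;	/* 2047 */
	uint32		lowmask = highmask >> 1;	/* 1023 */

	/*
	 * A hash value masked with highmask lands in 0..2047; results above
	 * maxbucket (1535) are folded back with lowmask into 0..1023, i.e.
	 * into the not-yet-split half of the table.
	 */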
* * If you change this, see also the maximum initial number of buckets in - * _hash_metapinit(). + * _hash_init(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* - * Determine which bucket is to be split, and attempt to lock the old - * bucket. If we can't get the lock, give up. + * Determine which bucket is to be split, and attempt to take a cleanup + * lock on the old bucket. If we can't get the lock, give up. + * + * The cleanup lock protects us not only against other backends, but + * against our own backend as well. + * - * The lock protects us against other backends, but not against our own - * backend. Must check for active scans separately. + * The cleanup lock is mainly to protect the split from concurrent + * inserts. See src/backend/access/hash/README, Lock Definitions for + * further details. Due to this locking restriction, if there is any + * pending scan, the split will give up, which is not good but is harmless. */ new_bucket = metap->hashm_maxbucket + 1; @@ -560,14 +689,86 @@ _hash_expandtable(Relation rel, Buffer metabuf) start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); - if (_hash_has_active_scan(rel, old_bucket)) + buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); + if (!buf_oblkno) goto fail; - if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) - goto fail; + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* + * We want to finish any incomplete split from the old bucket before + * starting a new one: there is no apparent benefit in deferring it, and + * finishing a split that involves multiple buckets (in case the new + * split also fails) would complicate the code. We need not consider the + * new bucket for completing the split here, as a re-split of the new + * bucket cannot start while a split from the old bucket is still pending. + */ + if (H_BUCKET_BEING_SPLIT(oopaque)) + { + /* + * Copy bucket mapping info now; refer to the comment in the code + * below where we copy this information before calling + * _hash_splitbucket to see why this is okay. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* + * Release the locks on the metapage and old_bucket before completing + * the split. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); + + _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, + highmask, lowmask); + + /* release the pin on the old buffer and retry the expansion. */ + _hash_dropbuf(rel, buf_oblkno); + + goto restart_expand; + } /* - * Likewise lock the new bucket (should never fail). + * Clean up the tuples remaining from the previous split. This operation + * requires a cleanup lock, and we already have one on the old bucket, so + * let's do it. We also don't want to allow further splits from the bucket + * till the garbage of the previous split is cleaned up. This has two + * advantages: first, it avoids bloat due to garbage; and second, during + * cleanup of the bucket, we can always be sure that the garbage tuples + * belong to the most recently split bucket. By contrast, if we allowed + * cleanup of the bucket after the metapage had been updated to show the + * new split but before the actual split, the cleanup operation could not + * decide whether a tuple had been moved to the newly created bucket, and + * might end up deleting such tuples.
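_hash_getbuf_with_condlock_cleanup() is a new helper whose definition falls outside this section; presumably it is a conditional-locking variant of _hash_getbuf(), along these lines:

	/*
	 * Sketch only: read the buffer, then try for a cleanup lock without
	 * blocking; on failure return InvalidBuffer so the caller can simply
	 * skip the split attempt.
	 */
	static Buffer
	_hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
	{
		Buffer		buf;

		buf = ReadBuffer(rel, blkno);

		if (!ConditionalLockBufferForCleanup(buf))
		{
			ReleaseBuffer(buf);
			return InvalidBuffer;
		}

		/* ref count and lock type are correct */
		_hash_checkpage(rel, buf, flags);

		return buf;
	}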
+ */ + if (H_NEEDS_SPLIT_CLEANUP(oopaque)) + { + /* + * Copy bucket mapping info now; refer to the comment in code below + * where we copy this information before calling _hash_splitbucket to + * see why this is okay. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* Release the metapage lock. */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, + maxbucket, highmask, lowmask, NULL, NULL, true, + NULL, NULL); + + _hash_dropbuf(rel, buf_oblkno); + + goto restart_expand; + } + + /* + * There shouldn't be any active scan on new bucket. * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because @@ -576,32 +777,28 @@ _hash_expandtable(Relation rel, Buffer metabuf) */ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); - if (_hash_has_active_scan(rel, new_bucket)) - elog(ERROR, "scan in progress on supposedly new bucket"); - - if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) - elog(ERROR, "could not get lock on supposedly new bucket"); - /* - * If the split point is increasing (hashm_maxbucket's log base 2 - * increases), we need to allocate a new batch of bucket pages. + * If the split point is increasing we need to allocate a new batch of + * bucket pages. */ - spare_ndx = _hash_log2(new_bucket + 1); + spare_ndx = _hash_spareindex(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { + uint32 buckets_to_add; + Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* - * The number of buckets in the new splitpoint is equal to the total - * number already in existence, i.e. new_bucket. Currently this maps - * one-to-one to blocks required, but someday we may need a more - * complicated calculation here. + * We treat allocation of buckets as a separate WAL-logged action. + * Even if we fail after this operation, won't leak bucket pages; + * rather, the next split will consume this space. In any case, even + * without failure we don't use all the space in one split operation. */ - if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) + buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; + if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) { /* can't split due to BlockNumber overflow */ - _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); - _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + _hash_relbuf(rel, buf_oblkno); goto fail; } } @@ -609,21 +806,29 @@ _hash_expandtable(Relation rel, Buffer metabuf) /* * Physically allocate the new bucket's primary page. We want to do this * before changing the metapage's mapping info, in case we can't get the - * disk space. + * disk space. Ideally, we don't need to check for cleanup lock on new + * bucket as no other backend could find this bucket unless meta page is + * updated. However, it is good to be consistent with old bucket locking. */ buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); + if (!IsBufferCleanupOK(buf_nblkno)) + { + _hash_relbuf(rel, buf_oblkno); + _hash_relbuf(rel, buf_nblkno); + goto fail; + } /* - * Okay to proceed with split. Update the metapage bucket mapping info. - * - * Since we are scribbling on the metapage data right in the shared - * buffer, any failure in this next little bit leaves us with a big + * Since we are scribbling on the pages in the shared buffers, establish a + * critical section. 
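The H_BUCKET_BEING_SPLIT() and H_NEEDS_SPLIT_CLEANUP() tests used above reduce to flag checks on the page's special space; the real definitions live in access/hash.h alongside the new LH_* bits, presumably in this form:

	#define H_BUCKET_BEING_SPLIT(opaque) \
		(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
	#define H_BUCKET_BEING_POPULATED(opaque) \
		(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
	#define H_NEEDS_SPLIT_CLEANUP(opaque) \
		(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)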
Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back - * to disk. We don't really expect any failure, but just to be sure, - * establish a critical section. + * to disk. */ START_CRIT_SECTION(); + /* + * Okay to proceed with split. Update the metapage bucket mapping info. + */ metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) @@ -631,22 +836,22 @@ _hash_expandtable(Relation rel, Buffer metabuf) /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; + metap_update_masks = true; } /* - * If the split point is increasing (hashm_maxbucket's log base 2 - * increases), we need to adjust the hashm_spares[] array and - * hashm_ovflpoint so that future overflow pages will be created beyond - * this new batch of bucket pages. + * If the split point is increasing we need to adjust the hashm_spares[] + * array and hashm_ovflpoint so that future overflow pages will be created + * beyond this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; + metap_update_splitpoint = true; } - /* Done mucking with metapage */ - END_CRIT_SECTION(); + MarkBufferDirty(metabuf); /* * Copy bucket mapping info now; this saves re-accessing the meta page @@ -659,18 +864,92 @@ _hash_expandtable(Relation rel, Buffer metabuf) highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; - /* Write out the metapage and drop lock, but keep pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* + * Mark the old bucket to indicate that split is in progress. (At + * operation end, we will clear the split-in-progress flag.) Also, for a + * primary bucket page, hasho_prevblkno stores the number of buckets that + * existed as of the last split, so we must update that value here. + */ + oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; + oopaque->hasho_prevblkno = maxbucket; + + MarkBufferDirty(buf_oblkno); + + npage = BufferGetPage(buf_nblkno); + + /* + * initialize the new bucket's primary page and mark it to indicate that + * split is in progress. 
+ */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = maxbucket; + nopaque->hasho_nextblkno = InvalidBlockNumber; + nopaque->hasho_bucket = new_bucket; + nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; + nopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(buf_nblkno); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_split_allocate_page xlrec; + XLogRecPtr recptr; + + xlrec.new_bucket = maxbucket; + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + xlrec.flags = 0; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); + XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); + + if (metap_update_masks) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; + XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); + XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); + } + + if (metap_update_splitpoint) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; + XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, + sizeof(uint32)); + XLogRegisterBufData(2, + (char *) &metap->hashm_spares[metap->hashm_ovflpoint], + sizeof(uint32)); + } + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); + + PageSetLSN(BufferGetPage(buf_oblkno), recptr); + PageSetLSN(BufferGetPage(buf_nblkno), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, - start_oblkno, buf_nblkno, + buf_oblkno, buf_nblkno, NULL, maxbucket, highmask, lowmask); - /* Release bucket locks, allowing others to access them */ - _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); - _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + /* all done, now release the locks and pins on primary buckets. */ + _hash_relbuf(rel, buf_oblkno); + _hash_relbuf(rel, buf_nblkno); return; @@ -678,7 +957,7 @@ _hash_expandtable(Relation rel, Buffer metabuf) fail: /* We didn't write the metapage, so just drop lock */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); } @@ -698,7 +977,7 @@ fail: * hash indexes sequentially anyway, that probably doesn't matter. * * XXX It's annoying that this code is executed with the metapage lock held. - * We need to interlock against _hash_getovflpage() adding a new overflow page + * We need to interlock against _hash_addovflpage() adding a new overflow page * concurrently, but it'd likely be better to use LockRelationForExtension * for the purpose. OTOH, adding a splitpoint is a very infrequent operation, * so it may not be worth worrying about. @@ -711,6 +990,8 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; char zerobuf[BLCKSZ]; + Page page; + HashPageOpaque ovflopaque; lastblock = firstblock + nblocks - 1; @@ -721,7 +1002,29 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) if (lastblock < firstblock || lastblock == InvalidBlockNumber) return false; - MemSet(zerobuf, 0, sizeof(zerobuf)); + page = (Page) zerobuf; + + /* + * Initialize the page. Just zeroing the page won't work; see + * _hash_freeovflpage for similar usage. 
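The xl_hash_split_allocate_page record registered above pairs a small fixed header with per-buffer data hung off the metapage registration. From the assignments in this hunk, its layout is presumably:

	typedef struct xl_hash_split_allocate_page
	{
		uint32		new_bucket;			/* number of the bucket being added */
		uint16		old_bucket_flag;	/* hasho_flag of the bucket being split */
		uint16		new_bucket_flag;	/* hasho_flag of the bucket being populated */
		uint8		flags;				/* XLH_SPLIT_META_UPDATE_* bits */
	} xl_hash_split_allocate_page;

	/*
	 * Registered buffers: 0 = old bucket page, 1 = new bucket page
	 * (REGBUF_WILL_INIT), 2 = metapage, whose buffer data carries the
	 * updated masks and/or splitpoint when the corresponding flag is set.
	 */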
We take care to make the special + * space valid for the benefit of tools such as pageinspect. + */ + _hash_pageinit(page, BLCKSZ); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + if (RelationNeedsWAL(rel)) + log_newpage(&rel->rd_node, + MAIN_FORKNUM, + lastblock, + zerobuf, + true); RelationOpenSmgr(rel); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); @@ -733,18 +1036,28 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) /* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * + * This routine is used to partition the tuples between old and new bucket and + * is used to finish the incomplete split operations. To finish the previously + * interrupted split operation, the caller needs to fill htab. If htab is set, + * then we skip the movement of tuples that exists in htab, otherwise NULL + * value of htab indicates movement of all the tuples that belong to the new + * bucket. + * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that - * belong in the new bucket, and compress out any free space in the old - * bucket. + * belong in the new bucket. * - * The caller must hold exclusive locks on both buckets to ensure that + * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * + * Split needs to retain pin on primary bucket pages of both old and new + * buckets till end of operation. This is to prevent vacuum from starting + * while a split is in progress. + * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. That lock and * pin are released here. (The API is set up this way because we must do @@ -756,36 +1069,32 @@ _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, - BlockNumber start_oblkno, + Buffer obuf, Buffer nbuf, + HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { - Buffer obuf; + Buffer bucket_obuf; + Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size all_tups_size = 0; + int i; + uint16 nitups = 0; - /* - * It should be okay to simultaneously write-lock pages from each bucket, - * since no one else can be trying to acquire buffer lock on pages of - * either bucket. 
- */ - obuf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE); + bucket_obuf = obuf; opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); - - /* initialize the new bucket's primary page */ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); - nopaque->hasho_prevblkno = InvalidBlockNumber; - nopaque->hasho_nextblkno = InvalidBlockNumber; - nopaque->hasho_bucket = nbucket; - nopaque->hasho_flag = LH_BUCKET_PAGE; - nopaque->hasho_page_id = HASHO_PAGE_ID; /* * Partition the tuples in the old bucket between the old bucket and the @@ -798,8 +1107,6 @@ _hash_splitbucket(Relation rel, BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); @@ -810,54 +1117,86 @@ _hash_splitbucket(Relation rel, IndexTuple itup; Size itemsz; Bucket bucket; + bool found = false; + + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) + continue; /* - * Fetch the item's hash key (conveniently stored in the item) and - * determine which bucket it now belongs in. + * Before inserting a tuple, probe the hash table containing TIDs + * of tuples belonging to new bucket, if we find a match, then + * skip that tuple, else fetch the item's hash key (conveniently + * stored in the item) and determine which bucket it now belongs + * in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); + + if (htab) + (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); + + if (found) + continue; + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { + IndexTuple new_itup; + + /* + * make a copy of index tuple as we have to scribble on it. + */ + new_itup = CopyIndexTuple(itup); + + /* + * mark the index tuple as moved by split, such tuples are + * skipped by scan if there is split in progress for a bucket. + */ + new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; + /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. - * - * XXX we have a problem here if we fail to get space for a - * new overflow page: we'll error out leaving the bucket split - * only partially complete, meaning the index is corrupt, - * since searches may fail to find entries they should find. */ - itemsz = IndexTupleDSize(*itup); + itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); - if (PageGetFreeSpace(npage) < itemsz) + if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { - /* write out nbuf and drop lock, but keep pin */ - _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); + /* + * Change the shared buffer state in critical section, + * otherwise any error could make it unrecoverable. 
+ */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); + MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + /* chain to a new overflow page */ - nbuf = _hash_addovflpage(rel, metabuf, nbuf); + nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); - /* we don't need nopaque within the loop */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } - /* - * Insert tuple on new page, using _hash_pgaddtup to ensure - * correct ordering by hashkey. This is a tad inefficient - * since we may have to shuffle itempointers repeatedly. - * Possible future improvement: accumulate all the items for - * the new page and qsort them before insertion. - */ - (void) _hash_pgaddtup(rel, nbuf, itemsz, itup); - - /* - * Mark tuple for deletion from old page. - */ - deletable[ndeletable++] = ooffnum; + itups[nitups++] = new_itup; + all_tups_size += itemsz; } else { @@ -870,35 +1209,378 @@ _hash_splitbucket(Relation rel, oblkno = oopaque->hasho_nextblkno; - /* - * Done scanning this old page. If we moved any tuples, delete them - * from the old page. - */ - if (ndeletable > 0) - { - PageIndexMultiDelete(opage, deletable, ndeletable); - _hash_wrtbuf(rel, obuf); - } + /* retain the pin on the old primary bucket */ + if (obuf == bucket_obuf) + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) + { + /* + * Change the shared buffer state in critical section, otherwise + * any error could make it unrecoverable. + */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); + MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); break; + } /* Else, advance to next old page */ - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning - * the tuples. Before quitting, call _hash_squeezebucket to ensure the - * tuples remaining in the old bucket (including the overflow pages) are - * packed as tightly as possible. The new bucket is already tight. + * the tuples. Mark the old and new buckets to indicate split is + * finished. + * + * To avoid deadlocks due to locking order of buckets, first lock the old + * bucket and then the new bucket. 
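_hash_pgaddmultitup() is the batch counterpart of _hash_pgaddtup(); its definition is outside this section, but given how it is called above it presumably places each accumulated tuple at its binary-search position, along these lines:

	/*
	 * Sketch only: add nitups tuples to the page in hashkey order,
	 * recording the chosen offsets in itup_offsets for WAL replay.
	 */
	void
	_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
						OffsetNumber *itup_offsets, uint16 nitups)
	{
		Page		page;
		uint16		i;

		_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
		page = BufferGetPage(buf);

		for (i = 0; i < nitups; i++)
		{
			Size		itemsize = MAXALIGN(IndexTupleDSize(*itups[i]));
			uint32		hashkey = _hash_get_indextuple_hashkey(itups[i]);

			/* keep the page ordered by hash key */
			itup_offsets[i] = _hash_binsearch(page, hashkey);

			if (PageAddItem(page, (Item) itups[i], itemsize, itup_offsets[i],
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));
		}
	}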
+ */ + LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); + opage = BufferGetPage(bucket_obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); + npage = BufferGetPage(bucket_nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + START_CRIT_SECTION(); + + oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; + nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; + + /* + * After the split is finished, mark the old bucket to indicate that it + * contains deletable tuples. Vacuum will clear split-cleanup flag after + * deleting such tuples. + */ + oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; + + /* + * now write the buffers, here we don't release the locks as caller is + * responsible to release locks. + */ + MarkBufferDirty(bucket_obuf); + MarkBufferDirty(bucket_nbuf); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_split_complete xlrec; + + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + + XLogBeginInsert(); + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); + + XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); + XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); + + PageSetLSN(BufferGetPage(bucket_obuf), recptr); + PageSetLSN(BufferGetPage(bucket_nbuf), recptr); + } + + END_CRIT_SECTION(); +} + +/* + * _hash_finish_split() -- Finish the previously interrupted split operation + * + * To complete the split operation, we form the hash table of TIDs in new + * bucket which is then used by split operation to skip tuples that are + * already moved before the split operation was previously interrupted. + * + * The caller must hold a pin, but no lock, on the metapage and old bucket's + * primary page buffer. The buffers are returned in the same state. (The + * metapage is only touched if it becomes necessary to add or remove overflow + * pages.) + */ +void +_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, + uint32 maxbucket, uint32 highmask, uint32 lowmask) +{ + HASHCTL hash_ctl; + HTAB *tidhtab; + Buffer bucket_nbuf = InvalidBuffer; + Buffer nbuf; + Page npage; + BlockNumber nblkno; + BlockNumber bucket_nblkno; + HashPageOpaque npageopaque; + Bucket nbucket; + bool found; + + /* Initialize hash tables used to track TIDs */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(ItemPointerData); + hash_ctl.entrysize = sizeof(ItemPointerData); + hash_ctl.hcxt = CurrentMemoryContext; + + tidhtab = + hash_create("bucket ctids", + 256, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket); + + /* + * Scan the new bucket and build hash table of TIDs + */ + for (;;) + { + OffsetNumber noffnum; + OffsetNumber nmaxoffnum; + + nbuf = _hash_getbuf(rel, nblkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + /* remember the primary bucket buffer to acquire cleanup lock on it. */ + if (nblkno == bucket_nblkno) + bucket_nbuf = nbuf; + + npage = BufferGetPage(nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + /* Scan each tuple in new page */ + nmaxoffnum = PageGetMaxOffsetNumber(npage); + for (noffnum = FirstOffsetNumber; + noffnum <= nmaxoffnum; + noffnum = OffsetNumberNext(noffnum)) + { + IndexTuple itup; + + /* Fetch the item's TID and insert it in hash table. 
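The TID table built above is ordinary dynahash usage: with keysize == entrysize == sizeof(ItemPointerData), the table is effectively a set of heap TIDs. Its lifecycle, condensed from this function and from _hash_splitbucket above:

	/* build phase: remember every TID already present in the new bucket */
	(void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);

	/*
	 * probe phase, inside _hash_splitbucket: skip tuples whose TIDs were
	 * already moved before the split was interrupted
	 */
	(void) hash_search(tidhtab, &itup->t_tid, HASH_FIND, &found);
	if (found)
		continue;

	/* teardown */
	hash_destroy(tidhtab);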
*/ + itup = (IndexTuple) PageGetItem(npage, + PageGetItemId(npage, noffnum)); + + (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); + + Assert(!found); + } + + nblkno = npageopaque->hasho_nextblkno; + + /* + * release our lock without modifying the buffer, and make sure to + * retain the pin on the primary bucket. + */ + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); + + /* Exit loop if no more overflow pages in new bucket */ + if (!BlockNumberIsValid(nblkno)) + break; + } + + /* + * Conditionally get the cleanup lock on old and new buckets to perform + * the split operation. If we don't get the cleanup locks, silently give + * up; the next insertion into the old bucket will try again to complete + * the split. + */ + if (!ConditionalLockBufferForCleanup(obuf)) + { + hash_destroy(tidhtab); + return; + } + if (!ConditionalLockBufferForCleanup(bucket_nbuf)) + { + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); + hash_destroy(tidhtab); + return; + } + + npage = BufferGetPage(bucket_nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nbucket = npageopaque->hasho_bucket; + + _hash_splitbucket(rel, metabuf, obucket, + nbucket, obuf, bucket_nbuf, tidhtab, + maxbucket, highmask, lowmask); + + _hash_relbuf(rel, bucket_nbuf); + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); + hash_destroy(tidhtab); +} + +/* + * log_split_page() -- Log the split operation + * + * We log the split operation when a new page in the new bucket gets full, + * so we log the entire page. + * + * 'buf' must be locked by the caller, which is also responsible for + * unlocking it. + */ +static void +log_split_page(Relation rel, Buffer buf) +{ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + } +} + +/* + * _hash_getcachedmetap() -- Returns cached metapage data. + * + * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on + * the metapage. If not set, we'll set it before returning if we have to + * refresh the cache, and return with a pin but no lock on it; caller is + * responsible for releasing the pin. + * + * We refresh the cache if it's not initialized yet or force_refresh is true. + */ +HashMetaPage +_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) +{ + Page page; + + Assert(metabuf); + if (force_refresh || rel->rd_amcache == NULL) + { + char *cache = NULL; + + /* + * It's important that we don't set rd_amcache to an invalid value. + * Either MemoryContextAlloc or _hash_getbuf could fail, so don't + * install a pointer to the newly-allocated storage in the actual + * relcache entry until both have succeeded. + */ + if (rel->rd_amcache == NULL) + cache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(HashMetaPageData)); + + /* Read the metapage. */ + if (BufferIsValid(*metabuf)) + LockBuffer(*metabuf, BUFFER_LOCK_SHARE); + else + *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, + LH_META_PAGE); + page = BufferGetPage(*metabuf); + + /* Populate the cache. */ + if (rel->rd_amcache == NULL) + rel->rd_amcache = cache; + memcpy(rel->rd_amcache, HashPageGetMeta(page), + sizeof(HashMetaPageData)); + + /* Release metapage lock, but keep the pin.
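A minimal usage sketch of the metapage cache, mirroring _hash_getbucketbuf_from_hashkey() below (nothing here beyond what the function's comments promise):

	Buffer		metabuf = InvalidBuffer;
	HashMetaPage metap;

	/* use the cached copy if we have one; pins the metapage only on refresh */
	metap = _hash_getcachedmetap(rel, &metabuf, false);

	/* ... map hash keys to buckets using metap ... */

	/* if the mapping proved stale, force a refresh and retry */
	metap = _hash_getcachedmetap(rel, &metabuf, true);

	/* drop the metapage pin, if the calls above had to take one */
	if (BufferIsValid(metabuf))
		_hash_dropbuf(rel, metabuf);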
*/ + LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK); + } + + return (HashMetaPage) rel->rd_amcache; +} + +/* + * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given + * hashkey. + * + * Bucket pages do not move or get removed once they are allocated. This give + * us an opportunity to use the previously saved metapage contents to reach + * the target bucket buffer, instead of reading from the metapage every time. + * This saves one buffer access every time we want to reach the target bucket + * buffer, which is very helpful savings in bufmgr traffic and contention. + * + * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the + * bucket buffer has to be locked for reading or writing. + * + * The out parameter cachedmetap is set with metapage contents used for + * hashkey to bucket buffer mapping. Some callers need this info to reach the + * old bucket in case of bucket split, see _hash_doinsert(). + */ +Buffer +_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, + HashMetaPage *cachedmetap) +{ + HashMetaPage metap; + Buffer buf; + Buffer metabuf = InvalidBuffer; + Page page; + Bucket bucket; + BlockNumber blkno; + HashPageOpaque opaque; + + /* We read from target bucket buffer, hence locking is must. */ + Assert(access == HASH_READ || access == HASH_WRITE); + + metap = _hash_getcachedmetap(rel, &metabuf, false); + Assert(metap != NULL); + + /* + * Loop until we get a lock on the correct target bucket. */ - _hash_wrtbuf(rel, nbuf); + for (;;) + { + /* + * Compute the target bucket number, and convert to block number. + */ + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); + + blkno = BUCKET_TO_BLKNO(metap, bucket); + + /* Fetch the primary bucket page for the bucket */ + buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); + Assert(opaque->hasho_prevblkno != InvalidBlockNumber); - _hash_squeezebucket(rel, obucket, start_oblkno, NULL); + /* + * If this bucket hasn't been split, we're done. + */ + if (opaque->hasho_prevblkno <= metap->hashm_maxbucket) + break; + + /* Drop lock on this buffer, update cached metapage, and retry. */ + _hash_relbuf(rel, buf); + metap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(metap != NULL); + } + + if (BufferIsValid(metabuf)) + _hash_dropbuf(rel, metabuf); + + if (cachedmetap) + *cachedmetap = metap; + + return buf; } diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c deleted file mode 100644 index fe97ef201a..0000000000 --- a/src/backend/access/hash/hashscan.c +++ /dev/null @@ -1,153 +0,0 @@ -/*------------------------------------------------------------------------- - * - * hashscan.c - * manage scans on hash tables - * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/hash/hashscan.c - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include "access/hash.h" -#include "access/relscan.h" -#include "utils/memutils.h" -#include "utils/rel.h" -#include "utils/resowner.h" - - -/* - * We track all of a backend's active scans on hash indexes using a list - * of HashScanListData structs, which are allocated in TopMemoryContext. 
- * It's okay to use a long-lived context because we rely on the ResourceOwner - * mechanism to clean up unused entries after transaction or subtransaction - * abort. We can't safely keep the entries in the executor's per-query - * context, because that might be already freed before we get a chance to - * clean up the list. (XXX seems like there should be a better way to - * manage this...) - */ -typedef struct HashScanListData -{ - IndexScanDesc hashsl_scan; - ResourceOwner hashsl_owner; - struct HashScanListData *hashsl_next; -} HashScanListData; - -typedef HashScanListData *HashScanList; - -static HashScanList HashScans = NULL; - - -/* - * ReleaseResources_hash() --- clean up hash subsystem resources. - * - * This is here because it needs to touch this module's static var HashScans. - */ -void -ReleaseResources_hash(void) -{ - HashScanList l; - HashScanList prev; - HashScanList next; - - /* - * Release all HashScanList items belonging to the current ResourceOwner. - * Note that we do not release the underlying IndexScanDesc; that's in - * executor memory and will go away on its own (in fact quite possibly has - * gone away already, so we mustn't try to touch it here). - * - * Note: this should be a no-op during normal query shutdown. However, in - * an abort situation ExecutorEnd is not called and so there may be open - * index scans to clean up. - */ - prev = NULL; - - for (l = HashScans; l != NULL; l = next) - { - next = l->hashsl_next; - if (l->hashsl_owner == CurrentResourceOwner) - { - if (prev == NULL) - HashScans = next; - else - prev->hashsl_next = next; - - pfree(l); - /* prev does not change */ - } - else - prev = l; - } -} - -/* - * _hash_regscan() -- register a new scan. - */ -void -_hash_regscan(IndexScanDesc scan) -{ - HashScanList new_el; - - new_el = (HashScanList) MemoryContextAlloc(TopMemoryContext, - sizeof(HashScanListData)); - new_el->hashsl_scan = scan; - new_el->hashsl_owner = CurrentResourceOwner; - new_el->hashsl_next = HashScans; - HashScans = new_el; -} - -/* - * _hash_dropscan() -- drop a scan from the scan list - */ -void -_hash_dropscan(IndexScanDesc scan) -{ - HashScanList chk, - last; - - last = NULL; - for (chk = HashScans; - chk != NULL && chk->hashsl_scan != scan; - chk = chk->hashsl_next) - last = chk; - - if (chk == NULL) - elog(ERROR, "hash scan list trashed; cannot find 0x%p", (void *) scan); - - if (last == NULL) - HashScans = chk->hashsl_next; - else - last->hashsl_next = chk->hashsl_next; - - pfree(chk); -} - -/* - * Is there an active scan in this bucket? 
- */ -bool -_hash_has_active_scan(Relation rel, Bucket bucket) -{ - Oid relid = RelationGetRelid(rel); - HashScanList l; - - for (l = HashScans; l != NULL; l = l->hashsl_next) - { - if (relid == l->hashsl_scan->indexRelation->rd_id) - { - HashScanOpaque so = (HashScanOpaque) l->hashsl_scan->opaque; - - if (so->hashso_bucket_valid && - so->hashso_bucket == bucket) - return true; - } - } - - return false; -} diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 48255584e1..2d9204903f 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -3,7 +3,7 @@ * hashsearch.c * search code for postgres hash tables * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -63,47 +63,149 @@ _hash_next(IndexScanDesc scan, ScanDirection dir) } /* - * Advance to next page in a bucket, if any. + * Advance to next page in a bucket, if any. If we are scanning the bucket + * being populated during split operation then this function advances to the + * bucket being split after the last bucket page of bucket being populated. */ static void -_hash_readnext(Relation rel, +_hash_readnext(IndexScanDesc scan, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) { BlockNumber blkno; + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool block_found = false; blkno = (*opaquep)->hasho_nextblkno; - _hash_relbuf(rel, *bufp); + + /* + * Retain the pin on primary bucket page till the end of scan. Refer the + * comments in _hash_first to know the reason of retaining pin. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + LockBuffer(*bufp, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, *bufp); + *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); if (BlockNumberIsValid(blkno)) { *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + block_found = true; + } + else if (so->hashso_buc_populated && !so->hashso_buc_split) + { + /* + * end of bucket, scan bucket being split if there was a split in + * progress at the start of scan. + */ + *bufp = so->hashso_split_bucket_buf; + + /* + * buffer for bucket being split must be valid as we acquire the pin + * on it before the start of scan and retain it till end of scan. + */ + Assert(BufferIsValid(*bufp)); + + LockBuffer(*bufp, BUFFER_LOCK_SHARE); + + /* + * setting hashso_buc_split to true indicates that we are scanning + * bucket being split. + */ + so->hashso_buc_split = true; + + block_found = true; + } + + if (block_found) + { *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } } /* - * Advance to previous page in a bucket, if any. + * Advance to previous page in a bucket, if any. If the current scan has + * started during split operation then this function advances to bucket + * being populated after the first bucket page of bucket being split. 
*/ static void -_hash_readprev(Relation rel, +_hash_readprev(IndexScanDesc scan, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) { BlockNumber blkno; + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool haveprevblk; blkno = (*opaquep)->hasho_prevblkno; - _hash_relbuf(rel, *bufp); + + /* + * Retain the pin on primary bucket page till the end of scan. Refer the + * comments in _hash_first to know the reason of retaining pin. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + { + LockBuffer(*bufp, BUFFER_LOCK_UNLOCK); + haveprevblk = false; + } + else + { + _hash_relbuf(rel, *bufp); + haveprevblk = true; + } + *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); - if (BlockNumberIsValid(blkno)) + + if (haveprevblk) { + Assert(BlockNumberIsValid(blkno)); *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + + /* + * We always maintain the pin on bucket page for whole scan operation, + * so releasing the additional pin we have acquired here. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + _hash_dropbuf(rel, *bufp); + } + else if (so->hashso_buc_populated && so->hashso_buc_split) + { + /* + * end of bucket, scan bucket being populated if there was a split in + * progress at the start of scan. + */ + *bufp = so->hashso_bucket_buf; + + /* + * buffer for bucket being populated must be valid as we acquire the + * pin on it before the start of scan and retain it till end of scan. + */ + Assert(BufferIsValid(*bufp)); + + LockBuffer(*bufp, BUFFER_LOCK_SHARE); + *pagep = BufferGetPage(*bufp); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + + /* move to the end of bucket chain */ + while (BlockNumberIsValid((*opaquep)->hasho_nextblkno)) + _hash_readnext(scan, bufp, pagep, opaquep); + + /* + * setting hashso_buc_split to false indicates that we are scanning + * bucket being populated. + */ + so->hashso_buc_split = false; } } @@ -124,14 +226,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) ScanKey cur; uint32 hashkey; Bucket bucket; - BlockNumber blkno; - BlockNumber oldblkno = InvalidBuffer; - bool retry = false; Buffer buf; - Buffer metabuf; Page page; HashPageOpaque opaque; - HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; @@ -186,70 +283,77 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) so->hashso_sk_hash = hashkey; - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - page = BufferGetPage(metabuf); - metap = HashPageGetMeta(page); + buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL); + page = BufferGetPage(buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = opaque->hasho_bucket; + + so->hashso_bucket_buf = buf; /* - * Loop until we get a lock on the correct target bucket. + * If a bucket split is in progress, then while scanning the bucket being + * populated, we need to skip tuples that were copied from bucket being + * split. We also need to maintain a pin on the bucket being split to + * ensure that split-cleanup work done by vacuum doesn't remove tuples + * from it till this scan is done. 
We need to maintain a pin on the + * bucket being populated to ensure that vacuum doesn't squeeze that + * bucket till this scan is complete; otherwise, the ordering of tuples + * can't be maintained during forward and backward scans. Here, we have + * to be cautious about locking order: first, acquire the lock on bucket + * being split; then, release the lock on it but not the pin; then, + * acquire a lock on bucket being populated and again re-verify whether + * the bucket split is still in progress. Acquiring the lock on bucket + * being split first ensures that the vacuum waits for this scan to + * finish. */ - for (;;) + if (H_BUCKET_BEING_POPULATED(opaque)) { - /* - * Compute the target bucket number, and convert to block number. - */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); + BlockNumber old_blkno; + Buffer old_buf; - blkno = BUCKET_TO_BLKNO(metap, bucket); - - /* Release metapage lock, but keep pin. */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket); /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old lock - * and lock what now appears to be the correct bucket. + * release the lock on new bucket and re-acquire it after acquiring + * the lock on old bucket. */ - if (retry) - { - if (oldblkno == blkno) - break; - _hash_droplock(rel, oldblkno, HASH_SHARE); - } - _hash_getlock(rel, blkno, HASH_SHARE); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE); + TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf)); /* - * Reacquire metapage lock and check that no bucket split has taken - * place while we were awaiting the bucket lock. + * remember the split bucket buffer so as to use it later for + * scanning. */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); - oldblkno = blkno; - retry = true; - } - - /* done with the metapage */ - _hash_dropbuf(rel, metabuf); + so->hashso_split_bucket_buf = old_buf; + LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); - /* Update scan opaque state to show we have lock on the bucket */ - so->hashso_bucket = bucket; - so->hashso_bucket_valid = true; - so->hashso_bucket_blkno = blkno; + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); - page = BufferGetPage(buf); - opaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(opaque->hasho_bucket == bucket); + if (H_BUCKET_BEING_POPULATED(opaque)) + so->hashso_buc_populated = true; + else + { + _hash_dropbuf(rel, so->hashso_split_bucket_buf); + so->hashso_split_bucket_buf = InvalidBuffer; + } + } /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { - while (BlockNumberIsValid(opaque->hasho_nextblkno)) - _hash_readnext(rel, &buf, &page, &opaque); + /* + * Backward scans that start during split needs to start from end of + * bucket being split. 
+ */ + while (BlockNumberIsValid(opaque->hasho_nextblkno) || + (so->hashso_buc_populated && !so->hashso_buc_split)) + _hash_readnext(scan, &buf, &page, &opaque); } /* Now find the first tuple satisfying the qualification */ @@ -273,6 +377,12 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) * false. Else, return true and set the hashso_curpos for the * scan to the right thing. * + * Here we need to ensure that if the scan has started during split, then + * skip the tuples that are moved by split while scanning bucket being + * populated and then scan the bucket being split to cover all such + * tuples. This is done to ensure that we don't miss tuples in the scans + * that are started during split. + * * 'bufP' points to the current buffer, which is pinned and read-locked. * On success exit, we have pin and read-lock on whichever page * contains the right item; on failure, we have released all buffers. @@ -338,14 +448,31 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Assert(offnum >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + /* + * skip the tuples that are moved by split operation + * for the scan that has started when split was in + * progress + */ + if (so->hashso_buc_populated && !so->hashso_buc_split && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) + { + offnum = OffsetNumberNext(offnum); /* move forward */ + continue; + } + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _hash_kill_items(scan); + /* * ran off the end of this page, try the next */ - _hash_readnext(rel, &buf, &page, &opaque); + _hash_readnext(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); @@ -353,7 +480,6 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) } else { - /* end of bucket */ itup = NULL; break; /* exit for-loop */ } @@ -379,22 +505,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Assert(offnum <= maxoff); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + /* + * skip the tuples that are moved by split operation + * for the scan that has started when split was in + * progress + */ + if (so->hashso_buc_populated && !so->hashso_buc_split && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) + { + offnum = OffsetNumberPrev(offnum); /* move back */ + continue; + } + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _hash_kill_items(scan); + /* * ran off the end of this page, try the next */ - _hash_readprev(rel, &buf, &page, &opaque); + _hash_readprev(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { + TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } else { - /* end of bucket */ itup = NULL; break; /* exit for-loop */ } @@ -410,9 +553,16 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (itup == NULL) { - /* we ran off the end of the bucket without finding a match */ + /* + * We ran off the end of the bucket without finding a match. + * Release the pin on bucket buffers. Normally, such pins are + * released at end of scan, however scrolling cursors can + * reacquire the bucket lock and pin in the same scan multiple + * times. 
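Both scan directions apply the same moved-by-split filter. INDEX_MOVED_BY_SPLIT_MASK is defined outside this section; it is presumably the t_info bit reserved for access-method use, so the test is plain bit arithmetic:

	/* assumed definition, using the AM-reserved t_info bit from itup.h */
	#define INDEX_MOVED_BY_SPLIT_MASK	INDEX_AM_RESERVED_BIT

	/* the filter used in both loops above */
	if (so->hashso_buc_populated && !so->hashso_buc_split &&
		(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
		continue;		/* this copy belongs to the bucket being populated;
						 * the original will be seen in the bucket being
						 * split */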
+ */ *bufP = so->hashso_curbuf = InvalidBuffer; ItemPointerSetInvalid(current); + _hash_dropscanbuf(rel, so); return false; } diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c index 8938ab5b24..41d615df8b 100644 --- a/src/backend/access/hash/hashsort.c +++ b/src/backend/access/hash/hashsort.c @@ -14,7 +14,7 @@ * plenty of locality of access. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -37,7 +37,15 @@ struct HSpool { Tuplesortstate *sortstate; /* state data for tuplesort.c */ Relation index; - uint32 hash_mask; /* bitmask for hash codes */ + + /* + * We sort the hash keys based on the buckets they belong to. Below masks + * are used in _hash_hashkey2bucket to determine the bucket of given hash + * key. + */ + uint32 high_mask; + uint32 low_mask; + uint32 max_buckets; }; @@ -56,11 +64,12 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets) * num_buckets buckets in the index, the appropriate mask can be computed * as follows. * - * Note: at present, the passed-in num_buckets is always a power of 2, so - * we could just compute num_buckets - 1. We prefer not to assume that - * here, though. + * NOTE : This hash mask calculation should be in sync with similar + * calculation in _hash_init_metabuffer. */ - hspool->hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1; + hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1; + hspool->low_mask = (hspool->high_mask >> 1); + hspool->max_buckets = num_buckets - 1; /* * We size the sort area as maintenance_work_mem rather than work_mem to @@ -69,7 +78,9 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets) */ hspool->sortstate = tuplesort_begin_index_hash(heap, index, - hspool->hash_mask, + hspool->high_mask, + hspool->low_mask, + hspool->max_buckets, maintenance_work_mem, false); @@ -101,18 +112,16 @@ _h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull) * create an entire index. */ void -_h_indexbuild(HSpool *hspool) +_h_indexbuild(HSpool *hspool, Relation heapRel) { IndexTuple itup; - bool should_free; #ifdef USE_ASSERT_CHECKING uint32 hashkey = 0; #endif tuplesort_performsort(hspool->sortstate); - while ((itup = tuplesort_getindextuple(hspool->sortstate, - true, &should_free)) != NULL) + while ((itup = tuplesort_getindextuple(hspool->sortstate, true)) != NULL) { /* * Technically, it isn't critical that hash keys be found in sorted @@ -124,12 +133,12 @@ _h_indexbuild(HSpool *hspool) #ifdef USE_ASSERT_CHECKING uint32 lasthashkey = hashkey; - hashkey = _hash_get_indextuple_hashkey(itup) & hspool->hash_mask; + hashkey = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + hspool->max_buckets, hspool->high_mask, + hspool->low_mask); Assert(hashkey >= lasthashkey); #endif - _hash_doinsert(hspool->index, itup); - if (should_free) - pfree(itup); + _hash_doinsert(hspool->index, itup, heapRel); } } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 822862db7a..c513c3b842 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -3,7 +3,7 @@ * hashutil.c * Utility code for Postgres hash implementation. 
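Both the sort path above and the scan path funnel bucket selection through _hash_hashkey2bucket(). That helper predates this patch and is unchanged; for context, its logic is simply:

	Bucket
	_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
						 uint32 highmask, uint32 lowmask)
	{
		Bucket		bucket;

		bucket = hashkey & highmask;
		if (bucket > maxbucket)
			bucket = bucket & lowmask;

		return bucket;
	}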
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -19,7 +19,10 @@ #include "access/relscan.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "storage/buf_internals.h" +#define CALC_NEW_BUCKET(old_bucket, lowmask) \ + ((old_bucket) | ((lowmask) + 1)) /* * _hash_checkqual -- does the index tuple satisfy the scan conditions? @@ -147,10 +150,76 @@ _hash_log2(uint32 num) } /* + * _hash_spareindex -- returns spare index / global splitpoint phase of the + * bucket + */ +uint32 +_hash_spareindex(uint32 num_bucket) +{ + uint32 splitpoint_group; + uint32 splitpoint_phases; + + splitpoint_group = _hash_log2(num_bucket); + + if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + return splitpoint_group; + + /* account for single-phase groups */ + splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; + + /* account for multi-phase groups before splitpoint_group */ + splitpoint_phases += + ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) << + HASH_SPLITPOINT_PHASE_BITS); + + /* account for phases within current group */ + splitpoint_phases += + (((num_bucket - 1) >> + (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) & + HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */ + + return splitpoint_phases; +} + +/* + * _hash_get_totalbuckets -- returns total number of buckets allocated up to + * the given splitpoint phase. + */ +uint32 +_hash_get_totalbuckets(uint32 splitpoint_phase) +{ + uint32 splitpoint_group; + uint32 total_buckets; + uint32 phases_within_splitpoint_group; + + if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + return (1 << splitpoint_phase); + + /* get splitpoint's group */ + splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; + splitpoint_group += + ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >> + HASH_SPLITPOINT_PHASE_BITS); + + /* account for buckets before splitpoint_group */ + total_buckets = (1 << (splitpoint_group - 1)); + + /* account for buckets within splitpoint_group */ + phases_within_splitpoint_group = + (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) & + HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */ + total_buckets += + (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) * + phases_within_splitpoint_group); + + return total_buckets; +} + +/* * _hash_checkpage -- sanity checks on the format of all hash pages * - * If flags is not zero, it is a bitwise OR of the acceptable values of - * hasho_flag. + * If flags is not zero, it is a bitwise OR of the acceptable page types + * (values of hasho_flag & LH_PAGE_TYPE). */ void _hash_checkpage(Relation rel, Buffer buf, int flags) @@ -352,3 +421,163 @@ _hash_binsearch_last(Page page, uint32 hash_value) return lower; } + +/* + * _hash_get_oldblock_from_newbucket() -- get the block number of the bucket + * from which the current (new) bucket is being split. + */ +BlockNumber +_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket) +{ + Bucket old_bucket; + uint32 mask; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + /* + * To get the old bucket from the current bucket, we need a mask to modulo + * the bucket number into the lower half of the table. This mask is stored + * in the meta page as hashm_lowmask, but we can't rely on it here, because + * we need the value of lowmask that was in effect when the bucket split + * started.
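The splitpoint arithmetic in _hash_spareindex and _hash_get_totalbuckets is easier to check with concrete numbers. The standalone harness below is a sketch, not part of the patch; it assumes the constants have the values the patch uses elsewhere (ten one-phase groups, two phase bits) and reproduces the growth pattern around the switch from doubling to four-phase expansion:

    #include <stdio.h>
    #include <stdint.h>

    #define ONE_PHASE_GROUPS 10                 /* assumed value */
    #define PHASE_BITS       2                  /* assumed value */
    #define PHASE_MASK       ((1 << PHASE_BITS) - 1)

    static uint32_t
    total_buckets(uint32_t phase)
    {
        uint32_t group, phases_done;

        if (phase < ONE_PHASE_GROUPS)
            return (uint32_t) 1 << phase;       /* early groups just double */
        group = ONE_PHASE_GROUPS + ((phase - ONE_PHASE_GROUPS) >> PHASE_BITS);
        phases_done = ((phase - ONE_PHASE_GROUPS) & PHASE_MASK) + 1;
        /* buckets before this group, plus a quarter of that per phase done */
        return ((uint32_t) 1 << (group - 1)) +
            (((uint32_t) 1 << (group - 1)) >> PHASE_BITS) * phases_done;
    }

    int
    main(void)
    {
        /* expect 512, 640, 768, 896, 1024 for phases 9..13 */
        for (uint32_t p = 9; p <= 13; p++)
            printf("phase %u -> %u buckets\n", p, total_buckets(p));
        return 0;
    }
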
Masking the most significant bit of new bucket would give us + * old bucket. + */ + mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1; + old_bucket = new_bucket & mask; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + blkno = BUCKET_TO_BLKNO(metap, old_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket + * that will be generated after split from old bucket. + * + * This is used to find the new bucket from old bucket based on current table + * half. It is mainly required to finish the incomplete splits where we are + * sure that not more than one bucket could have split in progress from old + * bucket. + */ +BlockNumber +_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket) +{ + Bucket new_bucket; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket, + metap->hashm_lowmask, + metap->hashm_maxbucket); + blkno = BUCKET_TO_BLKNO(metap, new_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be + * generated after split from current (old) bucket. + * + * This is used to find the new bucket from old bucket. New bucket can be + * obtained by OR'ing old bucket with most significant bit of current table + * half (lowmask passed in this function can be used to identify msb of + * current table half). There could be multiple buckets that could have + * been split from current bucket. We need the first such bucket that exists. + * Caller must ensure that no more than one split has happened from old + * bucket. + */ +Bucket +_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, + uint32 lowmask, uint32 maxbucket) +{ + Bucket new_bucket; + + new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); + if (new_bucket > maxbucket) + { + lowmask = lowmask >> 1; + new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); + } + + return new_bucket; +} + +/* + * _hash_kill_items - set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * scan->opaque, referenced locally through so, contains information about the + * current page and killed tuples thereon (generally, this should only be + * called if so->numKilled > 0). + * + * We match items by heap TID before assuming they are the right ones to + * delete. + */ +void +_hash_kill_items(IndexScanDesc scan) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Page page; + HashPageOpaque opaque; + OffsetNumber offnum, + maxoff; + int numKilled = so->numKilled; + int i; + bool killedsomething = false; + + Assert(so->numKilled > 0); + Assert(so->killedItems != NULL); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. 
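Stepping back to the bucket-mapping helpers above: a worked example makes the old/new relationship concrete. With lowmask = 7 (a table half of 8 buckets), bucket 5 (binary 101) splits into bucket 13 (binary 1101); clearing the most significant bit of 13 recovers 5. A sketch (msb_mask is a portable stand-in for the fls()-based mask computation in _hash_get_oldblock_from_newbucket):

    #include <assert.h>

    #define CALC_NEW_BUCKET(old_bucket, lowmask) \
        ((old_bucket) | ((lowmask) + 1))

    /* mask covering all bits below the most significant set bit of x */
    static unsigned int
    msb_mask(unsigned int x)
    {
        unsigned int m = 1;

        while (m <= x / 2)
            m <<= 1;
        return m - 1;
    }

    static void
    check_bucket_mapping(void)
    {
        unsigned int old_bucket = 5, lowmask = 7;
        unsigned int new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);

        assert(new_bucket == 13);
        assert((new_bucket & msb_mask(new_bucket)) == old_bucket);
    }
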
+ */ + so->numKilled = 0; + + page = BufferGetPage(so->hashso_curbuf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + offnum = so->killedItems[i].indexOffset; + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) + { + /* found the item */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. Whenever + * we mark anything LP_DEAD, we also set the page's + * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. + */ + if (killedsomething) + { + opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; + MarkBufferDirtyHint(so->hashso_curbuf, true); + } +} diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c index d8c5ed4d98..f914c015bd 100644 --- a/src/backend/access/hash/hashvalidate.c +++ b/src/backend/access/hash/hashvalidate.c @@ -3,7 +3,7 @@ * hashvalidate.c * Opclass validator for hash. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -25,6 +25,7 @@ #include "parser/parse_coerce.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/regproc.h" #include "utils/syscache.h" diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 2368340b08..05fd372664 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -3,7 +3,7 @@ * heapam.c * heap access method code * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -38,6 +38,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/hio.h" @@ -98,11 +99,8 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tup, bool all_visible_cleared, bool new_all_visible_cleared); -static void HeapSatisfiesHOTandKeyUpdate(Relation relation, - Bitmapset *hot_attrs, - Bitmapset *key_attrs, Bitmapset *id_attrs, - bool *satisfies_hot, bool *satisfies_key, - bool *satisfies_id, +static Bitmapset *HeapDetermineModifiedColumns(Relation relation, + Bitmapset *interesting_cols, HeapTuple oldtup, HeapTuple newtup); static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, LockWaitPolicy wait_policy, @@ -1134,7 +1132,7 @@ relation_open(Oid relationId, LOCKMODE lockmode) /* Make note that we've accessed a temporary relation */ if (RelationUsesLocalBuffers(r)) - MyXactAccessedTempRel = true; + MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL; pgstat_initstats(r); @@ -1180,7 +1178,7 @@ try_relation_open(Oid relationId, LOCKMODE lockmode) /* Make note that we've accessed a temporary relation */ if (RelationUsesLocalBuffers(r)) - MyXactAccessedTempRel = true; + MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL; pgstat_initstats(r); @@ -1760,6 +1758,22 @@ retry: } /* ---------------- + * heap_update_snapshot + * + * Update 
snapshot info in heap scan descriptor. + * ---------------- + */ +void +heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot) +{ + Assert(IsMVCCSnapshot(snapshot)); + + RegisterSnapshot(snapshot); + scan->rs_snapshot = snapshot; + scan->rs_temp_snap = true; +} + +/* ---------------- * heap_getnext - retrieve next tuple in scan * * Fix to work with index relations. @@ -2337,6 +2351,17 @@ FreeBulkInsertState(BulkInsertState bistate) pfree(bistate); } +/* + * ReleaseBulkInsertStatePin - release a buffer currently held in bistate + */ +void +ReleaseBulkInsertStatePin(BulkInsertState bistate) +{ + if (bistate->current_buf != InvalidBuffer) + ReleaseBuffer(bistate->current_buf); + bistate->current_buf = InvalidBuffer; +} + /* * heap_insert - insert tuple into a heap @@ -2520,7 +2545,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, heaptup->t_len - SizeofHeapTupleHeader); /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP_ID, info); @@ -2862,7 +2887,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, XLogRegisterBufData(0, tupledata, totaldatalen); /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP2_ID, info); @@ -3324,7 +3349,7 @@ l1: } /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); @@ -3351,7 +3376,7 @@ l1: Assert(!HeapTupleHasExternal(&tp)); } else if (HeapTupleHasExternal(&tp)) - toast_delete(relation, &tp); + toast_delete(relation, &tp, false); /* * Mark tuple for invalidation from system caches at next command @@ -3459,6 +3484,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, Bitmapset *hot_attrs; Bitmapset *key_attrs; Bitmapset *id_attrs; + Bitmapset *interesting_attrs; + Bitmapset *modified_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -3476,10 +3503,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, pagefree; bool have_tuple_lock = false; bool iscombo; - bool satisfies_hot; - bool satisfies_key; - bool satisfies_id; bool use_hot_update = false; + bool hot_attrs_checked = false; bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false; @@ -3505,26 +3530,51 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, errmsg("cannot update tuples during a parallel operation"))); /* - * Fetch the list of attributes to be checked for HOT update. This is - * wasted effort if we fail to update or have to put the new tuple on a - * different page. But we must compute the list before obtaining buffer - * lock --- in the worst case, if we are doing an update on one of the - * relevant system catalogs, we could deadlock if we try to fetch the list - * later. In any case, the relcache caches the data so this is usually - * pretty cheap. + * Fetch the list of attributes to be checked for various operations. + * + * For HOT considerations, this is wasted effort if we fail to update or + * have to put the new tuple on a different page. But we must compute the + * list before obtaining buffer lock --- in the worst case, if we are + * doing an update on one of the relevant system catalogs, we could + * deadlock if we try to fetch the list later. In any case, the relcache + * caches the data so this is usually pretty cheap. 
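ReleaseBulkInsertStatePin above closes a small gap in the bulk-insert API: a long-lived BulkInsertState keeps the most recently used page pinned between calls. A hypothetical caller that redirects a bulk load to a different target relation (the function name here is illustrative) would drop that pin at the switch:

    static void
    loader_switch_target(BulkInsertState bistate)
    {
        /*
         * Don't keep a pin on a page of the previous target while we fill
         * the next one; the next heap_insert() using this bistate will
         * simply pin a fresh buffer.
         */
        ReleaseBulkInsertStatePin(bistate);
    }
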
* - * Note that we get a copy here, so we need not worry about relcache flush - * happening midway through. + * We also need columns used by the replica identity and columns that are + * considered the "key" of rows in the table. + * + * Note that we get copies of each bitmap, so we need not worry about + * relcache flush happening midway through. */ hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL); key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); id_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY); + block = ItemPointerGetBlockNumber(otid); buffer = ReadBuffer(relation, block); page = BufferGetPage(buffer); + interesting_attrs = NULL; + + /* + * If the page is already full, there is hardly any chance of doing a HOT + * update on this page. It might be wasteful effort to look for index + * column updates only to later reject HOT updates for lack of space in + * the same page. So we are conservative and only fetch hot_attrs if the + * page is not already full. Since we are already holding a pin on the + * buffer, there is no chance that the buffer can get cleaned up + * concurrently, and even if that were possible, in the worst case we lose + * a chance to do a HOT update. + */ + if (!PageIsFull(page)) + { + interesting_attrs = bms_add_members(interesting_attrs, hot_attrs); + hot_attrs_checked = true; + } + interesting_attrs = bms_add_members(interesting_attrs, key_attrs); + interesting_attrs = bms_add_members(interesting_attrs, id_attrs); + /* * Before locking the buffer, pin the visibility map page if it appears to * be necessary. Since we haven't got the lock yet, someone else might be @@ -3540,7 +3590,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, Assert(ItemIdIsNormal(lp)); /* - * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work + * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work * properly. */ oldtup.t_tableOid = RelationGetRelid(relation); @@ -3566,6 +3616,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, Assert(!(newtup->t_data->t_infomask & HEAP_HASOID)); } + /* Determine columns modified by the update. */ + modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs, + &oldtup, newtup); + /* * If we're not updating any "key" column, we can grab a weaker lock type. * This allows for more concurrency when we are running simultaneously * is updates that don't manipulate key columns, not those that * serendipitously arrive at the same key values. @@ -3577,10 +3631,7 @@ - HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, id_attrs, - &satisfies_hot, &satisfies_key, - &satisfies_id, &oldtup, newtup); - if (satisfies_key) + if (!bms_overlap(modified_attrs, key_attrs)) { *lockmode = LockTupleNoKeyExclusive; mxact_status = MultiXactStatusNoKeyUpdate; @@ -3818,6 +3869,9 @@ ReleaseBuffer(vmbuffer); bms_free(hot_attrs); bms_free(key_attrs); + bms_free(id_attrs); + bms_free(modified_attrs); + bms_free(interesting_attrs); return result; } @@ -4123,9 +4177,10 @@ l2: /* * Since the new tuple is going into the same page, we might be able * to do a HOT update. Check if any of the index columns have been - changed. If not, then HOT update is possible. + changed. If the page was already full, we may have skipped checking + for index columns, and in that case we also can't do a HOT update.
*/ - if (satisfies_hot) + if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs)) use_hot_update = true; } else @@ -4140,7 +4195,9 @@ l2: * ExtractReplicaIdentity() will return NULL if nothing needs to be * logged. */ - old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, !satisfies_id, &old_key_copied); + old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, + bms_overlap(modified_attrs, id_attrs), + &old_key_copied); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -4287,13 +4344,16 @@ l2: bms_free(hot_attrs); bms_free(key_attrs); + bms_free(id_attrs); + bms_free(modified_attrs); + bms_free(interesting_attrs); return HeapTupleMayBeUpdated; } /* * Check if the specified attribute's value is same in both given tuples. - * Subroutine for HeapSatisfiesHOTandKeyUpdate. + * Subroutine for HeapDetermineModifiedColumns. */ static bool heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, @@ -4330,7 +4390,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, /* * Extract the corresponding values. XXX this is pretty inefficient if - * there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do + * there are many indexed columns. Should HeapDetermineModifiedColumns do * a single heap_deform_tuple call on each tuple, instead? But that * doesn't work for system columns ... */ @@ -4375,114 +4435,30 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, /* * Check which columns are being updated. * - * This simultaneously checks conditions for HOT updates, for FOR KEY - * SHARE updates, and REPLICA IDENTITY concerns. Since much of the time they - * will be checking very similar sets of columns, and doing the same tests on - * them, it makes sense to optimize and do them together. - * - * We receive three bitmapsets comprising the three sets of columns we're - * interested in. Note these are destructively modified; that is OK since - * this is invoked at most once in heap_update. + * Given an updated tuple, determine (and return into the output bitmapset), + * from those listed as interesting, the set of columns that changed. * - * hot_result is set to TRUE if it's okay to do a HOT update (i.e. it does not - * modified indexed columns); key_result is set to TRUE if the update does not - * modify columns used in the key; id_result is set to TRUE if the update does - * not modify columns in any index marked as the REPLICA IDENTITY. + * The input bitmapset is destructively modified; that is OK since this is + * invoked at most once in heap_update. */ -static void -HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, - Bitmapset *key_attrs, Bitmapset *id_attrs, - bool *satisfies_hot, bool *satisfies_key, - bool *satisfies_id, +static Bitmapset * +HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols, HeapTuple oldtup, HeapTuple newtup) { - int next_hot_attnum; - int next_key_attnum; - int next_id_attnum; - bool hot_result = true; - bool key_result = true; - bool id_result = true; - - /* If REPLICA IDENTITY is set to FULL, id_attrs will be empty. */ - Assert(bms_is_subset(id_attrs, key_attrs)); - Assert(bms_is_subset(key_attrs, hot_attrs)); - - /* - * If one of these sets contains no remaining bits, bms_first_member will - * return -1, and after adding FirstLowInvalidHeapAttributeNumber (which - * is negative!) we'll get an attribute number that can't possibly be - * real, and thus won't match any actual attribute number. 
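A detail worth noting in the new interface: the bitmapset returned by HeapDetermineModifiedColumns stores attribute numbers offset by FirstLowInvalidHeapAttributeNumber, so that system columns (which have negative attnums) fit in a non-negative bitmapset. A sketch of the membership test a caller would use, mirroring the bms_overlap() checks in heap_update (the helper name is hypothetical):

    static bool
    column_was_modified(Bitmapset *modified_attrs, AttrNumber attnum)
    {
        /* shift into the non-negative range the bitmapset uses */
        return bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber,
                             modified_attrs);
    }
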
- */ - next_hot_attnum = bms_first_member(hot_attrs); - next_hot_attnum += FirstLowInvalidHeapAttributeNumber; - next_key_attnum = bms_first_member(key_attrs); - next_key_attnum += FirstLowInvalidHeapAttributeNumber; - next_id_attnum = bms_first_member(id_attrs); - next_id_attnum += FirstLowInvalidHeapAttributeNumber; + int attnum; + Bitmapset *modified = NULL; - for (;;) + while ((attnum = bms_first_member(interesting_cols)) >= 0) { - bool changed; - int check_now; + attnum += FirstLowInvalidHeapAttributeNumber; - /* - * Since the HOT attributes are a superset of the key attributes and - * the key attributes are a superset of the id attributes, this logic - * is guaranteed to identify the next column that needs to be checked. - */ - if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber) - check_now = next_hot_attnum; - else if (key_result && next_key_attnum > FirstLowInvalidHeapAttributeNumber) - check_now = next_key_attnum; - else if (id_result && next_id_attnum > FirstLowInvalidHeapAttributeNumber) - check_now = next_id_attnum; - else - break; - - /* See whether it changed. */ - changed = !heap_tuple_attr_equals(RelationGetDescr(relation), - check_now, oldtup, newtup); - if (changed) - { - if (check_now == next_hot_attnum) - hot_result = false; - if (check_now == next_key_attnum) - key_result = false; - if (check_now == next_id_attnum) - id_result = false; - - /* if all are false now, we can stop checking */ - if (!hot_result && !key_result && !id_result) - break; - } - - /* - * Advance the next attribute numbers for the sets that contain the - * attribute we just checked. As we work our way through the columns, - * the next_attnum values will rise; but when each set becomes empty, - * bms_first_member() will return -1 and the attribute number will end - * up with a value less than FirstLowInvalidHeapAttributeNumber. - */ - if (hot_result && check_now == next_hot_attnum) - { - next_hot_attnum = bms_first_member(hot_attrs); - next_hot_attnum += FirstLowInvalidHeapAttributeNumber; - } - if (key_result && check_now == next_key_attnum) - { - next_key_attnum = bms_first_member(key_attrs); - next_key_attnum += FirstLowInvalidHeapAttributeNumber; - } - if (id_result && check_now == next_id_attnum) - { - next_id_attnum = bms_first_member(id_attrs); - next_id_attnum += FirstLowInvalidHeapAttributeNumber; - } + if (!heap_tuple_attr_equals(RelationGetDescr(relation), + attnum, oldtup, newtup)) + modified = bms_add_member(modified, + attnum - FirstLowInvalidHeapAttributeNumber); } - *satisfies_hot = hot_result; - *satisfies_key = key_result; - *satisfies_id = id_result; + return modified; } /* @@ -5745,6 +5721,17 @@ l4: goto out_locked; } + /* + * Also check Xmin: if this tuple was created by an aborted + * (sub)transaction, then we already locked the last live one in the + * chain, thus we're done, so return success. 
+ */ + if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) + { + UnlockReleaseBuffer(buf); + return HeapTupleMayBeUpdated; + } + old_infomask = mytup.t_data->t_infomask; old_infomask2 = mytup.t_data->t_infomask2; xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); @@ -6047,7 +6034,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple) XLogBeginInsert(); /* We want the same filtering on this as on a plain insert */ - XLogIncludeOrigin(); + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); @@ -6082,7 +6069,8 @@ heap_finish_speculative(Relation relation, HeapTuple tuple) * could deadlock with each other, which would not be acceptable. * * This is somewhat redundant with heap_delete, but we prefer to have a - * dedicated routine with stripped down requirements. + * dedicated routine with stripped down requirements. Note that this is also + * used to delete the TOAST tuples created during speculative insertion. * * This routine does not affect logical decoding as it only looks at * confirmation records. @@ -6126,7 +6114,7 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) */ if (tp.t_data->t_choice.t_heap.t_xmin != xid) elog(ERROR, "attempted to kill a tuple inserted by another transaction"); - if (!HeapTupleHeaderIsSpeculative(tp.t_data)) + if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) elog(ERROR, "attempted to kill a non-speculative tuple"); Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); @@ -6196,7 +6184,10 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); if (HeapTupleHasExternal(&tp)) - toast_delete(relation, &tp); + { + Assert(!IsToastRelation(relation)); + toast_delete(relation, &tp, true); + } /* * Never need to mark tuple for invalidation, since catalogs don't support @@ -6770,8 +6761,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, * Note: it might seem we could make the changes without exclusive lock, since * TransactionId read/write is assumed atomic anyway. However there is a race * condition: someone who just fetched an old XID that we overwrite here could - * conceivably not finish checking the XID against pg_clog before we finish - * the VACUUM and perhaps truncate off the part of pg_clog he needs. Getting + * conceivably not finish checking the XID against pg_xact before we finish + * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting * exclusive lock ensures no other backend is in process of checking the * tuple status. Also, getting exclusive lock makes it safe to adjust the * infomask bits. @@ -7711,7 +7702,7 @@ log_heap_update(Relation reln, Buffer oldbuf, } /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP_ID, info); @@ -9139,3 +9130,80 @@ heap_sync(Relation rel) heap_close(toastrel, AccessShareLock); } } + +/* + * Mask a heap page before performing consistency checks on it. 
+ */ +void +heap_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. + * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions in master. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index c90fb71965..6529fe3d6b 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -3,7 +3,7 @@ * hio.c * POSTGRES heap access method input/output code. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 200861eef1..4f41511764 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -3,7 +3,7 @@ * pruneheap.c * heap page pruning and HOT-chain management code * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index f9ce9861e2..60dcb67a20 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -92,7 +92,7 @@ * heap's TOAST table will go through the normal bufmgr. 
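heap_mask above is the heap's hook for WAL consistency checking: before a page reconstructed by redo is compared with the full-page image captured on the primary, both copies are passed through the masking callback so that bits which may legitimately differ (hint bits, LSN, unused space, command ids) are neutralized. In outline, the comparison looks like this (an illustrative sketch, not the actual checker):

    static void
    verify_heap_page(char *replay_image, char *primary_image, BlockNumber blkno)
    {
        heap_mask(replay_image, blkno);
        heap_mask(primary_image, blkno);

        if (memcmp(replay_image, primary_image, BLCKSZ) != 0)
            elog(FATAL, "inconsistent heap page found, block %u", blkno);
    }
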
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION @@ -119,6 +119,8 @@ #include "lib/ilist.h" +#include "pgstat.h" + #include "replication/logical.h" #include "replication/slot.h" @@ -209,7 +211,7 @@ typedef struct RewriteMappingFile } RewriteMappingFile; /* - * A single In-Memeory logical rewrite mapping, hanging of + * A single In-Memory logical rewrite mapping, hanging off * RewriteMappingFile->mappings. */ typedef struct RewriteMappingDataEntry @@ -258,9 +260,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm */ rw_cxt = AllocSetContextCreate(CurrentMemoryContext, "Table rewrite", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); old_cxt = MemoryContextSwitchTo(rw_cxt); /* Create and fill in the state struct */ @@ -918,7 +918,8 @@ logical_heap_rewrite_flush_mappings(RewriteState state) * Note that we deviate from the usual WAL coding practices here, * check the above "Logical rewrite support" comment for reasoning. */ - written = FileWrite(src->vfd, waldata_start, len); + written = FileWrite(src->vfd, waldata_start, len, + WAIT_EVENT_LOGICAL_REWRITE_WRITE); if (written != len) ereport(ERROR, (errcode_for_file_access(), @@ -959,7 +960,7 @@ logical_end_heap_rewrite(RewriteState state) hash_seq_init(&seq_status, state->rs_logical_mappings); while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) { - if (FileSync(src->vfd) != 0) + if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", src->path))); @@ -1143,11 +1144,13 @@ heap_xlog_logical_rewrite(XLogReaderState *r) * Truncate all data that's not guaranteed to have been safely fsynced (by * previous record or by the last checkpoint). */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE); if (ftruncate(fd, xlrec->offset) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u: %m", path, (uint32) xlrec->offset))); + pgstat_report_wait_end(); /* now seek to the position we want to write our data to */ if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset) @@ -1161,20 +1164,24 @@ heap_xlog_logical_rewrite(XLogReaderState *r) len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); /* write out tail end of mapping file (again) */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE); if (write(fd, data, len) != len) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", path))); + pgstat_report_wait_end(); /* * Now fsync all previously written data. We could improve things and only * do this for the last write to a file, but the required bookkeeping * doesn't seem worth the trouble. */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC); if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); CloseTransientFile(fd); } @@ -1196,7 +1203,7 @@ CheckPointLogicalRewriteHeap(void) XLogRecPtr redo; DIR *mappings_dir; struct dirent *mapping_de; - char path[MAXPGPATH]; + char path[MAXPGPATH + 20]; /* * We start off with a minimum of the last redo pointer.
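Every pgstat_report_wait_start()/pgstat_report_wait_end() pair added above follows the same shape: bracket a potentially blocking system call with a wait-event report so that monitoring views can show what a stalled backend is waiting on. Reduced to its skeleton (MY_WAIT_EVENT stands for whichever WAIT_EVENT_* constant applies):

    pgstat_report_wait_start(MY_WAIT_EVENT);    /* advertised while we block */
    if (pg_fsync(fd) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not fsync file \"%s\": %m", path)));
    pgstat_report_wait_end();                   /* clear it on the way out */
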
No new decoding @@ -1227,7 +1234,7 @@ CheckPointLogicalRewriteHeap(void) strcmp(mapping_de->d_name, "..") == 0) continue; - snprintf(path, MAXPGPATH, "pg_logical/mappings/%s", mapping_de->d_name); + snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name); if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) continue; @@ -1268,10 +1275,12 @@ CheckPointLogicalRewriteHeap(void) * changed or have only been created since the checkpoint's start, * but it's currently not deemed worth the effort. */ - else if (pg_fsync(fd) != 0) + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC); + if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); CloseTransientFile(fd); } } diff --git a/src/backend/access/heap/syncscan.c b/src/backend/access/heap/syncscan.c index a0f500edc8..20640cbbaf 100644 --- a/src/backend/access/heap/syncscan.c +++ b/src/backend/access/heap/syncscan.c @@ -36,7 +36,7 @@ * ss_report_location - update current scan location * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -48,6 +48,8 @@ #include "access/heapam.h" #include "miscadmin.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" #include "utils/rel.h" diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 452a9ecb68..b9963ab5ef 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -4,7 +4,7 @@ * Support routines for external and compressed storage of * variable size attributes. * - * Copyright (c) 2000-2016, PostgreSQL Global Development Group + * Copyright (c) 2000-2017, PostgreSQL Global Development Group * * * IDENTIFICATION @@ -67,7 +67,7 @@ typedef struct toast_compress_header #define TOAST_COMPRESS_SET_RAWSIZE(ptr, len) \ (((toast_compress_header *) (ptr))->rawsize = (len)) -static void toast_delete_datum(Relation rel, Datum value); +static void toast_delete_datum(Relation rel, Datum value, bool is_speculative); static Datum toast_save_datum(Relation rel, Datum value, struct varlena * oldexternal, int options); static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); @@ -461,7 +461,7 @@ toast_datum_size(Datum value) * ---------- */ void -toast_delete(Relation rel, HeapTuple oldtup) +toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative) { TupleDesc tupleDesc; Form_pg_attribute *att; @@ -508,7 +508,7 @@ toast_delete(Relation rel, HeapTuple oldtup) if (toast_isnull[i]) continue; else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value))) - toast_delete_datum(rel, value); + toast_delete_datum(rel, value, is_speculative); } } } @@ -1068,7 +1068,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, if (need_delold) for (i = 0; i < numAttrs; i++) if (toast_delold[i]) - toast_delete_datum(rel, toast_oldvalues[i]); + toast_delete_datum(rel, toast_oldvalues[i], false); return result_tuple; } @@ -1296,6 +1296,74 @@ toast_flatten_tuple_to_datum(HeapTupleHeader tup, /* ---------- + * toast_build_flattened_tuple - + * + * Build a tuple containing no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * + * This is essentially just like heap_form_tuple, except that it will + * expand any external-data pointers beforehand. 
+ * + * It's not very clear whether it would be preferable to decompress + * in-line compressed datums while at it. For now, we don't. + * ---------- + */ +HeapTuple +toast_build_flattened_tuple(TupleDesc tupleDesc, + Datum *values, + bool *isnull) +{ + HeapTuple new_tuple; + Form_pg_attribute *att = tupleDesc->attrs; + int numAttrs = tupleDesc->natts; + int num_to_free; + int i; + Datum new_values[MaxTupleAttributeNumber]; + Pointer freeable_values[MaxTupleAttributeNumber]; + + /* + * We can pass the caller's isnull array directly to heap_form_tuple, but + * we potentially need to modify the values array. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + memcpy(new_values, values, numAttrs * sizeof(Datum)); + + num_to_free = 0; + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (!isnull[i] && att[i]->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(new_values[i]); + if (VARATT_IS_EXTERNAL(new_value)) + { + new_value = heap_tuple_fetch_attr(new_value); + new_values[i] = PointerGetDatum(new_value); + freeable_values[num_to_free++] = (Pointer) new_value; + } + } + } + + /* + * Form the reconfigured tuple. + */ + new_tuple = heap_form_tuple(tupleDesc, new_values, isnull); + + /* + * Free allocated temp values + */ + for (i = 0; i < num_to_free; i++) + pfree(freeable_values[i]); + + return new_tuple; +} + + +/* ---------- * toast_compress_datum - * * Create a compressed version of a varlena datum @@ -1611,7 +1679,9 @@ toast_save_datum(Relation rel, Datum value, * Create the index entry. We cheat a little here by not using * FormIndexDatum: this relies on the knowledge that the index columns * are the same as the initial columns of the table for all the - * indexes. + * indexes. We also cheat by not providing an IndexInfo: this is okay + * for now because btree doesn't need one, but we might have to be + * more honest someday. * * Note also that there had better not be any user-created index on * the TOAST table, since we don't bother to update anything else. @@ -1624,7 +1694,8 @@ toast_save_datum(Relation rel, Datum value, &(toasttup->t_self), toastrel, toastidxs[i]->rd_index->indisunique ? 
- UNIQUE_CHECK_YES : UNIQUE_CHECK_NO); + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + NULL); } /* @@ -1663,7 +1734,7 @@ toast_save_datum(Relation rel, Datum value, * ---------- */ static void -toast_delete_datum(Relation rel, Datum value) +toast_delete_datum(Relation rel, Datum value, bool is_speculative) { struct varlena *attr = (struct varlena *) DatumGetPointer(value); struct varatt_external toast_pointer; @@ -1714,7 +1785,10 @@ toast_delete_datum(Relation rel, Datum value) /* * Have a chunk, delete it */ - simple_heap_delete(toastrel, &toasttup->t_self); + if (is_speculative) + heap_abort_speculative(toastrel, toasttup); + else + simple_heap_delete(toastrel, &toasttup->t_self); } /* diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 3ad4a9f587..e5616ce051 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -3,7 +3,7 @@ * visibilitymap.c * bitmap for tracking visibility of heap tuples * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -508,6 +508,9 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + /* Clear out the unwanted bytes. */ MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); @@ -523,7 +526,20 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) */ map[truncByte] &= (1 << truncOffset) - 1; + /* + * Truncation of a relation is WAL-logged at a higher-level, and we + * will be called at WAL replay. But if checksums are enabled, we need + * to still write a WAL record to protect against a torn page, if the + * page is flushed to disk before the truncation WAL record. We cannot + * use MarkBufferDirtyHint here, because that will not dirty the page + * during recovery. + */ MarkBufferDirty(mapBuffer); + if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) + log_newpage_buffer(mapBuffer, false); + + END_CRIT_SECTION(); + UnlockReleaseBuffer(mapBuffer); } else diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 28f6cde896..7b597a072f 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -3,7 +3,7 @@ * amapi.c * Support routines for API for Postgres index access methods. * - * Copyright (c) 2015-2016, PostgreSQL Global Development Group + * Copyright (c) 2015-2017, PostgreSQL Global Development Group * * * IDENTIFICATION @@ -17,6 +17,7 @@ #include "access/htup_details.h" #include "catalog/pg_am.h" #include "catalog/pg_opclass.h" +#include "utils/builtins.h" #include "utils/syscache.h" diff --git a/src/backend/access/index/amvalidate.c b/src/backend/access/index/amvalidate.c index 1a3c5f16b9..80865e9ff9 100644 --- a/src/backend/access/index/amvalidate.c +++ b/src/backend/access/index/amvalidate.c @@ -3,7 +3,7 @@ * amvalidate.c * Support routines for index access methods' amvalidate functions. 
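The visibilitymap_truncate hunk above is a compact example of torn-page protection for a page that is normally covered only by a higher-level WAL record. The pattern, isolated (a sketch using the calls from the hunk):

    START_CRIT_SECTION();

    /* ... clear the unwanted map bits in the buffer ... */

    MarkBufferDirty(mapBuffer);

    /*
     * With checksums or wal_log_hints enabled, a torn write of this page
     * could otherwise escape detection, so emit a full-page image now; in
     * recovery the truncation record itself already protects us.
     */
    if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
        log_newpage_buffer(mapBuffer, false);

    END_CRIT_SECTION();
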
* - * Copyright (c) 2016, PostgreSQL Global Development Group + * Copyright (c) 2016-2017, PostgreSQL Global Development Group * * * IDENTIFICATION diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 65c941d812..a91fda7bcd 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -3,7 +3,7 @@ * genam.c * general index access method routines * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -119,6 +119,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itup = NULL; scan->xs_itupdesc = NULL; + scan->xs_hitup = NULL; + scan->xs_hitupdesc = NULL; ItemPointerSetInvalid(&scan->xs_ctup.t_self); scan->xs_ctup.t_data = NULL; @@ -166,6 +168,10 @@ IndexScanEnd(IndexScanDesc scan) * The passed-in values/nulls arrays are the "raw" input to the index AM, * e.g. results of FormIndexDatum --- this is not necessarily what is stored * in the index, but it's what the user perceives to be stored. + * + * Note: if you change anything here, check whether + * ExecBuildSlotPartitionKeyDescription() in execMain.c needs a similar + * change. */ char * BuildIndexValueDescription(Relation indexRelation, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 54b71cb2f7..cc5ac8b857 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -3,7 +3,7 @@ * indexam.c * general index access method routines * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,6 +20,10 @@ * index_insert - insert an index tuple into a relation * index_markpos - mark a scan position * index_restrpos - restore a scan position + * index_parallelscan_estimate - estimate shared memory for parallel scan + * index_parallelscan_initialize - initialize parallel scan + * index_parallelrescan - (re)start a parallel scan of an index + * index_beginscan_parallel - join parallel index scan * index_getnext_tid - get the next TID from a scan * index_fetch_heap - get the scan's next heap tuple * index_getnext - get the next heap tuple from a scan @@ -120,7 +124,8 @@ do { \ } while(0) static IndexScanDesc index_beginscan_internal(Relation indexRelation, - int nkeys, int norderbys, Snapshot snapshot); + int nkeys, int norderbys, Snapshot snapshot, + ParallelIndexScanDesc pscan, bool temp_snap); /* ---------------------------------------------------------------- @@ -191,7 +196,8 @@ index_insert(Relation indexRelation, bool *isnull, ItemPointer heap_t_ctid, Relation heapRelation, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { RELATION_CHECKS; CHECK_REL_PROCEDURE(aminsert); @@ -203,7 +209,7 @@ index_insert(Relation indexRelation, return indexRelation->rd_amroutine->aminsert(indexRelation, values, isnull, heap_t_ctid, heapRelation, - checkUnique); + checkUnique, indexInfo); } /* @@ -219,7 +225,7 @@ index_beginscan(Relation heapRelation, { IndexScanDesc scan; - scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot); + scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); /* * Save additional parameters into the scandesc. 
Everything else was set @@ -244,7 +250,7 @@ index_beginscan_bitmap(Relation indexRelation, { IndexScanDesc scan; - scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot); + scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); /* * Save additional parameters into the scandesc. Everything else was set @@ -260,8 +266,11 @@ index_beginscan_bitmap(Relation indexRelation, */ static IndexScanDesc index_beginscan_internal(Relation indexRelation, - int nkeys, int norderbys, Snapshot snapshot) + int nkeys, int norderbys, Snapshot snapshot, + ParallelIndexScanDesc pscan, bool temp_snap) { + IndexScanDesc scan; + RELATION_CHECKS; CHECK_REL_PROCEDURE(ambeginscan); @@ -276,8 +285,13 @@ index_beginscan_internal(Relation indexRelation, /* * Tell the AM to open a scan. */ - return indexRelation->rd_amroutine->ambeginscan(indexRelation, nkeys, + scan = indexRelation->rd_amroutine->ambeginscan(indexRelation, nkeys, norderbys); + /* Initialize information for parallel scan. */ + scan->parallel_scan = pscan; + scan->xs_temp_snap = temp_snap; + + return scan; } /* ---------------- @@ -341,6 +355,9 @@ index_endscan(IndexScanDesc scan) /* Release index refcount acquired by index_beginscan */ RelationDecrementReferenceCount(scan->indexRelation); + if (scan->xs_temp_snap) + UnregisterSnapshot(scan->xs_snapshot); + /* Release the scan data structure itself */ IndexScanEnd(scan); } @@ -389,6 +406,115 @@ index_restrpos(IndexScanDesc scan) scan->indexRelation->rd_amroutine->amrestrpos(scan); } +/* + * index_parallelscan_estimate - estimate shared memory for parallel scan + * + * Currently, we don't pass any information to the AM-specific estimator, + * so it can probably only return a constant. In the future, we might need + * to pass more information. + */ +Size +index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot) +{ + Size nbytes; + + RELATION_CHECKS; + + nbytes = offsetof(ParallelIndexScanDescData, ps_snapshot_data); + nbytes = add_size(nbytes, EstimateSnapshotSpace(snapshot)); + nbytes = MAXALIGN(nbytes); + + /* + * If amestimateparallelscan is not provided, assume there is no + * AM-specific data needed. (It's hard to believe that could work, but + * it's easy enough to cater to it here.) + */ + if (indexRelation->rd_amroutine->amestimateparallelscan != NULL) + nbytes = add_size(nbytes, + indexRelation->rd_amroutine->amestimateparallelscan()); + + return nbytes; +} + +/* + * index_parallelscan_initialize - initialize parallel scan + * + * We initialize both the ParallelIndexScanDesc proper and the AM-specific + * information which follows it. + * + * This function calls access method specific initialization routine to + * initialize am specific information. Call this just once in the leader + * process; then, individual workers attach via index_beginscan_parallel. 
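Taken together, the routines above define the lifecycle of a parallel index scan. A hypothetical call sequence (a sketch: the shared-memory allocator shown is illustrative, and worker launch plus participation details are elided):

    /* Leader, while building the parallel context: */
    Size        size = index_parallelscan_estimate(indexrel, snapshot);
    ParallelIndexScanDesc pscan = (ParallelIndexScanDesc) shm_alloc(size); /* hypothetical */

    index_parallelscan_initialize(heaprel, indexrel, snapshot, pscan);

    /* Each participant (workers, and the leader if it joins in): */
    IndexScanDesc scan = index_beginscan_parallel(heaprel, indexrel,
                                                  nkeys, norderbys, pscan);

    index_rescan(scan, scankeys, nkeys, NULL, norderbys);
    while (index_getnext_tid(scan, ForwardScanDirection) != NULL)
    {
        /* process scan->xs_ctup.t_self */
    }
    index_endscan(scan);
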
+ */ +void +index_parallelscan_initialize(Relation heapRelation, Relation indexRelation, + Snapshot snapshot, ParallelIndexScanDesc target) +{ + Size offset; + + RELATION_CHECKS; + + offset = add_size(offsetof(ParallelIndexScanDescData, ps_snapshot_data), + EstimateSnapshotSpace(snapshot)); + offset = MAXALIGN(offset); + + target->ps_relid = RelationGetRelid(heapRelation); + target->ps_indexid = RelationGetRelid(indexRelation); + target->ps_offset = offset; + SerializeSnapshot(snapshot, target->ps_snapshot_data); + + /* aminitparallelscan is optional; assume no-op if not provided by AM */ + if (indexRelation->rd_amroutine->aminitparallelscan != NULL) + { + void *amtarget; + + amtarget = OffsetToPointer(target, offset); + indexRelation->rd_amroutine->aminitparallelscan(amtarget); + } +} + +/* ---------------- + * index_parallelrescan - (re)start a parallel scan of an index + * ---------------- + */ +void +index_parallelrescan(IndexScanDesc scan) +{ + SCAN_CHECKS; + + /* amparallelrescan is optional; assume no-op if not provided by AM */ + if (scan->indexRelation->rd_amroutine->amparallelrescan != NULL) + scan->indexRelation->rd_amroutine->amparallelrescan(scan); +} + +/* + * index_beginscan_parallel - join parallel index scan + * + * Caller must be holding suitable locks on the heap and the index. + */ +IndexScanDesc +index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, + int norderbys, ParallelIndexScanDesc pscan) +{ + Snapshot snapshot; + IndexScanDesc scan; + + Assert(RelationGetRelid(heaprel) == pscan->ps_relid); + snapshot = RestoreSnapshot(pscan->ps_snapshot_data); + RegisterSnapshot(snapshot); + scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, + pscan, true); + + /* + * Save additional parameters into the scandesc. Everything else was set + * up by index_beginscan_internal. + */ + scan->heapRelation = heaprel; + scan->xs_snapshot = snapshot; + + return scan; +} + /* ---------------- * index_getnext_tid - get the next TID from a scan * @@ -409,8 +535,8 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_ctup.t_self. It should also set - * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention - * to those fields here. + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. */ found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction); diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 067d15c803..a3f11da8d5 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -521,11 +521,12 @@ because it allows running applications to continue while the standby changes state into a normally running server. The interlocking required to avoid returning incorrect results from -MVCC scans is not required on standby nodes. That is because +non-MVCC scans is not required on standby nodes. That is because HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(), HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only ever used during write transactions, which cannot exist on the standby. -This leaves HeapTupleSatisfiesMVCC() and HeapTupleSatisfiesToast(). +MVCC scans are already protected by definition, so HeapTupleSatisfiesMVCC() +is not a problem. That leaves concern only for HeapTupleSatisfiesToast(). 
HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's because it doesn't need to - if the main heap row is visible then the toast rows will also be visible. So as long as we follow a toast diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 0d60da61cc..4b131efb87 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -3,7 +3,7 @@ * nbtcompare.c * Comparison functions for btree access method. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index ef69290b6c..6dca8109fd 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -3,7 +3,7 @@ * nbtinsert.c * Item insertion in Lehman and Yao btrees for Postgres. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -17,6 +17,7 @@ #include "access/heapam.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/transam.h" #include "access/xloginsert.h" #include "miscadmin.h" diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 2001dc14fb..f815fd40b2 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -4,7 +4,7 @@ * BTree-specific page management code for the Postgres btree access * method. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -23,6 +23,7 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4668c5ee59..116f5f32f6 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -8,7 +8,7 @@ * This file contains only the public interface routines. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -23,11 +23,14 @@ #include "access/xlog.h" #include "catalog/index.h" #include "commands/vacuum.h" +#include "pgstat.h" +#include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/smgr.h" #include "tcop/tcopprot.h" /* pgrminclude ignore */ +#include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" @@ -62,6 +65,45 @@ typedef struct MemoryContext pagedelcontext; } BTVacState; +/* + * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. + * + * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to + * a new page; others must wait. + * + * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan + * to a new page; some process can start doing that. 
+ * + * BTPARALLEL_DONE indicates that the scan is complete (including error exit). + * We reach this state once for every distinct combination of array keys. + */ +typedef enum +{ + BTPARALLEL_NOT_INITIALIZED, + BTPARALLEL_ADVANCING, + BTPARALLEL_IDLE, + BTPARALLEL_DONE +} BTPS_State; + +/* + * BTParallelScanDescData contains btree specific shared information required + * for parallel scan. + */ +typedef struct BTParallelScanDescData +{ + BlockNumber btps_scanPage; /* latest or next page to be scanned */ + BTPS_State btps_pageStatus;/* indicates whether next page is available + * for scan. see above for possible states of + * parallel scan. */ + int btps_arrayKeyCount; /* count indicating number of array + * scan keys processed by parallel + * scan */ + slock_t btps_mutex; /* protects above variables */ + ConditionVariable btps_cv; /* used to synchronize parallel scan */ +} BTParallelScanDescData; + +typedef struct BTParallelScanDescData *BTParallelScanDesc; + static void btbuildCallback(Relation index, HeapTuple htup, @@ -98,6 +140,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amstorage = false; amroutine->amclusterable = true; amroutine->ampredlocks = true; + amroutine->amcanparallel = true; amroutine->amkeytype = InvalidOid; amroutine->ambuild = btbuild; @@ -117,6 +160,9 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amendscan = btendscan; amroutine->ammarkpos = btmarkpos; amroutine->amrestrpos = btrestrpos; + amroutine->amestimateparallelscan = btestimateparallelscan; + amroutine->aminitparallelscan = btinitparallelscan; + amroutine->amparallelrescan = btparallelrescan; PG_RETURN_POINTER(amroutine); } @@ -242,13 +288,18 @@ btbuildempty(Relation index) metapage = (Page) palloc(BLCKSZ); _bt_initmetapage(metapage, P_NONE, 0); - /* Write the page. If archiving/streaming, XLOG it. */ + /* + * Write the page and log it. It might seem that an immediate sync would + * be sufficient to guarantee that the file exists on disk, but recovery + * itself might remove it while replaying, for example, an + * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need + * this even when wal_level=minimal. 
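The states above are only ever changed in a small number of ways; the transitions, as enforced by the functions that follow, can be summarized like this (a summary, not text from the patch):

    /*
     * NOT_INITIALIZED --seize--> ADVANCING --release--> IDLE --seize--> ...
     * any state except DONE --_bt_parallel_done()--> DONE
     * DONE --_bt_parallel_advance_array_keys()--> NOT_INITIALIZED
     *       (restarting the cycle for the next set of array keys)
     */
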
+ */ PageSetChecksumInplace(metapage, BTREE_METAPAGE); smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, (char *) metapage, true); - if (XLogIsNeeded()) - log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - BTREE_METAPAGE, metapage, false); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BTREE_METAPAGE, metapage, false); /* * An immediate sync is required even if we xlog'd the page, because the @@ -267,7 +318,8 @@ btbuildempty(Relation index) bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { bool result; IndexTuple itup; @@ -481,6 +533,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, } so->markItemIndex = -1; + so->arrayKeyCount = 0; BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); @@ -643,6 +696,217 @@ btrestrpos(IndexScanDesc scan) } /* + * btestimateparallelscan -- estimate storage for BTParallelScanDescData + */ +Size +btestimateparallelscan(void) +{ + return sizeof(BTParallelScanDescData); +} + +/* + * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan + */ +void +btinitparallelscan(void *target) +{ + BTParallelScanDesc bt_target = (BTParallelScanDesc) target; + + SpinLockInit(&bt_target->btps_mutex); + bt_target->btps_scanPage = InvalidBlockNumber; + bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + bt_target->btps_arrayKeyCount = 0; + ConditionVariableInit(&bt_target->btps_cv); +} + +/* + * btparallelrescan() -- reset parallel scan + */ +void +btparallelrescan(IndexScanDesc scan) +{ + BTParallelScanDesc btscan; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + + Assert(parallel_scan); + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + /* + * In theory, we don't need to acquire the spinlock here, because there + * shouldn't be any other workers running at this point, but we do so for + * consistency. + */ + SpinLockAcquire(&btscan->btps_mutex); + btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrayKeyCount = 0; + SpinLockRelease(&btscan->btps_mutex); +} + +/* + * _bt_parallel_seize() -- Begin the process of advancing the scan to a new + * page. Other scans must wait until we call _bt_parallel_release() or + * _bt_parallel_done(). + * + * The return value is true if we successfully seized the scan and false + * if we did not. The latter case occurs if no pages remain for the current + * set of scankeys. + * + * If the return value is true, *pageno returns the next or current page + * of the scan (depending on the scan direction). An invalid block number + * means the scan hasn't yet started, and P_NONE means we've reached the end. + * The first time a participating process reaches the last page, it will return + * true and set *pageno to P_NONE; after that, further attempts to seize the + * scan will return false. + * + * Callers should ignore the value of pageno if the return value is false.
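Taken together, the contract above means every participant, leader or worker, drives the scan through the same seize/read/release cycle. The fragment below is a hypothetical driver loop, with error paths and buffer management omitted; next_block_of() is a stand-in for fetching btpo_next from the page just read. The patch's real callers are _bt_first() and _bt_steppage() further down.

    /* Hypothetical driver loop, assuming the _bt_parallel_* API above. */
    BlockNumber blkno;

    while (_bt_parallel_seize(scan, &blkno))
    {
        if (blkno == P_NONE)
        {
            /* we are the participant that discovered the end of the scan */
            _bt_parallel_done(scan);
            break;
        }

        /*
         * blkno == InvalidBlockNumber would mean "scan not started": locate
         * the first leaf page before releasing.  Otherwise read page blkno,
         * then immediately publish its successor so another participant can
         * advance while we return tuples from this page.
         */
        _bt_parallel_release(scan, next_block_of(blkno));
    }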
+ */ +bool +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTPS_State pageStatus; + bool exit_loop = false; + bool status = true; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + *pageno = P_NONE; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + while (1) + { + SpinLockAcquire(&btscan->btps_mutex); + pageStatus = btscan->btps_pageStatus; + + if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + { + /* Parallel scan has already advanced to a new set of scankeys. */ + status = false; + } + else if (pageStatus == BTPARALLEL_DONE) + { + /* + * We're done with this set of scankeys. This may be the end, or + * there could be more sets to try. + */ + status = false; + } + else if (pageStatus != BTPARALLEL_ADVANCING) + { + /* + * We have successfully seized control of the scan for the purpose + * of advancing it to a new page! + */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + *pageno = btscan->btps_scanPage; + exit_loop = true; + } + SpinLockRelease(&btscan->btps_mutex); + if (exit_loop || !status) + break; + ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); + } + ConditionVariableCancelSleep(); + + return status; +} + +/* + * _bt_parallel_release() -- Complete the process of advancing the scan to a + * new page. We now have the new value btps_scanPage; some other backend + * can now begin advancing the scan. + */ +void +_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) +{ + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + SpinLockAcquire(&btscan->btps_mutex); + btscan->btps_scanPage = scan_page; + btscan->btps_pageStatus = BTPARALLEL_IDLE; + SpinLockRelease(&btscan->btps_mutex); + ConditionVariableSignal(&btscan->btps_cv); +} + +/* + * _bt_parallel_done() -- Mark the parallel scan as complete. + * + * When there are no pages left to scan, this function should be called to + * notify other workers. Otherwise, they might wait forever for the scan to + * advance to the next page. + */ +void +_bt_parallel_done(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + bool status_changed = false; + + /* Do nothing, for non-parallel scans */ + if (parallel_scan == NULL) + return; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + /* + * Mark the parallel scan as done for this combination of scan keys, + * unless some other process already did so. See also + * _bt_advance_array_keys. + */ + SpinLockAcquire(&btscan->btps_mutex); + if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && + btscan->btps_pageStatus != BTPARALLEL_DONE) + { + btscan->btps_pageStatus = BTPARALLEL_DONE; + status_changed = true; + } + SpinLockRelease(&btscan->btps_mutex); + + /* wake up all the workers associated with this parallel scan */ + if (status_changed) + ConditionVariableBroadcast(&btscan->btps_cv); +} + +/* + * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array + * keys. + * + * Updates the count of array keys processed for both local and parallel + * scans. 
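The array-key counters exist because a qual such as WHERE x = ANY('{1,2,3}') is executed as one btree scan per array element. The shared btps_arrayKeyCount records which round the group has finished; each worker's so->arrayKeyCount records the round it is working on, and a worker that falls behind simply fails its next seize attempt. A sketch of the re-arming handshake under stand-in names; the patch does this in _bt_parallel_advance_array_keys(), implemented just below, under btps_mutex.

    /* Simplified stand-ins for BTParallelScanDescData fields. */
    typedef struct
    {
        int     round;          /* btps_arrayKeyCount */
        int     state;          /* btps_pageStatus */
    } shared_scan;

    #define ST_NOT_INITIALIZED  0
    #define ST_DONE             3

    static void
    advance_round(int *local_round, shared_scan *shared)
    {
        (*local_round)++;       /* this worker moves to its next key set */
        if (shared->state == ST_DONE)
        {
            /* first worker here re-arms the scan for the new key set */
            shared->state = ST_NOT_INITIALIZED;
            shared->round++;    /* workers still on an old round now have
                                 * local_round < round and fail to seize */
        }
    }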
+ */ +void +_bt_parallel_advance_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + so->arrayKeyCount++; + SpinLockAcquire(&btscan->btps_mutex); + if (btscan->btps_pageStatus == BTPARALLEL_DONE) + { + btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrayKeyCount++; + } + SpinLockRelease(&btscan->btps_mutex); +} + +/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. @@ -763,9 +1027,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, "_bt_pagedel", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); /* * The outer loop iterates over all index pages except the metapage, in diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index ee46023c5a..2f32b2e78d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -4,7 +4,7 @@ * Search code for postgres btrees. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -30,9 +30,13 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); +static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, + ScanDirection dir); static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); /* @@ -544,8 +548,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysCount = 0; int i; + bool status = true; StrategyNumber strat_total; BTScanPosItem *currItem; + BlockNumber blkno; Assert(!BTScanPosIsValid(so->currPos)); @@ -564,6 +570,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!so->qual_ok) return false; + /* + * For parallel scans, get the starting page from shared state. If the + * scan has not started, proceed to find out first leaf page in the usual + * way while keeping other participating processes waiting. If the scan + * has already begun, use the page number from the shared structure. 
+ */ + if (scan->parallel_scan != NULL) + { + status = _bt_parallel_seize(scan, &blkno); + if (!status) + return false; + else if (blkno == P_NONE) + { + _bt_parallel_done(scan); + return false; + } + else if (blkno != InvalidBlockNumber) + { + if (!_bt_parallel_readpage(scan, blkno, dir)) + return false; + goto readcomplete; + } + } + /*---------- * Examine the scan keys to discover where we need to start the scan. * @@ -743,7 +773,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * there. */ if (keysCount == 0) - return _bt_endpoint(scan, dir); + { + bool match; + + match = _bt_endpoint(scan, dir); + + if (!match) + { + /* No match, so mark (parallel) scan finished */ + _bt_parallel_done(scan); + } + + return match; + } /* * We want to start the scan somewhere within the index. Set up an @@ -773,7 +815,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(subkey->sk_flags & SK_ROW_MEMBER); if (subkey->sk_flags & SK_ISNULL) + { + _bt_parallel_done(scan); return false; + } memcpy(scankeys + i, subkey, sizeof(ScanKeyData)); /* @@ -993,25 +1038,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * because nothing finer to lock exists. */ PredicateLockRelation(rel, scan->xs_snapshot); + + /* + * mark parallel scan as done, so that all the workers can finish + * their scan + */ + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; } else PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); - /* initialize moreLeft/moreRight appropriately for scan direction */ - if (ScanDirectionIsForward(dir)) - { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; - } - else - { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; - } - so->numKilled = 0; /* just paranoia */ - Assert(so->markItemIndex == -1); + _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); @@ -1060,6 +1101,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_drop_lock_and_maybe_pin(scan, &so->currPos); } +readcomplete: /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_ctup.t_self = currItem->heapTid; @@ -1132,6 +1174,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports * that there can be no more matching tuples in the current scan direction. * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * * Returns true if any matching items found on the page, false if none. */ static bool @@ -1154,6 +1200,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) page = BufferGetPage(so->currPos.buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* allow next page to be processed by parallel worker */ + if (scan->parallel_scan) + { + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, opaque->btpo_next); + else + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -1278,21 +1334,16 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, * if pinned, we'll drop the pin before moving to next page. The buffer is * not locked on entry. * - * On success exit, so->currPos is updated to contain data from the next - * interesting page.
For success on a scan using a non-MVCC snapshot we hold - * a pin, but not a read lock, on that page. If we do not hold the pin, we - * set so->currPos.buf to InvalidBuffer. We return TRUE to indicate success. - * - * If there are no more matching records in the given direction, we drop all - * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE. + * For success on a scan using a non-MVCC snapshot we hold a pin, but not a + * read lock, on that page. If we do not hold the pin, we set so->currPos.buf + * to InvalidBuffer. We return TRUE to indicate success. */ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel; - Page page; - BTPageOpaque opaque; + BlockNumber blkno = InvalidBlockNumber; + bool status = true; Assert(BTScanPosIsValid(so->currPos)); @@ -1319,25 +1370,103 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markItemIndex = -1; } - rel = scan->indexRelation; - if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ - /* We must rely on the previously saved nextPage link! */ - BlockNumber blkno = so->currPos.nextPage; + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the next block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so use the previously-saved nextPage link. */ + blkno = so->currPos.nextPage; + } /* Remember we left a page with data */ so->currPos.moreLeft = true; /* release the previous buffer, if pinned */ BTScanPosUnpinIfPinned(so->currPos); + } + else + { + /* Remember we left a page with data */ + so->currPos.moreRight = true; + + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the current block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + BTScanPosUnpinIfPinned(so->currPos); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so just use our own notion of the current page */ + blkno = so->currPos.currPage; + } + } + + if (!_bt_readnextpage(scan, blkno, dir)) + return false; + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + return true; +} +/* + * _bt_readnextpage() -- Read next page containing valid data for scan + * + * On success exit, so->currPos is updated to contain data from the next + * interesting page. Caller is responsible to release lock and pin on + * buffer on success. We return TRUE to indicate success. + * + * If there are no more matching records in the given direction, we drop all + * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE. 
+ */ +static bool +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel; + Page page; + BTPageOpaque opaque; + bool status = true; + + rel = scan->indexRelation; + + if (ScanDirectionIsForward(dir)) + { for (;;) { - /* if we're at end of scan, give up */ + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ if (blkno == P_NONE || !so->currPos.moreRight) { + _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; } @@ -1345,10 +1474,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) CHECK_FOR_INTERRUPTS(); /* step right one page */ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); - /* check for deleted page */ page = BufferGetPage(so->currPos.buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* check for deleted page */ if (!P_IGNORE(opaque)) { PredicateLockPage(rel, blkno, scan->xs_snapshot); @@ -1359,14 +1488,32 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) } /* nope, keep going */ - blkno = opaque->btpo_next; + if (scan->parallel_scan != NULL) + { + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + _bt_relbuf(rel, so->currPos.buf); + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + blkno = opaque->btpo_next; _bt_relbuf(rel, so->currPos.buf); } } else { - /* Remember we left a page with data */ - so->currPos.moreRight = true; + /* + * Should only happen in parallel cases, when some other backend + * advanced the scan. + */ + if (so->currPos.currPage != blkno) + { + BTScanPosUnpinIfPinned(so->currPos); + so->currPos.currPage = blkno; + } /* * Walk left to the next page with data. This is much more complex @@ -1401,6 +1548,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (!so->currPos.moreLeft) { _bt_relbuf(rel, so->currPos.buf); + _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; } @@ -1412,6 +1560,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) /* if we're physically at end of index, return failure */ if (so->currPos.buf == InvalidBuffer) { + _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; } @@ -1432,9 +1581,46 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) break; } + + /* + * For parallel scans, get the last page scanned as it is quite + * possible that by the time we try to seize the scan, some other + * worker has already advanced the scan to a different page. We + * must continue based on the latest page scanned by any worker. + */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, so->currPos.buf); + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + } } } + return true; +} + +/* + * _bt_parallel_readpage() -- Read current page containing valid data for scan + * + * On success, release lock and maybe pin on buffer. We return TRUE to + * indicate success. 
+ */ +static bool +_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + _bt_initialize_more_data(so, dir); + + if (!_bt_readnextpage(scan, blkno, dir)) + return false; + /* Drop the lock, and maybe the pin, on the current page */ _bt_drop_lock_and_maybe_pin(scan, &so->currPos); @@ -1712,19 +1898,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* remember which buffer we have pinned */ so->currPos.buf = buf; - /* initialize moreLeft/moreRight appropriately for scan direction */ - if (ScanDirectionIsForward(dir)) - { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; - } - else - { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; - } - so->numKilled = 0; /* just paranoia */ - so->markItemIndex = -1; /* ditto */ + _bt_initialize_more_data(so, dir); /* * Now load data from the first page of the scan. @@ -1753,3 +1927,25 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return true; } + +/* + * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately + * for scan direction + */ +static inline void +_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) +{ + /* initialize moreLeft/moreRight appropriately for scan direction */ + if (ScanDirectionIsForward(dir)) + { + so->currPos.moreLeft = false; + so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 99a014e8f4..3d041c47c0 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -55,7 +55,7 @@ * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. 
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -680,9 +680,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; - bool should_free, - should_free2, - load1; + bool load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); @@ -697,10 +695,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) */ /* the preparation of merge */ - itup = tuplesort_getindextuple(btspool->sortstate, - true, &should_free); - itup2 = tuplesort_getindextuple(btspool2->sortstate, - true, &should_free2); + itup = tuplesort_getindextuple(btspool->sortstate, true); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); indexScanKey = _bt_mkscankey_nodata(wstate->index); /* Prepare SortSupport data for each column */ @@ -775,18 +771,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (load1) { _bt_buildadd(wstate, state, itup); - if (should_free) - pfree(itup); - itup = tuplesort_getindextuple(btspool->sortstate, - true, &should_free); + itup = tuplesort_getindextuple(btspool->sortstate, true); } else { _bt_buildadd(wstate, state, itup2); - if (should_free2) - pfree(itup2); - itup2 = tuplesort_getindextuple(btspool2->sortstate, - true, &should_free2); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); } } pfree(sortKeys); @@ -795,15 +785,13 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, - true, &should_free)) != NULL) + true)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); - if (should_free) - pfree(itup); } } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5d335c7f97..5b259a31d9 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -3,7 +3,7 @@ * nbtutils.c * Utility code for Postgres btree implementation. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -232,10 +232,8 @@ _bt_preprocess_array_keys(IndexScanDesc scan) */ if (so->arrayContext == NULL) so->arrayContext = AllocSetContextCreate(CurrentMemoryContext, - "BTree Array Context", - ALLOCSET_SMALL_MINSIZE, - ALLOCSET_SMALL_INITSIZE, - ALLOCSET_SMALL_MAXSIZE); + "BTree array context", + ALLOCSET_SMALL_SIZES); else MemoryContextReset(so->arrayContext); @@ -592,6 +590,10 @@ _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) break; } + /* advance parallel scan */ + if (scan->parallel_scan != NULL) + _bt_parallel_advance_array_keys(scan); + return found; } diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c index 7d0bdabc1d..88e33f54cd 100644 --- a/src/backend/access/nbtree/nbtvalidate.c +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -3,7 +3,7 @@ * nbtvalidate.c * Opclass validator for btree. 
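The _bt_load() hunks above track an API change rather than a btree change: tuplesort_getindextuple() has lost its should_free output argument, so the returned tuple stays owned by the tuplesort and, under the new convention, is only safe to use until the next fetch. The difference for callers, using the signatures visible in this diff:

    /* Old convention: caller sometimes owned the tuple and had to free it. */
    bool        should_free;
    IndexTuple  itup = tuplesort_getindextuple(sortstate, true, &should_free);

    if (should_free)
        pfree(itup);

    /* New convention: never free; copy first if the tuple must outlive the
     * next tuplesort_getindextuple() call or tuplesort_end(). */
    IndexTuple  itup2 = tuplesort_getindextuple(sortstate, true);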
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -22,6 +22,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "utils/builtins.h" +#include "utils/regproc.h" #include "utils/syscache.h" diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index c536e22432..ac60db0d49 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -4,7 +4,7 @@ * WAL replay logic for btrees. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -14,8 +14,10 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/heapam_xlog.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xlogutils.h" @@ -1028,3 +1030,52 @@ btree_redo(XLogReaderState *record) elog(PANIC, "btree_redo: unknown op code %u", info); } } + +/* + * Mask a btree page before performing consistency checks on it. + */ +void +btree_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + BTPageOpaque maskopaq; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_ISDELETED(maskopaq)) + { + /* + * Mask page content on a DELETED page since it will be re-initialized + * during replay. See btree_xlog_unlink_page() for details. + */ + mask_page_content(page); + } + else if (P_ISLEAF(maskopaq)) + { + /* + * In btree leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * _bt_killitems(), _bt_check_unique() for details. + */ + mask_lp_flags(page); + } + + /* + * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See + * _bt_killitems(), _bt_check_unique() for details. + */ + maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* + * During replay of a btree page split, we don't set the BTP_SPLIT_END + * flag of the right sibling and initialize the cycle_id to 0 for the same + * page. See btree_xlog_split() for details. 
+ */ + maskopaq->btpo_flags &= ~BTP_SPLIT_END; + maskopaq->btpo_cycleid = 0; +} diff --git a/src/backend/access/rmgrdesc/brindesc.c b/src/backend/access/rmgrdesc/brindesc.c index 433526f5ec..637ebf30f8 100644 --- a/src/backend/access/rmgrdesc/brindesc.c +++ b/src/backend/access/rmgrdesc/brindesc.c @@ -3,7 +3,7 @@ * brindesc.c * rmgr descriptor routines for BRIN indexes * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -61,6 +61,13 @@ brin_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "targetBlk %u", xlrec->targetBlk); } + else if (info == XLOG_BRIN_DESUMMARIZE) + { + xl_brin_desummarize *xlrec = (xl_brin_desummarize *) rec; + + appendStringInfo(buf, "pagesPerRange %u, heapBlk %u, page offset %u", + xlrec->pagesPerRange, xlrec->heapBlk, xlrec->regOffset); + } } const char * @@ -91,6 +98,9 @@ brin_identify(uint8 info) case XLOG_BRIN_REVMAP_EXTEND: id = "REVMAP_EXTEND"; break; + case XLOG_BRIN_DESUMMARIZE: + id = "DESUMMARIZE"; + break; } return id; diff --git a/src/backend/access/rmgrdesc/clogdesc.c b/src/backend/access/rmgrdesc/clogdesc.c index 41ea254710..9181154ffd 100644 --- a/src/backend/access/rmgrdesc/clogdesc.c +++ b/src/backend/access/rmgrdesc/clogdesc.c @@ -3,7 +3,7 @@ * clogdesc.c * rmgr descriptor routines for access/transam/clog.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -23,12 +23,20 @@ clog_desc(StringInfo buf, XLogReaderState *record) char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - if (info == CLOG_ZEROPAGE || info == CLOG_TRUNCATE) + if (info == CLOG_ZEROPAGE) { int pageno; memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "%d", pageno); + appendStringInfo(buf, "page %d", pageno); + } + else if (info == CLOG_TRUNCATE) + { + xl_clog_truncate xlrec; + + memcpy(&xlrec, rec, sizeof(xl_clog_truncate)); + appendStringInfo(buf, "page %d; oldestXact %u", + xlrec.pageno, xlrec.oldestXact); } } diff --git a/src/backend/access/rmgrdesc/committsdesc.c b/src/backend/access/rmgrdesc/committsdesc.c index 527e5dc724..3e670bd543 100644 --- a/src/backend/access/rmgrdesc/committsdesc.c +++ b/src/backend/access/rmgrdesc/committsdesc.c @@ -3,7 +3,7 @@ * committsdesc.c * rmgr descriptor routines for access/transam/commit_ts.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -33,10 +33,10 @@ commit_ts_desc(StringInfo buf, XLogReaderState *record) } else if (info == COMMIT_TS_TRUNCATE) { - int pageno; + xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) rec; - memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "%d", pageno); + appendStringInfo(buf, "pageno %d, oldestXid %u", + trunc->pageno, trunc->oldestXid); } else if (info == COMMIT_TS_SETTS) { diff --git a/src/backend/access/rmgrdesc/dbasedesc.c b/src/backend/access/rmgrdesc/dbasedesc.c index 83720ce765..768242cfd5 100644 --- a/src/backend/access/rmgrdesc/dbasedesc.c +++ b/src/backend/access/rmgrdesc/dbasedesc.c @@ -3,7 +3,7 @@ * dbasedesc.c * rmgr descriptor routines for commands/dbcommands.c * - * Portions Copyright (c) 
1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/rmgrdesc/genericdesc.c b/src/backend/access/rmgrdesc/genericdesc.c index 22f81570a5..c4705428f1 100644 --- a/src/backend/access/rmgrdesc/genericdesc.c +++ b/src/backend/access/rmgrdesc/genericdesc.c @@ -4,7 +4,7 @@ * rmgr descriptor routines for access/transam/generic_xlog.c * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/rmgrdesc/genericdesc.c diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index db832a5f78..df51f3ce1f 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -3,7 +3,7 @@ * gindesc.c * rmgr descriptor routines for access/transam/gin/ginxlog.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xlogutils.h" #include "lib/stringinfo.h" #include "storage/relfilenode.h" @@ -87,13 +87,13 @@ gin_desc(StringInfo buf, XLogReaderState *record) case XLOG_GIN_INSERT: { ginxlogInsert *xlrec = (ginxlogInsert *) rec; - char *payload = rec + sizeof(ginxlogInsert); appendStringInfo(buf, "isdata: %c isleaf: %c", (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F'); if (!(xlrec->flags & GIN_INSERT_ISLEAF)) { + char *payload = rec + sizeof(ginxlogInsert); BlockNumber leftChildBlkno; BlockNumber rightChildBlkno; @@ -104,27 +104,32 @@ gin_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, " children: %u/%u", leftChildBlkno, rightChildBlkno); } - if (!(xlrec->flags & GIN_INSERT_ISDATA)) - appendStringInfo(buf, " isdelete: %c", - (((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F'); - else if (xlrec->flags & GIN_INSERT_ISLEAF) + if (XLogRecHasBlockImage(record, 0)) { - ginxlogRecompressDataLeaf *insertData = - (ginxlogRecompressDataLeaf *) payload; - - if (XLogRecHasBlockImage(record, 0)) + if (XLogRecBlockImageApply(record, 0)) appendStringInfoString(buf, " (full page image)"); else - desc_recompress_leaf(buf, insertData); + appendStringInfoString(buf, " (full page image, for WAL verification)"); } else { - ginxlogInsertDataInternal *insertData = (ginxlogInsertDataInternal *) payload; + char *payload = XLogRecGetBlockData(record, 0, NULL); + + if (!(xlrec->flags & GIN_INSERT_ISDATA)) + appendStringInfo(buf, " isdelete: %c", + (((ginxlogInsertEntry *) payload)->isDelete) ? 
'T' : 'F'); + else if (xlrec->flags & GIN_INSERT_ISLEAF) + desc_recompress_leaf(buf, (ginxlogRecompressDataLeaf *) payload); + else + { + ginxlogInsertDataInternal *insertData = + (ginxlogInsertDataInternal *) payload; - appendStringInfo(buf, " pitem: %u-%u/%u", + appendStringInfo(buf, " pitem: %u-%u/%u", PostingItemGetBlockNumber(&insertData->newitem), ItemPointerGetBlockNumber(&insertData->newitem.key), - ItemPointerGetOffsetNumber(&insertData->newitem.key)); + ItemPointerGetOffsetNumber(&insertData->newitem.key)); + } } } break; @@ -144,12 +149,20 @@ gin_desc(StringInfo buf, XLogReaderState *record) break; case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: { - ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec; - if (XLogRecHasBlockImage(record, 0)) - appendStringInfoString(buf, " (full page image)"); + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } else + { + ginxlogVacuumDataLeafPage *xlrec = + (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, NULL); + desc_recompress_leaf(buf, &xlrec->data); + } } break; case XLOG_GIN_DELETE_PAGE: diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index 90bb88109f..dc0506913c 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -3,7 +3,7 @@ * gistdesc.c * rmgr descriptor routines for access/gist/gistxlog.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/gist_private.h" +#include "access/gistxlog.h" #include "lib/stringinfo.h" #include "storage/relfilenode.h" diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c index d37c9b1aae..35d86dc893 100644 --- a/src/backend/access/rmgrdesc/hashdesc.c +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -3,7 +3,7 @@ * hashdesc.c * rmgr descriptor routines for access/hash/hash.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,15 +14,158 @@ */ #include "postgres.h" -#include "access/hash.h" +#include "access/hash_xlog.h" void hash_desc(StringInfo buf, XLogReaderState *record) { + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + { + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec; + + appendStringInfo(buf, "num_tuples %g, fillfactor %d", + xlrec->num_tuples, xlrec->ffactor); + break; + } + case XLOG_HASH_INIT_BITMAP_PAGE: + { + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec; + + appendStringInfo(buf, "bmsize %d", xlrec->bmsize); + break; + } + case XLOG_HASH_INSERT: + { + xl_hash_insert *xlrec = (xl_hash_insert *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + break; + } + case XLOG_HASH_ADD_OVFL_PAGE: + { + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec; + + appendStringInfo(buf, "bmsize %d, bmpage_found %c", + xlrec->bmsize, (xlrec->bmpage_found) ? 
'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + { + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec; + + appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c", + xlrec->new_bucket, + (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F', + (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_COMPLETE: + { + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec; + + appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u", + xlrec->old_bucket_flag, xlrec->new_bucket_flag); + break; + } + case XLOG_HASH_MOVE_PAGE_CONTENTS: + { + xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec; + + appendStringInfo(buf, "ntups %d, is_primary %c", + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_SQUEEZE_PAGE: + { + xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec; + + appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c", + xlrec->prevblkno, + xlrec->nextblkno, + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_DELETE: + { + xl_hash_delete *xlrec = (xl_hash_delete *) rec; + + appendStringInfo(buf, "clear_dead_marking %c, is_primary %c", + xlrec->clear_dead_marking ? 'T' : 'F', + xlrec->is_primary_bucket_page ? 'T' : 'F'); + break; + } + case XLOG_HASH_UPDATE_META_PAGE: + { + xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec; + + appendStringInfo(buf, "ntuples %g", + xlrec->ntuples); + break; + } + case XLOG_HASH_VACUUM_ONE_PAGE: + { + xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec; + + appendStringInfo(buf, "ntuples %d", + xlrec->ntuples); + break; + } + } } const char * hash_identify(uint8 info) { - return NULL; + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HASH_INIT_META_PAGE: + id = "INIT_META_PAGE"; + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + id = "INIT_BITMAP_PAGE"; + break; + case XLOG_HASH_INSERT: + id = "INSERT"; + break; + case XLOG_HASH_ADD_OVFL_PAGE: + id = "ADD_OVFL_PAGE"; + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + id = "SPLIT_ALLOCATE_PAGE"; + break; + case XLOG_HASH_SPLIT_PAGE: + id = "SPLIT_PAGE"; + break; + case XLOG_HASH_SPLIT_COMPLETE: + id = "SPLIT_COMPLETE"; + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + id = "MOVE_PAGE_CONTENTS"; + break; + case XLOG_HASH_SQUEEZE_PAGE: + id = "SQUEEZE_PAGE"; + break; + case XLOG_HASH_DELETE: + id = "DELETE"; + break; + case XLOG_HASH_SPLIT_CLEANUP: + id = "SPLIT_CLEANUP"; + break; + case XLOG_HASH_UPDATE_META_PAGE: + id = "UPDATE_META_PAGE"; + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + id = "VACUUM_ONE_PAGE"; + } + + return id; } diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 7c763b6b0e..44d2d6333f 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -3,7 +3,7 @@ * heapdesc.c * rmgr descriptor routines for access/heap/heapam.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/rmgrdesc/logicalmsgdesc.c b/src/backend/access/rmgrdesc/logicalmsgdesc.c index 525826efd3..8287751e48 100644 --- a/src/backend/access/rmgrdesc/logicalmsgdesc.c +++ b/src/backend/access/rmgrdesc/logicalmsgdesc.c @@ -3,7 +3,7 @@ * 
logicalmsgdesc.c * rmgr descriptor routines for replication/logical/message.c * - * Portions Copyright (c) 2015-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2017, PostgreSQL Global Development Group * * * IDENTIFICATION diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 27c1fb0fc0..9c17447744 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -3,7 +3,7 @@ * mxactdesc.c * rmgr descriptor routines for access/transam/multixact.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 7631cb5c73..fbde9d6555 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -3,7 +3,7 @@ * nbtdesc.c * rmgr descriptor routines for access/nbtree/nbtxlog.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/nbtree.h" +#include "access/nbtxlog.h" void btree_desc(StringInfo buf, XLogReaderState *record) diff --git a/src/backend/access/rmgrdesc/relmapdesc.c b/src/backend/access/rmgrdesc/relmapdesc.c index 097a709fb2..4cbdf37c70 100644 --- a/src/backend/access/rmgrdesc/relmapdesc.c +++ b/src/backend/access/rmgrdesc/relmapdesc.c @@ -3,7 +3,7 @@ * relmapdesc.c * rmgr descriptor routines for utils/cache/relmapper.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c index ce8ba10c14..c43f850f8e 100644 --- a/src/backend/access/rmgrdesc/replorigindesc.c +++ b/src/backend/access/rmgrdesc/replorigindesc.c @@ -3,7 +3,7 @@ * replorigindesc.c * rmgr descriptor routines for replication/logical/origin.c * - * Portions Copyright (c) 2015-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2017, PostgreSQL Global Development Group * * * IDENTIFICATION diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c index 5855715735..2209f7284e 100644 --- a/src/backend/access/rmgrdesc/seqdesc.c +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -3,7 +3,7 @@ * seqdesc.c * rmgr descriptor routines for commands/sequence.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index f36b1099a9..7ad46bcf7a 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -3,7 +3,7 @@ * smgrdesc.c * rmgr descriptor routines for catalog/storage.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * 
diff --git a/src/backend/access/rmgrdesc/spgdesc.c b/src/backend/access/rmgrdesc/spgdesc.c index 8be47f9526..24d6cb58fd 100644 --- a/src/backend/access/rmgrdesc/spgdesc.c +++ b/src/backend/access/rmgrdesc/spgdesc.c @@ -3,7 +3,7 @@ * spgdesc.c * rmgr descriptor routines for access/spgist/spgxlog.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/spgist_private.h" +#include "access/spgxlog.h" void spg_desc(StringInfo buf, XLogReaderState *record) diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c index 13797a3d2f..278546a728 100644 --- a/src/backend/access/rmgrdesc/standbydesc.c +++ b/src/backend/access/rmgrdesc/standbydesc.c @@ -3,7 +3,7 @@ * standbydesc.c * rmgr descriptor routines for storage/ipc/standby.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -122,12 +122,10 @@ standby_desc_invalidations(StringInfo buf, appendStringInfoString(buf, " smgr"); /* not expected, but print something anyway */ else if (msg->id == SHAREDINVALRELMAP_ID) - appendStringInfoString(buf, " relmap"); - else if (msg->id == SHAREDINVALRELMAP_ID) appendStringInfo(buf, " relmap db %u", msg->rm.dbId); else if (msg->id == SHAREDINVALSNAPSHOT_ID) appendStringInfo(buf, " snapshot %u", msg->sn.relId); else - appendStringInfo(buf, " unknown id %d", msg->id); + appendStringInfo(buf, " unrecognized id %d", msg->id); } } diff --git a/src/backend/access/rmgrdesc/tblspcdesc.c b/src/backend/access/rmgrdesc/tblspcdesc.c index 15440271a3..47c42328f3 100644 --- a/src/backend/access/rmgrdesc/tblspcdesc.c +++ b/src/backend/access/rmgrdesc/tblspcdesc.c @@ -3,7 +3,7 @@ * tblspcdesc.c * rmgr descriptor routines for commands/tablespace.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index d2965b70b7..063e66834d 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -3,7 +3,7 @@ * xactdesc.c * rmgr descriptor routines for access/transam/xact.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -26,7 +26,7 @@ * understand format. * * These routines are in xactdesc.c because they're accessed in backend (when - * replaying WAL) and frontend (pg_xlogdump) code. This file is the only xact + * replaying WAL) and frontend (pg_waldump) code. This file is the only xact * specific one shared between both. They're complicated enough that * duplication would be bothersome.
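The rmgrdesc hunks above all follow the same convention: each resource manager pairs a *_desc() routine, which prints the payload of one WAL record, with a *_identify() routine, which maps the record's info bits to a symbolic name for tools such as pg_waldump. A minimal pair in that style; XLOG_FOO_BAR and xl_foo_bar are invented purely for illustration, while the XLogReader calls are the real ones used throughout these files.

    typedef struct xl_foo_bar       /* made-up WAL record payload */
    {
        BlockNumber blkno;
    } xl_foo_bar;

    void
    foo_desc(StringInfo buf, XLogReaderState *record)
    {
        char       *rec = XLogRecGetData(record);
        uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

        if (info == XLOG_FOO_BAR)   /* made-up info code */
        {
            xl_foo_bar *xlrec = (xl_foo_bar *) rec;

            appendStringInfo(buf, "blkno %u", xlrec->blkno);
        }
    }

    const char *
    foo_identify(uint8 info)
    {
        switch (info & ~XLR_INFO_MASK)
        {
            case XLOG_FOO_BAR:
                return "FOO_BAR";
        }
        return NULL;                /* caller reports unrecognized info */
    }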
*/ diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 62ed1dc04b..5f07eb1499 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -3,7 +3,7 @@ * xlogdesc.c * rmgr descriptor routines for access/transam/xlog.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index f090ca528b..90c6534139 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -4,7 +4,7 @@ * implementation of insert algorithm * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "storage/bufmgr.h" @@ -1705,17 +1706,40 @@ spgSplitNodeAction(Relation index, SpGistState *state, /* Should not be applied to nulls */ Assert(!SpGistPageStoresNulls(current->page)); + /* Check opclass gave us sane values */ + if (out->result.splitTuple.prefixNNodes <= 0 || + out->result.splitTuple.prefixNNodes > SGITMAXNNODES) + elog(ERROR, "invalid number of prefix nodes: %d", + out->result.splitTuple.prefixNNodes); + if (out->result.splitTuple.childNodeN < 0 || + out->result.splitTuple.childNodeN >= + out->result.splitTuple.prefixNNodes) + elog(ERROR, "invalid child node number: %d", + out->result.splitTuple.childNodeN); + /* - * Construct new prefix tuple, containing a single node with the specified - * label. (We'll update the node's downlink to point to the new postfix - * tuple, below.) + * Construct new prefix tuple with requested number of nodes. We'll fill + * in the childNodeN'th node's downlink below. */ - node = spgFormNodeTuple(state, out->result.splitTuple.nodeLabel, false); + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * + out->result.splitTuple.prefixNNodes); + + for (i = 0; i < out->result.splitTuple.prefixNNodes; i++) + { + Datum label = (Datum) 0; + bool labelisnull; + + labelisnull = (out->result.splitTuple.prefixNodeLabels == NULL); + if (!labelisnull) + label = out->result.splitTuple.prefixNodeLabels[i]; + nodes[i] = spgFormNodeTuple(state, label, labelisnull); + } prefixTuple = spgFormInnerTuple(state, out->result.splitTuple.prefixHasPrefix, out->result.splitTuple.prefixPrefixDatum, - 1, &node); + out->result.splitTuple.prefixNNodes, + nodes); /* it must fit in the space that innerTuple now occupies */ if (prefixTuple->size > innerTuple->size) @@ -1807,10 +1831,12 @@ spgSplitNodeAction(Relation index, SpGistState *state, * the postfix tuple first.) We have to update the local copy of the * prefixTuple too, because that's what will be written to WAL. 
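With the spgSplitNodeAction() changes above, a split-tuple request from an opclass can now carry several prefix nodes at once: the choose function supplies prefixNNodes, an optional prefixNodeLabels array, and childNodeN, the node that receives the downlink to the postfix tuple. A hedged sketch of what a choose() implementation might fill in; the field names come from this patch, while the two-node split and its labels are invented, with out being the spgChooseOut the opclass returns.

    /* Sketch: requesting a two-node split from inside an opclass choose(). */
    out->resultType = spgSplitTuple;
    out->result.splitTuple.prefixHasPrefix = false;
    out->result.splitTuple.prefixNNodes = 2;
    out->result.splitTuple.prefixNodeLabels =
        (Datum *) palloc(2 * sizeof(Datum));
    out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(0);  /* invented */
    out->result.splitTuple.prefixNodeLabels[1] = Int16GetDatum(1);  /* invented */
    out->result.splitTuple.childNodeN = 1;      /* downlink goes to node 1 */
    out->result.splitTuple.postfixHasPrefix = false;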
*/ - spgUpdateNodeLink(prefixTuple, 0, postfixBlkno, postfixOffset); + spgUpdateNodeLink(prefixTuple, out->result.splitTuple.childNodeN, + postfixBlkno, postfixOffset); prefixTuple = (SpGistInnerTuple) PageGetItem(current->page, PageGetItemId(current->page, current->offnum)); - spgUpdateNodeLink(prefixTuple, 0, postfixBlkno, postfixOffset); + spgUpdateNodeLink(prefixTuple, out->result.splitTuple.childNodeN, + postfixBlkno, postfixOffset); MarkBufferDirty(current->buffer); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 44fd644e42..9a37259916 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -5,7 +5,7 @@ * * All the actual insertion logic is in spgdoinsert.c. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -18,6 +18,7 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "catalog/index.h" @@ -134,9 +135,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST build temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, spgistBuildCallback, (void *) &buildstate); @@ -163,13 +162,18 @@ spgbuildempty(Relation index) page = (Page) palloc(BLCKSZ); SpGistInitMetapage(page); - /* Write the page. If archiving/streaming, XLOG it. */ + /* + * Write the page and log it unconditionally. This is important + * particularly for indexes created on tablespaces and databases whose + * creation happened after the last redo pointer as recovery removes any + * of their existing content when the corresponding create records are + * replayed. + */ PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, (char *) page, true); - if (XLogIsNeeded()) - log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_METAPAGE_BLKNO, page, false); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_METAPAGE_BLKNO, page, false); /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); @@ -177,9 +181,8 @@ spgbuildempty(Relation index) PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO, (char *) page, true); - if (XLogIsNeeded()) - log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_ROOT_BLKNO, page, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_ROOT_BLKNO, page, true); /* Likewise for the null-tuples root page. 
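btbuildempty() earlier in this patch and spgbuildempty() here converge on one recipe for every init-fork page: set the checksum, write the page through smgr, WAL-log it unconditionally, and force an immediate sync once all pages are written. Distilled from the hunks above, with blkno standing for whichever block is being written and the last log_newpage() argument saying whether the page has a standard layout:

    /* Per-page pattern shared by the *buildempty() functions in this patch. */
    PageSetChecksumInplace(page, blkno);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, blkno, (char *) page, true);
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                blkno, page, true);     /* logged even when wal_level=minimal */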
*/ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); @@ -187,9 +190,8 @@ spgbuildempty(Relation index) PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO, (char *) page, true); - if (XLogIsNeeded()) - log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_NULL_BLKNO, page, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_NULL_BLKNO, page, true); /* * An immediate sync is required even if we xlog'd the pages, because the @@ -205,7 +207,8 @@ spgbuildempty(Relation index) bool spginsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique) + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) { SpGistState spgstate; MemoryContext oldCtx; @@ -213,9 +216,7 @@ spginsert(Relation index, Datum *values, bool *isnull, insertCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST insert temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldCtx = MemoryContextSwitchTo(insertCtx); initSpGistState(&spgstate, index); diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c index 1ab93350e1..9a2649bf2a 100644 --- a/src/backend/access/spgist/spgkdtreeproc.c +++ b/src/backend/access/spgist/spgkdtreeproc.c @@ -4,7 +4,7 @@ * implementation of k-d tree over points for SP-GiST * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c index 40ab760b0f..6ad73f448d 100644 --- a/src/backend/access/spgist/spgquadtreeproc.c +++ b/src/backend/access/spgist/spgquadtreeproc.c @@ -4,7 +4,7 @@ * implementation of quad tree over points for SP-GiST * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 6f9e223f43..2d96c0094e 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -4,7 +4,7 @@ * routines for scanning SP-GiST indexes * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -92,11 +92,11 @@ resetSpGistScanOpaque(SpGistScanOpaque so) if (so->want_itup) { - /* Must pfree IndexTuples to avoid memory leak */ + /* Must pfree reconstructed tuples to avoid memory leak */ int i; for (i = 0; i < so->nPtrs; i++) - pfree(so->indexTups[i]); + pfree(so->reconTups[i]); } so->iPtr = so->nPtrs = 0; } @@ -193,12 +193,10 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) initSpGistState(&so->state, scan->indexRelation); so->tempCxt = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST search temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); - /* Set up indexTupDesc and xs_itupdesc in case it's an index-only scan */ - so->indexTupDesc = scan->xs_itupdesc = RelationGetDescr(rel); 
+ /* Set up indexTupDesc and xs_hitupdesc in case it's an index-only scan */ + so->indexTupDesc = scan->xs_hitupdesc = RelationGetDescr(rel); scan->opaque = so; @@ -593,12 +591,12 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, if (so->want_itup) { /* - * Reconstruct desired IndexTuple. We have to copy the datum out of - * the temp context anyway, so we may as well create the tuple here. + * Reconstruct index data. We have to copy the datum out of the temp + * context anyway, so we may as well create the tuple here. */ - so->indexTups[so->nPtrs] = index_form_tuple(so->indexTupDesc, - &leafValue, - &isnull); + so->reconTups[so->nPtrs] = heap_form_tuple(so->indexTupDesc, + &leafValue, + &isnull); } so->nPtrs++; } @@ -621,18 +619,18 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) /* continuing to return tuples from a leaf page */ scan->xs_ctup.t_self = so->heapPtrs[so->iPtr]; scan->xs_recheck = so->recheck[so->iPtr]; - scan->xs_itup = so->indexTups[so->iPtr]; + scan->xs_hitup = so->reconTups[so->iPtr]; so->iPtr++; return true; } if (so->want_itup) { - /* Must pfree IndexTuples to avoid memory leak */ + /* Must pfree reconstructed tuples to avoid memory leak */ int i; for (i = 0; i < so->nPtrs; i++) - pfree(so->indexTups[i]); + pfree(so->reconTups[i]); } so->iPtr = so->nPtrs = 0; diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c index e0d8f30ef1..53f298b6c2 100644 --- a/src/backend/access/spgist/spgtextproc.c +++ b/src/backend/access/spgist/spgtextproc.c @@ -29,7 +29,7 @@ * No new entries ever get pushed into a -2-labeled child, either. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -45,6 +45,7 @@ #include "utils/builtins.h" #include "utils/datum.h" #include "utils/pg_locale.h" +#include "utils/varlena.h" /* @@ -212,9 +213,14 @@ spg_text_choose(PG_FUNCTION_ARGS) out->result.splitTuple.prefixPrefixDatum = formTextDatum(prefixStr, commonLen); } - out->result.splitTuple.nodeLabel = + out->result.splitTuple.prefixNNodes = 1; + out->result.splitTuple.prefixNodeLabels = + (Datum *) palloc(sizeof(Datum)); + out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(*(unsigned char *) (prefixStr + commonLen)); + out->result.splitTuple.childNodeN = 0; + if (prefixSize - commonLen == 1) { out->result.splitTuple.postfixHasPrefix = false; @@ -280,7 +286,10 @@ spg_text_choose(PG_FUNCTION_ARGS) out->resultType = spgSplitTuple; out->result.splitTuple.prefixHasPrefix = in->hasPrefix; out->result.splitTuple.prefixPrefixDatum = in->prefixDatum; - out->result.splitTuple.nodeLabel = Int16GetDatum(-2); + out->result.splitTuple.prefixNNodes = 1; + out->result.splitTuple.prefixNodeLabels = (Datum *) palloc(sizeof(Datum)); + out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(-2); + out->result.splitTuple.childNodeN = 0; out->result.splitTuple.postfixHasPrefix = false; } else @@ -559,8 +568,9 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS) leafValue = DatumGetTextPP(in->leafDatum); + /* As above, in->reconstructedValue isn't toasted or short. */ if (DatumGetPointer(in->reconstructedValue)) - reconstrValue = DatumGetTextP(in->reconstructedValue); + reconstrValue = (text *) DatumGetPointer(in->reconstructedValue); Assert(reconstrValue == NULL ? 
level == 0 : VARSIZE_ANY_EXHDR(reconstrValue) == level); diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index d570ae5992..e57ac49c6b 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -4,7 +4,7 @@ * various support functions for SP-GiST * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -22,6 +22,7 @@ #include "storage/bufmgr.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/lsyscache.h" @@ -48,6 +49,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amstorage = false; amroutine->amclusterable = false; amroutine->ampredlocks = false; + amroutine->amcanparallel = false; amroutine->amkeytype = InvalidOid; amroutine->ambuild = spgbuild; @@ -67,6 +69,9 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amendscan = spgendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 15b867f24c..cce9b3f618 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -4,7 +4,7 @@ * vacuum for SP-GiST * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/transam.h" #include "access/xloginsert.h" #include "catalog/storage_xlog.h" diff --git a/src/backend/access/spgist/spgvalidate.c b/src/backend/access/spgist/spgvalidate.c index 6297111a7c..1bc5bce72e 100644 --- a/src/backend/access/spgist/spgvalidate.c +++ b/src/backend/access/spgist/spgvalidate.c @@ -3,7 +3,7 @@ * spgvalidate.c * Opclass validator for SP-GiST. 
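The IndexAmRoutine additions in the spghandler() hunk above follow the usual handler pattern: an access method that cannot participate in parallel scans reports amcanparallel = false and leaves the three new parallel-scan callbacks unset. A minimal sketch for a hypothetical handler (names illustrative, not part of this patch):

    Datum
    myamhandler(PG_FUNCTION_ARGS)
    {
        IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

        /* ... capability flags as before ... */
        amroutine->amcanparallel = false;   /* no parallel index scans */

        /* the parallel-scan hooks stay NULL when unsupported */
        amroutine->amestimateparallelscan = NULL;
        amroutine->aminitparallelscan = NULL;
        amroutine->amparallelrescan = NULL;

        PG_RETURN_POINTER(amroutine);
    }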
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -22,6 +22,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "utils/builtins.h" +#include "utils/regproc.h" #include "utils/syscache.h" diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 01a4e0f252..c007601efd 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -4,7 +4,7 @@ * WAL replay logic for SP-GiST * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -14,7 +14,9 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xlogutils.h" @@ -1014,9 +1016,7 @@ spg_xlog_startup(void) { opCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); } void @@ -1025,3 +1025,23 @@ spg_xlog_cleanup(void) MemoryContextDelete(opCtx); opCtx = NULL; } + +/* + * Mask a SpGist page before performing consistency checks on it. + */ +void +spg_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + + /* + * Any SpGist page other than meta contains unused space which needs to be + * masked. + */ + if (!SpGistPageIsMeta(page)) + mask_unused_space(page); +} diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c index bc45fd1bed..5f6d478159 100644 --- a/src/backend/access/tablesample/bernoulli.c +++ b/src/backend/access/tablesample/bernoulli.c @@ -13,7 +13,7 @@ * cutoff value computed from the selection probability by BeginSampleScan. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index 65f8c58d67..e270cbc4a0 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -13,7 +13,7 @@ * cutoff value computed from the selection probability by BeginSampleScan. 
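The spg_mask() routine added above is one instance of the new per-resource-manager mask callback used by WAL consistency checking: before a replayed page is compared against the master's image, fields that may legitimately differ are blanked. The general shape, assuming only the bufmask.h helpers this patch introduces:

    #include "access/bufmask.h"

    /* hypothetical mask callback; a real AM adds its page-specific rules */
    void
    my_mask(char *pagedata, BlockNumber blkno)
    {
        Page    page = (Page) pagedata;

        mask_page_lsn(page);        /* LSNs differ between master and standby */
        mask_page_hint_bits(page);  /* hint bits are set without WAL logging */
        mask_unused_space(page);    /* the pd_lower..pd_upper hole is unlogged */
    }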
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c index d96aa5b2a3..10d2bc91b3 100644 --- a/src/backend/access/tablesample/tablesample.c +++ b/src/backend/access/tablesample/tablesample.c @@ -3,7 +3,7 @@ * tablesample.c * Support functions for TABLESAMPLE feature * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 4ae4715339..e7dd19fd7b 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -331,17 +331,17 @@ of the xid fields is atomic, so assuming it for xmin as well is no extra risk. -pg_clog and pg_subtrans +pg_xact and pg_subtrans ----------------------- -pg_clog and pg_subtrans are permanent (on-disk) storage of transaction related +pg_xact and pg_subtrans are permanent (on-disk) storage of transaction related information. There is a limited number of pages of each kept in memory, so in many cases there is no need to actually read from disk. However, if there's a long running transaction or a backend sitting idle with an open transaction, it may be necessary to be able to read and write this information from disk. They also allow information to be permanent across server restarts. -pg_clog records the commit status for each transaction that has been assigned +pg_xact records the commit status for each transaction that has been assigned an XID. A transaction can be in progress, committed, aborted, or "sub-committed". This last state means that it's a subtransaction that's no longer running, but its parent has not updated its state yet. It is not @@ -381,9 +381,9 @@ each transaction we keep a "cache" of Xids that are known to be part of the transaction tree, so we can skip looking at pg_subtrans unless we know the cache has been overflowed. See storage/ipc/procarray.c for the gory details. -slru.c is the supporting mechanism for both pg_clog and pg_subtrans. It +slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It implements the LRU policy for in-memory buffer pages. The high-level routines -for pg_clog are implemented in transam.c, while the low-level functions are in +for pg_xact are implemented in transam.c, while the low-level functions are in clog.c. pg_subtrans is contained completely in subtrans.c. diff --git a/src/backend/access/transam/README.parallel b/src/backend/access/transam/README.parallel index db9ac3d504..5c33c40ae9 100644 --- a/src/backend/access/transam/README.parallel +++ b/src/backend/access/transam/README.parallel @@ -154,7 +154,7 @@ parallelism was started before all parallel workers have exited; and it's even more clearly crazy for a parallel worker to try to subcommit or subabort the current subtransaction and execute in some other transaction context than was present in the initiating backend. It might be practical to allow internal -sub-transactions (e.g. to implement a PL/pgsql EXCEPTION block) to be used in +sub-transactions (e.g. 
to implement a PL/pgSQL EXCEPTION block) to be used in parallel mode, provided that they are XID-less, because other backends wouldn't really need to know about those transactions or do anything differently because of them. Right now, we don't even allow that. @@ -198,7 +198,7 @@ pattern looks like this: EnterParallelMode(); /* prohibit unsafe state changes */ - pcxt = CreateParallelContext(entrypoint, nworkers); + pcxt = CreateParallelContext("library_name", "function_name", nworkers); /* Allow space for application-specific data here. */ shm_toc_estimate_chunk(&pcxt->estimator, size); diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 0fcccfc3a7..9c6964d79c 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -24,7 +24,7 @@ * be that such transactions failed anyway. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -84,7 +84,8 @@ static SlruCtlData ClogCtlData; static int ZeroCLOGPage(int pageno, bool writeXlog); static bool CLOGPagePrecedes(int page1, int page2); static void WriteZeroPageXlogRec(int pageno); -static void WriteTruncateXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, + Oid oldestXidDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn, int pageno); @@ -460,7 +461,7 @@ CLOGShmemInit(void) { ClogCtl->PagePrecedes = CLOGPagePrecedes; SimpleLruInit(ClogCtl, "clog", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, - CLogControlLock, "pg_clog", LWTRANCHE_CLOG_BUFFERS); + CLogControlLock, "pg_xact", LWTRANCHE_CLOG_BUFFERS); } /* @@ -586,6 +587,13 @@ ShutdownCLOG(void) /* Flush dirty CLOG pages to disk */ TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false); SimpleLruFlush(ClogCtl, false); + + /* + * fsync pg_xact to ensure that any files flushed previously are durably + * on disk. + */ + fsync_fname("pg_xact", true); + TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false); } @@ -598,6 +606,13 @@ CheckPointCLOG(void) /* Flush dirty CLOG pages to disk */ TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); SimpleLruFlush(ClogCtl, true); + + /* + * fsync pg_xact to ensure that any files flushed previously are durably + * on disk. + */ + fsync_fname("pg_xact", true); + TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); } @@ -703,7 +718,7 @@ ExtendCLOG(TransactionId newestXact) * the XLOG flush unless we have confirmed that there is a removable segment. */ void -TruncateCLOG(TransactionId oldestXact) +TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) { int cutoffPage; @@ -717,8 +732,26 @@ TruncateCLOG(TransactionId oldestXact) if (!SlruScanDirectory(ClogCtl, SlruScanDirCbReportPresence, &cutoffPage)) return; /* nothing to remove */ - /* Write XLOG record and flush XLOG to disk */ - WriteTruncateXlogRec(cutoffPage); + /* + * Advance oldestClogXid before truncating clog, so concurrent xact status + * lookups can ensure they don't attempt to access truncated-away clog. + * + * It's only necessary to do this if we will actually truncate away clog + * pages. 
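The fsync_fname("pg_xact", true) calls added to ShutdownCLOG() and CheckPointCLOG() above close a durability gap: SimpleLruFlush() fsyncs the segment files themselves, but a newly created segment is only durably linked once its directory entry is flushed as well. Sketched for a hypothetical SLRU (names illustrative):

    static void
    CheckPointMySlru(void)
    {
        /* write back dirty pages, fsyncing each segment file */
        SimpleLruFlush(MySlruCtl, true);

        /* then fsync the containing directory (second argument: isdir) */
        fsync_fname("pg_myslru", true);
    }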
+ */ + AdvanceOldestClogXid(oldestXact); + + /* vac_truncate_clog already advanced oldestXid */ + Assert(TransactionIdPrecedesOrEquals(oldestXact, + ShmemVariableCache->oldestXid)); + + /* + * Write XLOG record and flush XLOG to disk. We record the oldest xid + * we're keeping information about here so we can ensure that it's always + * ahead of clog truncation in case we crash, and so a standby finds out + * the new valid xid before the next checkpoint. + */ + WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); /* Now we can remove the old CLOG segment(s) */ SimpleLruTruncate(ClogCtl, cutoffPage); @@ -767,12 +800,17 @@ WriteZeroPageXlogRec(int pageno) * in TruncateCLOG(). */ static void -WriteTruncateXlogRec(int pageno) +WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb) { XLogRecPtr recptr; + xl_clog_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXact = oldestXact; + xlrec.oldestXactDb = oldestXactDb; XLogBeginInsert(); - XLogRegisterData((char *) (&pageno), sizeof(int)); + XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate)); recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); XLogFlush(recptr); } @@ -805,17 +843,19 @@ clog_redo(XLogReaderState *record) } else if (info == CLOG_TRUNCATE) { - int pageno; + xl_clog_truncate xlrec; - memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate)); /* * During XLOG replay, latest_page_number isn't set up yet; insert a * suitable value to bypass the sanity test in SimpleLruTruncate. */ - ClogCtl->shared->latest_page_number = pageno; + ClogCtl->shared->latest_page_number = xlrec.pageno; + + AdvanceOldestClogXid(xlrec.oldestXact); - SimpleLruTruncate(ClogCtl, pageno); + SimpleLruTruncate(ClogCtl, xlrec.pageno); } else elog(PANIC, "clog_redo: unknown op code %u", info); diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 35b7451e6f..cb75430bc9 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -3,7 +3,7 @@ * commit_ts.c * PostgreSQL commit timestamp manager * - * This module is a pg_clog-like system that stores the commit timestamp + * This module is a pg_xact-like system that stores the commit timestamp * for each transaction. * * XLOG interactions: this module generates an XLOG record whenever a new @@ -15,7 +15,7 @@ * re-perform the status update on redo; so we need make no additional XLOG * entry here. 
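The truncation record written by WriteTruncateXlogRec() above replaces the bare page number with a small struct. Its layout, inferred from the field assignments in this hunk (the actual declaration lives in a header outside this excerpt):

    typedef struct xl_clog_truncate
    {
        int             pageno;       /* cutoff page passed to SimpleLruTruncate() */
        TransactionId   oldestXact;   /* new oldest XID with clog status */
        Oid             oldestXactDb; /* database containing that XID */
    } xl_clog_truncate;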
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/commit_ts.c @@ -32,6 +32,7 @@ #include "funcapi.h" #include "miscadmin.h" #include "pg_trace.h" +#include "storage/shmem.h" #include "utils/builtins.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -112,7 +113,7 @@ static bool CommitTsPagePrecedes(int page1, int page2); static void ActivateCommitTs(void); static void DeactivateCommitTs(void); static void WriteZeroPageXlogRec(int pageno); -static void WriteTruncateXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid); static void WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids, TransactionId *subxids, TimestampTz timestamp, RepOriginId nodeid); @@ -288,11 +289,18 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, TransactionId oldestCommitTsXid; TransactionId newestCommitTsXid; - /* error if the given Xid doesn't normally commit */ - if (!TransactionIdIsNormal(xid)) + if (!TransactionIdIsValid(xid)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot retrieve commit timestamp for transaction %u", xid))); + else if (!TransactionIdIsNormal(xid)) + { + /* frozen and bootstrap xids are always committed far in the past */ + *ts = 0; + if (nodeid) + *nodeid = 0; + return false; + } LWLockAcquire(CommitTsLock, LW_SHARED); @@ -607,7 +615,7 @@ CommitTsParameterChange(bool newvalue, bool oldvalue) /* * Activate this module whenever necessary. - * This must happen during postmaster or standalong-backend startup, + * This must happen during postmaster or standalone-backend startup, * or during WAL replay anytime the track_commit_timestamp setting is * changed in the master. * @@ -738,6 +746,12 @@ ShutdownCommitTs(void) { /* Flush dirty CommitTs pages to disk */ SimpleLruFlush(CommitTsCtl, false); + + /* + * fsync pg_commit_ts to ensure that any files flushed previously are + * durably on disk. + */ + fsync_fname("pg_commit_ts", true); } /* @@ -748,6 +762,12 @@ CheckPointCommitTs(void) { /* Flush dirty CommitTs pages to disk */ SimpleLruFlush(CommitTsCtl, true); + + /* + * fsync pg_commit_ts to ensure that any files flushed previously are + * durably on disk. 
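With the TransactionIdGetCommitTsData() change above, permanent XIDs (frozen and bootstrap) no longer raise an error; they simply report that no timestamp is available, like any other unrecorded transaction. A caller-side sketch (variable names illustrative):

    TimestampTz ts;

    if (TransactionIdGetCommitTsData(xid, &ts, NULL))
        elog(LOG, "xid %u committed at %s", xid, timestamptz_to_str(ts));
    else
        elog(LOG, "no commit timestamp recorded for xid %u", xid);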
+ */ + fsync_fname("pg_commit_ts", true); } /* @@ -843,7 +863,7 @@ TruncateCommitTs(TransactionId oldestXact) return; /* nothing to remove */ /* Write XLOG record */ - WriteTruncateXlogRec(cutoffPage); + WriteTruncateXlogRec(cutoffPage, oldestXact); /* Now we can remove the old CommitTs segment(s) */ SimpleLruTruncate(CommitTsCtl, cutoffPage); @@ -870,6 +890,8 @@ SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) else { Assert(ShmemVariableCache->newestCommitTsXid == InvalidTransactionId); + ShmemVariableCache->oldestCommitTsXid = oldestXact; + ShmemVariableCache->newestCommitTsXid = newestXact; } LWLockRelease(CommitTsLock); } @@ -927,10 +949,15 @@ WriteZeroPageXlogRec(int pageno) * Write a TRUNCATE xlog record */ static void -WriteTruncateXlogRec(int pageno) +WriteTruncateXlogRec(int pageno, TransactionId oldestXid) { + xl_commit_ts_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXid = oldestXid; + XLogBeginInsert(); - XLogRegisterData((char *) (&pageno), sizeof(int)); + XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate); (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); } @@ -984,17 +1011,17 @@ commit_ts_redo(XLogReaderState *record) } else if (info == COMMIT_TS_TRUNCATE) { - int pageno; + xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) XLogRecGetData(record); - memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + AdvanceOldestCommitTsXid(trunc->oldestXid); /* * During XLOG replay, latest_page_number isn't set up yet; insert a * suitable value to bypass the sanity test in SimpleLruTruncate. */ - CommitTsCtl->shared->latest_page_number = pageno; + CommitTsCtl->shared->latest_page_number = trunc->pageno; - SimpleLruTruncate(CommitTsCtl, pageno); + SimpleLruTruncate(CommitTsCtl, trunc->pageno); } else if (info == COMMIT_TS_SETTS) { diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index 1926d98de0..fbc6810c2f 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -4,7 +4,7 @@ * Implementation of generic xlog records. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/generic_xlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/generic_xlog.h" #include "access/xlogutils.h" #include "miscadmin.h" @@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record) UnlockReleaseBuffer(buffers[block_id]); } } + +/* + * Mask a generic page before performing consistency checks on it. + */ +void +generic_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn(page); + + mask_unused_space(page); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index c2e4fa377d..1a7824b5d4 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3,7 +3,7 @@ * multixact.c * PostgreSQL multi-transaction-log manager * - * The pg_multixact manager is a pg_clog-like manager that stores an array of + * The pg_multixact manager is a pg_xact-like manager that stores an array of * MultiXactMember for each MultiXactId. It is a fundamental part of the * shared-row-lock implementation. Each MultiXactMember is comprised of a * TransactionId and a set of flag bits. 
The name is a bit historical: @@ -59,7 +59,7 @@ * counter does not fall within the wraparound horizon considering the global * minimum value. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/multixact.c @@ -363,7 +363,7 @@ static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance); -static bool SetOffsetVacuumLimit(void); +static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMZeroPageXlogRec(int pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, @@ -1570,10 +1570,8 @@ mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) /* The cache only lives as long as the current transaction */ debug_elog2(DEBUG2, "CachePut: initializing memory context"); MXactContext = AllocSetContextCreate(TopTransactionContext, - "MultiXact Cache Context", - ALLOCSET_SMALL_MINSIZE, - ALLOCSET_SMALL_INITSIZE, - ALLOCSET_SMALL_MAXSIZE); + "MultiXact cache context", + ALLOCSET_SMALL_SIZES); } entry = (mXactCacheEnt *) @@ -2097,7 +2095,7 @@ TrimMultiXact(void) LWLockRelease(MultiXactGenLock); /* Now compute how far away the next members wraparound is. */ - SetMultiXactIdLimit(oldestMXact, oldestMXactDB); + SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); } /* @@ -2188,9 +2186,13 @@ MultiXactSetNextMXact(MultiXactId nextMulti, * Determine the last safe MultiXactId to allocate given the currently oldest * datminmxid (ie, the oldest MultiXactId that might exist in any database * of our cluster), and the OID of the (or a) database with that value. + * + * is_startup is true when we are just starting the cluster, false when we + * are updating state in a running cluster. This only affects log messages. */ void -SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, + bool is_startup) { MultiXactId multiVacLimit; MultiXactId multiWarnLimit; @@ -2279,7 +2281,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) Assert(!InRecovery); /* Set limits for offset vacuum. */ - needs_offset_vacuum = SetOffsetVacuumLimit(); + needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); /* * If past the autovacuum force point, immediately signal an autovac @@ -2372,7 +2374,7 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) Assert(InRecovery); if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) - SetMultiXactIdLimit(oldestMulti, oldestMultiDB); + SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); } /* @@ -2539,7 +2541,7 @@ GetOldestMultiXactId(void) * otherwise. 
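The ALLOCSET_SMALL_SIZES rewrite in mXactCachePut() above recurs throughout this patch (the SP-GiST and parallel-worker contexts earlier use ALLOCSET_DEFAULT_SIZES the same way). The convenience macros are assumed to simply bundle the three historical size arguments, roughly:

    #define ALLOCSET_DEFAULT_SIZES \
        ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE
    #define ALLOCSET_SMALL_SIZES \
        ALLOCSET_SMALL_MINSIZE, ALLOCSET_SMALL_INITSIZE, ALLOCSET_SMALL_MAXSIZE

so AllocSetContextCreate(parent, name, ALLOCSET_SMALL_SIZES) expands to the same three-argument call sites being deleted here.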
*/ static bool -SetOffsetVacuumLimit(void) +SetOffsetVacuumLimit(bool is_startup) { MultiXactId oldestMultiXactId; MultiXactId nextMXact; @@ -2621,9 +2623,10 @@ SetOffsetVacuumLimit(void) /* always leave one segment before the wraparound point */ offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - if (!prevOldestOffsetKnown && IsUnderPostmaster) + if (!prevOldestOffsetKnown && !is_startup) ereport(LOG, (errmsg("MultiXact member wraparound protections are now enabled"))); + ereport(DEBUG1, (errmsg("MultiXact member stop limit is now %u based on MultiXact %u", offsetStopLimit, oldestMultiXactId))); @@ -2802,7 +2805,7 @@ ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) * more aggressive in clamping this value. That not only causes autovacuum * to ramp up, but also makes any manual vacuums the user issues more * aggressive. This happens because vacuum_set_xid_limits() clamps the - * freeze table and and the minimum freeze age based on the effective + * freeze table and the minimum freeze age based on the effective * autovacuum_multixact_freeze_max_age this function returns. In the worst * case, we'll claim the freeze_max_age to zero, and every vacuum of any * table will try to freeze every multixact. @@ -3314,7 +3317,7 @@ multixact_redo(XLogReaderState *record) * Advance the horizon values, so they're current at the end of * recovery. */ - SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB); + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index c36a92cdb9..d3585c8449 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -3,7 +3,7 @@ * parallel.c * Infrastructure for launching parallel workers * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -19,11 +19,13 @@ #include "access/xlog.h" #include "catalog/namespace.h" #include "commands/async.h" +#include "executor/execParallel.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "libpq/pqmq.h" #include "miscadmin.h" #include "optimizer/planmain.h" +#include "pgstat.h" #include "pgxc/pgxcnode.h" #include "storage/ipc.h" #include "storage/sinval.h" @@ -61,7 +63,7 @@ #define PARALLEL_KEY_TRANSACTION_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0006) #define PARALLEL_KEY_ACTIVE_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0007) #define PARALLEL_KEY_TRANSACTION_STATE UINT64CONST(0xFFFFFFFFFFFF0008) -#define PARALLEL_KEY_EXTENSION_TRAMPOLINE UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) /* Fixed-size parallel state. */ typedef struct FixedParallelState @@ -77,9 +79,6 @@ typedef struct FixedParallelState pid_t parallel_master_pid; BackendId parallel_master_backend_id; - /* Entrypoint for parallel workers. */ - parallel_worker_main_type entrypoint; - /* Mutex protects remaining fields. */ slock_t mutex; @@ -107,12 +106,26 @@ static FixedParallelState *MyFixedParallelState; /* List of active parallel contexts. */ static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list); +/* + * List of internal parallel worker entry points. We need this for + * reasons explained in LookupParallelWorkerFunction(), below. 
+ */ +static const struct +{ + const char *fn_name; + parallel_worker_main_type fn_addr; +} InternalParallelWorkers[] = + +{ + { + "ParallelQueryMain", ParallelQueryMain + } +}; + /* Private functions. */ static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg); -static void ParallelErrorContext(void *arg); -static void ParallelExtensionTrampoline(dsm_segment *seg, shm_toc *toc); -static void ParallelWorkerMain(Datum main_arg); static void WaitForParallelWorkersToExit(ParallelContext *pcxt); +static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname); /* @@ -121,7 +134,8 @@ static void WaitForParallelWorkersToExit(ParallelContext *pcxt); * destroyed before exiting the current subtransaction. */ ParallelContext * -CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers) +CreateParallelContext(const char *library_name, const char *function_name, + int nworkers) { MemoryContext oldcontext; ParallelContext *pcxt; @@ -154,7 +168,8 @@ CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers) pcxt = palloc0(sizeof(ParallelContext)); pcxt->subid = GetCurrentSubTransactionId(); pcxt->nworkers = nworkers; - pcxt->entrypoint = entrypoint; + pcxt->library_name = pstrdup(library_name); + pcxt->function_name = pstrdup(function_name); pcxt->error_context_stack = error_context_stack; shm_toc_initialize_estimator(&pcxt->estimator); dlist_push_head(&pcxt_list, &pcxt->node); @@ -166,33 +181,6 @@ CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers) } /* - * Establish a new parallel context that calls a function provided by an - * extension. This works around the fact that the library might get mapped - * at a different address in each backend. - */ -ParallelContext * -CreateParallelContextForExternalFunction(char *library_name, - char *function_name, - int nworkers) -{ - MemoryContext oldcontext; - ParallelContext *pcxt; - - /* We might be running in a very short-lived memory context. */ - oldcontext = MemoryContextSwitchTo(TopTransactionContext); - - /* Create the context. */ - pcxt = CreateParallelContext(ParallelExtensionTrampoline, nworkers); - pcxt->library_name = pstrdup(library_name); - pcxt->function_name = pstrdup(function_name); - - /* Restore previous memory context. */ - MemoryContextSwitchTo(oldcontext); - - return pcxt; -} - -/* * Establish the dynamic shared memory segment for a parallel context and * copy state and other bookkeeping information that will be needed by * parallel workers into it. @@ -251,15 +239,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); - /* Estimate how much we'll need for extension entrypoint info. */ - if (pcxt->library_name != NULL) - { - Assert(pcxt->entrypoint == ParallelExtensionTrampoline); - Assert(pcxt->function_name != NULL); - shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) - + strlen(pcxt->function_name) + 2); - shm_toc_estimate_keys(&pcxt->estimator, 1); - } + /* Estimate how much we'll need for the entrypoint info. 
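With the trampoline gone, every caller of CreateParallelContext() now names its entry point, as the README.parallel hunk earlier already shows; an extension passes its own shared library name instead of "postgres" (illustrative names below):

    EnterParallelMode();

    pcxt = CreateParallelContext("my_extension", "my_worker_main", nworkers);

    /* estimate space, set up the DSM segment, then launch as before */
    InitializeParallelDSM(pcxt);
    LaunchParallelWorkers(pcxt);
    WaitForParallelWorkersToFinish(pcxt);
    DestroyParallelContext(pcxt);

    ExitParallelMode();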
*/ + shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + + strlen(pcxt->function_name) + 2); + shm_toc_estimate_keys(&pcxt->estimator, 1); } /* @@ -299,7 +282,6 @@ InitializeParallelDSM(ParallelContext *pcxt) fps->parallel_master_pgproc = MyProc; fps->parallel_master_pid = MyProcPid; fps->parallel_master_backend_id = MyBackendId; - fps->entrypoint = pcxt->entrypoint; SpinLockInit(&fps->mutex); fps->last_xlog_end = 0; shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps); @@ -314,6 +296,8 @@ InitializeParallelDSM(ParallelContext *pcxt) char *asnapspace; char *tstatespace; char *error_queue_space; + char *entrypointstate; + Size lnamelen; /* Serialize shared libraries we have loaded. */ libraryspace = shm_toc_allocate(pcxt->toc, library_len); @@ -370,19 +354,19 @@ InitializeParallelDSM(ParallelContext *pcxt) } shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space); - /* Serialize extension entrypoint information. */ - if (pcxt->library_name != NULL) - { - Size lnamelen = strlen(pcxt->library_name); - char *extensionstate; - - extensionstate = shm_toc_allocate(pcxt->toc, lnamelen - + strlen(pcxt->function_name) + 2); - strcpy(extensionstate, pcxt->library_name); - strcpy(extensionstate + lnamelen + 1, pcxt->function_name); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXTENSION_TRAMPOLINE, - extensionstate); - } + /* + * Serialize entrypoint information. It's unsafe to pass function + * pointers across processes, as the function pointer may be different + * in each process in EXEC_BACKEND builds, so we always pass library + * and function name. (We use library name "postgres" for functions + * in the core backend.) + */ + lnamelen = strlen(pcxt->library_name); + entrypointstate = shm_toc_allocate(pcxt->toc, lnamelen + + strlen(pcxt->function_name) + 2); + strcpy(entrypointstate, pcxt->library_name); + strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); } /* Restore previous memory context. */ @@ -452,16 +436,18 @@ LaunchParallelWorkers(ParallelContext *pcxt) oldcontext = MemoryContextSwitchTo(TopTransactionContext); /* Configure a worker. */ + memset(&worker, 0, sizeof(worker)); snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d", MyProcPid); worker.bgw_flags = - BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION + | BGWORKER_CLASS_PARALLEL; worker.bgw_start_time = BgWorkerStart_ConsistentState; worker.bgw_restart_time = BGW_NEVER_RESTART; - worker.bgw_main = ParallelWorkerMain; + sprintf(worker.bgw_library_name, "postgres"); + sprintf(worker.bgw_function_name, "ParallelWorkerMain"); worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg)); worker.bgw_notify_pid = MyProcPid; - memset(&worker.bgw_extra, 0, BGW_EXTRALEN); /* * Start workers. @@ -542,7 +528,8 @@ WaitForParallelWorkersToFinish(ParallelContext *pcxt) if (!anyone_alive) break; - WaitLatch(&MyProc->procLatch, WL_LATCH_SET, -1); + WaitLatch(&MyProc->procLatch, WL_LATCH_SET, -1, + WAIT_EVENT_PARALLEL_FINISH); ResetLatch(&MyProc->procLatch); } @@ -670,6 +657,8 @@ DestroyParallelContext(ParallelContext *pcxt) } /* Free memory. 
*/ + pfree(pcxt->library_name); + pfree(pcxt->function_name); pfree(pcxt); } @@ -704,6 +693,9 @@ void HandleParallelMessages(void) { dlist_iter iter; + MemoryContext oldcontext; + + static MemoryContext hpm_context = NULL; /* * This is invoked from ProcessInterrupts(), and since some of the @@ -714,6 +706,21 @@ HandleParallelMessages(void) */ HOLD_INTERRUPTS(); + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (hpm_context == NULL) /* first time through? */ + hpm_context = AllocSetContextCreate(TopMemoryContext, + "HandleParallelMessages", + ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(hpm_context); + + oldcontext = MemoryContextSwitchTo(hpm_context); + + /* OK to process messages. Reset the flag saying there are more to do. */ ParallelMessagePending = false; dlist_foreach(iter, &pcxt_list) @@ -760,6 +767,11 @@ HandleParallelMessages(void) } } + MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(hpm_context); + RESUME_INTERRUPTS(); } @@ -789,30 +801,43 @@ HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg) case 'N': /* NoticeResponse */ { ErrorData edata; - ErrorContextCallback errctx; ErrorContextCallback *save_error_context_stack; - /* - * Rethrow the error using the error context callbacks that - * were in effect when the context was created, not the - * current ones. - */ - save_error_context_stack = error_context_stack; - errctx.callback = ParallelErrorContext; - errctx.arg = NULL; - errctx.previous = pcxt->error_context_stack; - error_context_stack = &errctx; - /* Parse ErrorResponse or NoticeResponse. */ pq_parse_errornotice(msg, &edata); /* Death of a worker isn't enough justification for suicide. */ edata.elevel = Min(edata.elevel, ERROR); - /* Rethrow error or notice. */ + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel worker. Otherwise, it + * can sometimes be confusing to understand what actually + * happened. (We don't do this in FORCE_PARALLEL_REGRESS mode + * because it causes test-result instability depending on + * whether a parallel worker is actually used or not.) + */ + if (force_parallel_mode != FORCE_PARALLEL_REGRESS) + { + if (edata.context) + edata.context = psprintf("%s\n%s", edata.context, + _("parallel worker")); + else + edata.context = pstrdup(_("parallel worker")); + } + + /* + * Context beyond that should use the error context callbacks + * that were in effect when the ParallelContext was created, + * not the current ones. + */ + save_error_context_stack = error_context_stack; + error_context_stack = pcxt->error_context_stack; + + /* Rethrow error or print notice. */ ThrowErrorData(&edata); - /* Restore previous context. */ + /* Not an error, so restore previous context stack. */ error_context_stack = save_error_context_stack; break; @@ -894,7 +919,7 @@ AtEOXact_Parallel(bool isCommit) /* * Main entrypoint for parallel workers. 
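The entrypoint travels to the workers as two NUL-terminated strings packed into one shm_toc chunk: the write side appears in InitializeParallelDSM() above, and ParallelWorkerMain() below unpacks it. The round trip in isolation:

    /* leader: pack "library\0function\0" into a single chunk */
    Size    lnamelen = strlen(library_name);
    char   *buf = shm_toc_allocate(toc, lnamelen + strlen(function_name) + 2);

    strcpy(buf, library_name);
    strcpy(buf + lnamelen + 1, function_name);
    shm_toc_insert(toc, PARALLEL_KEY_ENTRYPOINT, buf);

    /* worker: the second string starts just past the first one's NUL */
    char   *lib = shm_toc_lookup(toc, PARALLEL_KEY_ENTRYPOINT);
    char   *fn = lib + strlen(lib) + 1;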
*/ -static void +void ParallelWorkerMain(Datum main_arg) { dsm_segment *seg; @@ -904,6 +929,10 @@ ParallelWorkerMain(Datum main_arg) shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; + char *entrypointstate; + char *library_name; + char *function_name; + parallel_worker_main_type entrypt; char *gucspace; char *combocidspace; char *tsnapspace; @@ -926,10 +955,8 @@ ParallelWorkerMain(Datum main_arg) Assert(CurrentResourceOwner == NULL); CurrentResourceOwner = ResourceOwnerCreate(NULL, "parallel toplevel"); CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext, - "parallel worker", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + "Parallel worker", + ALLOCSET_DEFAULT_SIZES); /* * Now that we have a resource owner, we can attach to the dynamic shared @@ -1005,6 +1032,18 @@ ParallelWorkerMain(Datum main_arg) Assert(libraryspace != NULL); RestoreLibraryState(libraryspace); + /* + * Identify the entry point to be called. In theory this could result in + * loading an additional library, though most likely the entry point is in + * the core backend or in a library we just loaded. + */ + entrypointstate = shm_toc_lookup(toc, PARALLEL_KEY_ENTRYPOINT); + Assert(entrypointstate != NULL); + library_name = entrypointstate; + function_name = entrypointstate + strlen(library_name) + 1; + + entrypt = LookupParallelWorkerFunction(library_name, function_name); + /* Restore database connection. */ BackgroundWorkerInitializeConnectionByOid(fps->database_id, fps->authenticated_user_id); @@ -1072,11 +1111,8 @@ ParallelWorkerMain(Datum main_arg) /* * Time to do the real work: invoke the caller-supplied code. - * - * If you get a crash at this line, see the comments for - * ParallelExtensionTrampoline. */ - fps->entrypoint(seg, toc); + entrypt(seg, toc); /* Must exit parallel mode to pop active snapshot. */ ExitParallelMode(); @@ -1092,45 +1128,6 @@ ParallelWorkerMain(Datum main_arg) } /* - * It's unsafe for the entrypoint invoked by ParallelWorkerMain to be a - * function living in a dynamically loaded module, because the module might - * not be loaded in every process, or might be loaded but not at the same - * address. To work around that problem, CreateParallelContextForExtension() - * arranges to call this function rather than calling the extension-provided - * function directly; and this function then looks up the real entrypoint and - * calls it. - */ -static void -ParallelExtensionTrampoline(dsm_segment *seg, shm_toc *toc) -{ - char *extensionstate; - char *library_name; - char *function_name; - parallel_worker_main_type entrypt; - - extensionstate = shm_toc_lookup(toc, PARALLEL_KEY_EXTENSION_TRAMPOLINE); - Assert(extensionstate != NULL); - library_name = extensionstate; - function_name = extensionstate + strlen(library_name) + 1; - - entrypt = (parallel_worker_main_type) - load_external_function(library_name, function_name, true, NULL); - entrypt(seg, toc); -} - -/* - * Give the user a hint that this is a message propagated from a parallel - * worker. Otherwise, it can sometimes be confusing to understand what - * actually happened. - */ -static void -ParallelErrorContext(void *arg) -{ - if (force_parallel_mode != FORCE_PARALLEL_REGRESS) - errcontext("parallel worker"); -} - -/* * Update shared memory with the ending location of the last WAL record we * wrote, if it's greater than the value already stored there. 
*/ @@ -1145,3 +1142,47 @@ ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end) fps->last_xlog_end = last_xlog_end; SpinLockRelease(&fps->mutex); } + +/* + * Look up (and possibly load) a parallel worker entry point function. + * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalParallelWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalParallelWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static parallel_worker_main_type +LookupParallelWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalParallelWorkers array. + */ + if (strcmp(libraryname, "postgres") == 0) + { + int i; + + for (i = 0; i < lengthof(InternalParallelWorkers); i++) + { + if (strcmp(InternalParallelWorkers[i].fn_name, funcname) == 0) + return InternalParallelWorkers[i].fn_addr; + } + + /* We can only reach this by programming error. */ + elog(ERROR, "internal function \"%s\" not found", funcname); + } + + /* Otherwise load from external library. */ + return (parallel_worker_main_type) + load_external_function(libraryname, funcname, true, NULL); +} diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample index 5624eeca84..acb81afd66 100644 --- a/src/backend/access/transam/recovery.conf.sample +++ b/src/backend/access/transam/recovery.conf.sample @@ -67,10 +67,10 @@ # must set a recovery target. # # You may set a recovery target either by transactionId, by name, -# or by timestamp or by barrier. Recovery may either include or exclude the -# transaction(s) with the recovery target value (ie, stop either -# just after or just before the given target, respectively). In case of -# barrier, the recovery stops exactly at that point. +# or by timestamp or by WAL location (LSN) or by barrier. Recovery may either +# include or exclude the transaction(s) with the recovery target value (ie, +# stop either just after or just before the given target, respectively). In +# case of barrier, the recovery stops exactly at that point. # # #recovery_target_name = '' # e.g. 'daily backup 2011-01-26' @@ -81,6 +81,8 @@ # #recovery_target_barrier = '' # +#recovery_target_lsn = '' # e.g. '0/70006B8' +# #recovery_target_inclusive = true # # @@ -99,7 +101,7 @@ # # If recovery_target_action = 'pause', recovery will pause when the # recovery target is reached. The pause state will continue until -# pg_xlog_replay_resume() is called. This setting has no effect if +# pg_wal_replay_resume() is called. This setting has no effect if # no recovery target is set. If hot_standby is not enabled then the # server will shutdown instead, though you may request this in # any case by specifying 'shutdown'. 
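Tying the new recovery_target_lsn setting to the surrounding options, a standby meant to stop at a known WAL location and wait for an operator might use (values illustrative):

    recovery_target_lsn = '0/70006B8'   # stop once this location is replayed
    recovery_target_inclusive = true    # include the transaction(s) at the target
    recovery_target_action = 'pause'    # resume later with pg_wal_replay_resume()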
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index bc875677b0..db9899bb6f 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -9,15 +9,15 @@ #include "access/clog.h" #include "access/commit_ts.h" -#include "access/gin.h" -#include "access/gist_private.h" +#include "access/ginxlog.h" +#include "access/gistxlog.h" #include "access/generic_xlog.h" -#include "access/hash.h" +#include "access/hash_xlog.h" #include "access/heapam_xlog.h" #include "access/brin_xlog.h" #include "access/multixact.h" -#include "access/nbtree.h" -#include "access/spgist.h" +#include "access/nbtxlog.h" +#include "access/spgxlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" @@ -33,8 +33,8 @@ #include "utils/relmapper.h" /* must be kept in sync with RmgrData definition in xlog_internal.h */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ - { name, redo, desc, identify, startup, cleanup }, +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ + { name, redo, desc, identify, startup, cleanup, mask }, const RmgrData RmgrTable[RM_MAX_ID + 1] = { #include "access/rmgrlist.h" diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 37fdd5a5e1..ed90fb9232 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -38,7 +38,7 @@ * by re-setting the page's page_dirty flag. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/slru.c @@ -54,6 +54,7 @@ #include "access/slru.h" #include "access/transam.h" #include "access/xlog.h" +#include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" #include "miscadmin.h" @@ -216,9 +217,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, Assert(strlen(name) + 1 < SLRU_MAX_NAME_LENGTH); strlcpy(shared->lwlock_tranche_name, name, SLRU_MAX_NAME_LENGTH); shared->lwlock_tranche_id = tranche_id; - shared->lwlock_tranche.name = shared->lwlock_tranche_name; - shared->lwlock_tranche.array_base = shared->buffer_locks; - shared->lwlock_tranche.array_stride = sizeof(LWLockPadded); ptr += BUFFERALIGN(offset); for (slotno = 0; slotno < nslots; slotno++) @@ -237,7 +235,8 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, Assert(found); /* Register SLRU tranche in the main tranches array */ - LWLockRegisterTranche(shared->lwlock_tranche_id, &shared->lwlock_tranche); + LWLockRegisterTranche(shared->lwlock_tranche_id, + shared->lwlock_tranche_name); /* * Initialize the unshared control struct, including directory path. 
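The SimpleLruInit() hunk above reflects the simplified tranche API: registration now takes just an ID and a display name rather than a filled-in LWLockTranche struct. For an extension-defined tranche the sequence would look like (illustrative names):

    int     my_tranche_id = LWLockNewTrancheId();

    LWLockRegisterTranche(my_tranche_id, "my_extension_locks");
    LWLockInitialize(&my_shared_state->lock, my_tranche_id);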
We @@ -691,13 +690,16 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) } errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { + pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } + pgstat_report_wait_end(); if (CloseTransientFile(fd)) { @@ -850,8 +852,10 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { + pgstat_report_wait_end(); /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; @@ -861,6 +865,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) CloseTransientFile(fd); return false; } + pgstat_report_wait_end(); /* * If not part of Flush, need to fsync now. We assume this happens @@ -868,13 +873,16 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) */ if (!fdata) { + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); if (ctl->do_fsync && pg_fsync(fd)) { + pgstat_report_wait_end(); slru_errcause = SLRU_FSYNC_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } + pgstat_report_wait_end(); if (CloseTransientFile(fd)) { @@ -1142,6 +1150,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) ok = true; for (i = 0; i < fdata.num_files; i++) { + pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); if (ctl->do_fsync && pg_fsync(fdata.fd[i])) { slru_errcause = SLRU_FSYNC_FAILED; @@ -1149,6 +1158,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; ok = false; } + pgstat_report_wait_end(); if (CloseTransientFile(fdata.fd[i])) { diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 76069546cb..a0390bf25b 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -3,15 +3,14 @@ * subtrans.c * PostgreSQL subtransaction-log manager * - * The pg_subtrans manager is a pg_clog-like manager that stores the parent + * The pg_subtrans manager is a pg_xact-like manager that stores the parent * transaction Id for each transaction. It is a fundamental part of the * nested transactions implementation. A main transaction has a parent * of InvalidTransactionId, and each subtransaction has its immediate parent. * The tree can easily be walked from child to parent, but not in the * opposite direction. * - * This code is based on clog.c, but the robustness requirements - * are completely different from pg_clog, because we only need to remember + * are completely different from pg_xact, because we only need to remember * pg_subtrans information for currently-open transactions. Thus, there is * no need to preserve data over a crash and restart. * @@ -19,7 +18,7 @@ * data across crashes. During database startup, we simply force the * currently-active page of SUBTRANS to zeroes. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -69,11 +68,9 @@ static bool SubTransPagePrecedes(int page1, int page2); /* * Record the parent of a subtransaction in the subtrans log. - * - * In some cases we may need to overwrite an existing value. 
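The pgstat_report_wait_start()/pgstat_report_wait_end() pairs added around the reads, writes, and fsyncs above are the general recipe for surfacing blocking I/O as wait events: report immediately before the system call and clear immediately after, on every exit path, including errors. In miniature:

    errno = 0;
    pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
    if (read(fd, buffer, BLCKSZ) != BLCKSZ)
    {
        pgstat_report_wait_end();   /* clear the event on the error path too */
        /* ... record errno, close fd, report the failure ... */
        return false;
    }
    pgstat_report_wait_end();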
*/ void -SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK) +SubTransSetParent(TransactionId xid, TransactionId parent) { int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); @@ -81,6 +78,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK) TransactionId *ptr; Assert(TransactionIdIsValid(parent)); + Assert(TransactionIdFollows(xid, parent)); LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); @@ -88,13 +86,17 @@ SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK) ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; ptr += entryno; - /* Current state should be 0 */ - Assert(*ptr == InvalidTransactionId || - (*ptr == parent && overwriteOK)); - - *ptr = parent; - - SubTransCtl->shared->page_dirty[slotno] = true; + /* + * It's possible we'll try to set the parent xid multiple times but we + * shouldn't ever be changing the xid from one valid xid to another valid + * xid, which would corrupt the data structure. + */ + if (*ptr != parent) + { + Assert(*ptr == InvalidTransactionId); + *ptr = parent; + SubTransCtl->shared->page_dirty[slotno] = true; + } LWLockRelease(SubtransControlLock); } @@ -158,6 +160,15 @@ SubTransGetTopmostTransaction(TransactionId xid) if (TransactionIdPrecedes(parentXid, TransactionXmin)) break; parentXid = SubTransGetParent(parentXid); + + /* + * By convention the parent xid gets allocated first, so should always + * precede the child xid. Anything else points to a corrupted data + * structure that could lead to an infinite loop, so exit. + */ + if (!TransactionIdPrecedes(parentXid, previousXid)) + elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u", + previousXid, parentXid); } Assert(TransactionIdIsValid(previousXid)); diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index bd91573708..8cab8b9aa9 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -15,13 +15,13 @@ * <parentTLI> <switchpoint> <reason> * * parentTLI ID of the parent timeline - * switchpoint XLogRecPtr of the WAL position where the switch happened + * switchpoint XLogRecPtr of the WAL location where the switch happened * reason human-readable explanation of why the timeline was changed * * The fields are separated by tabs. Lines beginning with # are comments, and * are ignored. Empty lines are also ignored. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/timeline.c @@ -32,18 +32,18 @@ #include "postgres.h" #include <sys/stat.h> -#include <stdio.h> #include <unistd.h> #include "access/timeline.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogdefs.h" +#include "pgstat.h" #include "storage/fd.h" /* * Copies all timeline history files with id's between 'begin' and 'end' - * from archive to pg_xlog. + * from archive to pg_wal. 
*/ void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end) @@ -151,7 +151,7 @@ readTimeLineHistory(TimeLineID targetTLI) if (nfields != 3) ereport(FATAL, (errmsg("syntax error in history file: %s", fline), - errhint("Expected a transaction log switchpoint location."))); + errhint("Expected a write-ahead log switchpoint location."))); if (result && tli <= lasttli) ereport(FATAL, @@ -191,7 +191,7 @@ readTimeLineHistory(TimeLineID targetTLI) result = lcons(entry, result); /* - * If the history file was fetched from archive, save it in pg_xlog for + * If the history file was fetched from archive, save it in pg_wal for * future reference. */ if (fromArchive) @@ -278,7 +278,7 @@ findNewestTimeLine(TimeLineID startTLI) * * newTLI: ID of the new timeline * parentTLI: ID of its immediate parent - * switchpoint: XLOG position where the system switched to the new timeline + * switchpoint: WAL location where the system switched to the new timeline * reason: human-readable explanation of why the timeline was switched * * Currently this is only used at the end recovery, and so there are no locking @@ -339,7 +339,9 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, for (;;) { errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); nbytes = (int) read(srcfd, buffer, sizeof(buffer)); + pgstat_report_wait_end(); if (nbytes < 0 || errno != 0) ereport(ERROR, (errcode_for_file_access(), @@ -347,6 +349,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, if (nbytes == 0) break; errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); if ((int) write(fd, buffer, nbytes) != nbytes) { int save_errno = errno; @@ -366,6 +369,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); } + pgstat_report_wait_end(); } CloseTransientFile(srcfd); } @@ -401,10 +405,12 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, errmsg("could not write to file \"%s\": %m", tmppath))); } + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC); if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); if (CloseTransientFile(fd)) ereport(ERROR, @@ -461,6 +467,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) errmsg("could not create file \"%s\": %m", tmppath))); errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE); if ((int) write(fd, content, size) != size) { int save_errno = errno; @@ -476,11 +483,14 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); } + pgstat_report_wait_end(); + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC); if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); if (CloseTransientFile(fd)) ereport(ERROR, diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index d1a549b2b2..b160973270 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -1,9 +1,9 @@ /*------------------------------------------------------------------------- * * transam.c - * postgres transaction log interface routines + * postgres transaction (commit) log interface routines * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development 
Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -169,7 +169,7 @@ pgxc_is_inprogress(PG_FUNCTION_ARGS) * True iff transaction associated with the identifier did commit. * * Note: - * Assumes transaction identifier is valid. + * Assumes transaction identifier is valid and exists in clog. */ bool /* true if given transaction committed */ TransactionIdDidCommit(TransactionId transactionId) @@ -225,7 +225,7 @@ TransactionIdDidCommit(TransactionId transactionId) * True iff transaction associated with the identifier did abort. * * Note: - * Assumes transaction identifier is valid. + * Assumes transaction identifier is valid and exists in clog. */ bool /* true if given transaction aborted */ TransactionIdDidAbort(TransactionId transactionId) @@ -274,7 +274,7 @@ TransactionIdDidAbort(TransactionId transactionId) * True iff transaction associated with the identifier is currently * known to have either committed or aborted. * - * This does NOT look into pg_clog but merely probes our local cache + * This does NOT look into pg_xact but merely probes our local cache * (and so it's not named TransactionIdDidComplete, which would be the * appropriate name for a function that worked that way). The intended * use is just to short-circuit TransactionIdIsInProgress calls when doing diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index b65227922b..f6986d37db 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3,7 +3,7 @@ * twophase.c * Two-phase commit support functions. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -46,8 +46,27 @@ * fsynced * * If COMMIT happens after checkpoint then backend reads state data from * files - * * In case of crash replay will move data from xlog to files, if that - * hasn't happened before. XXX TODO - move to shmem in replay also + * + * During replay and replication, TwoPhaseState also holds information + * about active prepared transactions that haven't been moved to disk yet. + * + * Replay of twophase records happens by the following rules: + * + * * At the beginning of recovery, pg_twophase is scanned once, filling + * TwoPhaseState with entries marked with gxact->inredo and + * gxact->ondisk. Two-phase file data older than the XID horizon of + * the redo position are discarded. + * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts. + * gxact->inredo is set to true for such entries. + * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries + * that have gxact->inredo set and are behind the redo_horizon. We + * save them to disk and then switch gxact->ondisk to true. + * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts. + * If gxact->ondisk is true, the corresponding entry from the disk + * is additionally deleted. + * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions() + * and PrescanPreparedTransactions() have been modified to go through + * gxact->inredo entries that have not made it to disk. 
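
The replay rules listed above boil down to a small state machine over the two new gxact flags. A compact illustration of the transitions (illustrative only, not PostgreSQL source):

#include <stdbool.h>

typedef struct
{
    bool inredo;    /* entry was created during WAL replay */
    bool ondisk;    /* two-phase state has been written to pg_twophase */
} RedoGxact;

/* Initial pg_twophase scan at the start of recovery: data is on disk. */
static void startup_scan(RedoGxact *g)  { g->inredo = true; g->ondisk = true; }

/* PREPARE redo: the state data still lives only in WAL. */
static void prepare_redo(RedoGxact *g)  { g->inredo = true; g->ondisk = false; }

/* Checkpoint: entries behind the redo horizon are flushed to disk. */
static void checkpoint(RedoGxact *g)
{
    if (g->inredo && !g->ondisk)
        g->ondisk = true;               /* via RecreateTwoPhaseFile() */
}

/* COMMIT/ABORT redo drops the entry; only an on-disk one has a file to unlink. */
static bool file_needs_unlink(const RedoGxact *g)   { return g->ondisk; }
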
* *------------------------------------------------------------------------- */ @@ -55,7 +74,6 @@ #include <fcntl.h> #include <sys/stat.h> -#include <sys/types.h> #include <time.h> #include <unistd.h> @@ -157,11 +175,13 @@ typedef struct GlobalTransactionData */ XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ + TransactionId xid; /* The GXACT id */ Oid owner; /* ID of user that executed the xact */ BackendId locking_backend; /* backend currently working on the xact */ bool valid; /* TRUE if PGPROC entry is in proc array */ bool ondisk; /* TRUE if prepare state file is on disk */ + bool inredo; /* TRUE if entry was added via xlog_redo */ char gid[GIDSIZE]; /* The GID assigned to the prepared xact */ } GlobalTransactionData; @@ -208,6 +228,14 @@ static void ProcessRecords(char *bufptr, TransactionId xid, static void RemoveGXact(GlobalTransaction gxact); static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); +static char *ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, bool setParent, bool setNextXid); +static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid); +static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); +static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); /* * Initialization of shared memory @@ -352,18 +380,12 @@ PostPrepare_Twophase(void) /* * MarkAsPreparing * Reserve the GID for the given transaction. - * - * Internally, this creates a gxact struct and puts it into the active array. - * NOTE: this is also used when reloading a gxact after a crash; so avoid - * assuming that we can use very much backend context. */ GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { GlobalTransaction gxact; - PGPROC *proc; - PGXACT *pgxact; int i; if (strlen(gid) >= GIDSIZE) @@ -411,6 +433,37 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = gxact->next; + MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + + gxact->ondisk = false; + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; +} + +/* + * MarkAsPreparingGuts + * + * This uses a gxact struct and puts it into the active array. + * NOTE: this is also used when reloading a gxact after a crash; so avoid + * assuming that we can use very much backend context. + * + * Note: This function should be called with appropriate locks held. 
+ */ +static void +MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + PGPROC *proc; + PGXACT *pgxact; + int i; + + Assert(gxact != NULL); proc = &ProcGlobal->allProcs[gxact->pgprocno]; pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; @@ -429,6 +482,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, proc->backendId = InvalidBackendId; proc->databaseId = databaseid; proc->roleId = owner; + proc->isBackgroundWorker = false; proc->lwWaiting = false; proc->lwWaitMode = 0; proc->waitLock = NULL; @@ -440,28 +494,18 @@ MarkAsPreparing(TransactionId xid, const char *gid, pgxact->nxids = 0; gxact->prepared_at = prepared_at; - /* initialize LSN to InvalidXLogRecPtr */ - gxact->prepare_start_lsn = InvalidXLogRecPtr; - gxact->prepare_end_lsn = InvalidXLogRecPtr; + gxact->xid = xid; gxact->owner = owner; gxact->locking_backend = MyBackendId; gxact->valid = false; - gxact->ondisk = false; + gxact->inredo = false; strcpy(gxact->gid, gid); - /* And insert it into the active array */ - Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); - TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; - /* * Remember that we have this GlobalTransaction entry locked for us. If we * abort after this, we must release it. */ MyLockedGxact = gxact; - - LWLockRelease(TwoPhaseStateLock); - - return gxact; } /* @@ -1220,8 +1264,10 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) */ buf = (char *) palloc(stat.st_size); + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ); if (read(fd, buf, stat.st_size) != stat.st_size) { + pgstat_report_wait_end(); CloseTransientFile(fd); if (give_warnings) ereport(WARNING, @@ -1232,6 +1278,7 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) return NULL; } + pgstat_report_wait_end(); CloseTransientFile(fd); hdr = (TwoPhaseFileHeader *) buf; @@ -1261,9 +1308,9 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) * Reads 2PC data from xlog. During checkpoint this data will be moved to * twophase files and ReadTwoPhaseFile should be used instead. * - * Note clearly that this function accesses WAL during normal operation, similarly - * to the way WALSender or Logical Decoding would do. It does not run during - * crash recovery or standby processing. + * Note clearly that this function can access WAL during normal operation, + * similarly to the way WALSender or Logical Decoding would do. 
+ * */ static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) @@ -1272,20 +1319,18 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) XLogReaderState *xlogreader; char *errormsg; - Assert(!RecoveryInProgress()); - xlogreader = XLogReaderAllocate(&read_local_xlog_page, NULL); if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"), - errdetail("Failed while allocating an XLog reading processor."))); + errdetail("Failed while allocating a WAL reading processor."))); record = XLogReadRecord(xlogreader, lsn, &errormsg); if (record == NULL) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from xlog at %X/%X", + errmsg("could not read two-phase state from WAL at %X/%X", (uint32) (lsn >> 32), (uint32) lsn))); @@ -1293,9 +1338,9 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) ereport(ERROR, (errcode_for_file_access(), - errmsg("expected two-phase state data is not present in xlog at %X/%X", - (uint32) (lsn >> 32), - (uint32) lsn))); + errmsg("expected two-phase state data is not present in WAL at %X/%X", + (uint32) (lsn >> 32), + (uint32) lsn))); if (len != NULL) *len = XLogRecGetDataLen(xlogreader); @@ -1413,7 +1458,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) /* * The order of operations here is critical: make the XLOG entry for * commit or abort, then mark the transaction committed or aborted in - * pg_clog, then remove its PGPROC from the global ProcArray (which means + * pg_xact, then remove its PGPROC from the global ProcArray (which means * TransactionIdIsInProgress will stop saying the prepared xact is in * progress), then run the post-commit or post-abort callbacks. The * callbacks will release the locks the transaction held. @@ -1532,7 +1577,7 @@ ProcessRecords(char *bufptr, TransactionId xid, * If giveWarning is false, do not complain about file-not-present; * this is an expected case during WAL replay. */ -void +static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) { char path[MAXPGPATH]; @@ -1552,7 +1597,7 @@ RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) * * Note: content and len don't include CRC. */ -void +static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len) { char path[MAXPGPATH]; @@ -1576,8 +1621,10 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) path))); /* Write content and CRC */ + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE); if (write(fd, content, len) != len) { + pgstat_report_wait_end(); CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), @@ -1585,16 +1632,19 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) } if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c)) { + pgstat_report_wait_end(); CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not write two-phase state file: %m"))); } + pgstat_report_wait_end(); /* * We must fsync the file because the end-of-replay checkpoint will not do * so, there being no GXACT in shared memory yet to tell it to. 
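
Every blocking file operation touched by this patch is bracketed the same way: pgstat_report_wait_start() before the syscall, pgstat_report_wait_end() after it, with the end call repeated on error paths so a backend is never left advertising a stale wait event. A hypothetical standalone helper (not part of the patch) showing one way to keep the pair balanced by construction:

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Stand-ins for the backend's wait-event reporting hooks. */
static void pgstat_report_wait_start(uint32_t wait_event_info) { (void) wait_event_info; }
static void pgstat_report_wait_end(void) { }

static ssize_t
read_instrumented(uint32_t wait_event_info, int fd, void *buf, size_t len)
{
    ssize_t     nread;

    pgstat_report_wait_start(wait_event_info);
    nread = read(fd, buf, len);
    pgstat_report_wait_end();       /* cleared even when nread < 0 */
    return nread;
}
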
*/ + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC); if (pg_fsync(fd) != 0) { CloseTransientFile(fd); @@ -1602,6 +1652,7 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) (errcode_for_file_access(), errmsg("could not fsync two-phase state file: %m"))); } + pgstat_report_wait_end(); if (CloseTransientFile(fd) != 0) ereport(ERROR, @@ -1612,9 +1663,11 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) /* * CheckPointTwoPhase -- handle 2PC component of checkpointing. * - * We must fsync the state file of any GXACT that is valid and has a PREPARE - * LSN <= the checkpoint's redo horizon. (If the gxact isn't valid yet or - * has a later LSN, this checkpoint is not responsible for fsyncing it.) + * We must fsync the state file of any GXACT that is valid or has been + * generated during redo and has a PREPARE LSN <= the checkpoint's redo + * horizon. (If the gxact isn't valid yet, has not been generated in + * redo, or has a later LSN, this checkpoint is not responsible for + * fsyncing it.) * * This is deliberately run as late as possible in the checkpoint sequence, * because GXACTs ordinarily have short lifespans, and so it is quite @@ -1646,7 +1699,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) * * It's also possible to move I/O out of the lock, but on every error we * should check whether somebody committed our transaction in different - * backend. Let's leave this optimisation for future, if somebody will + * backend. Let's leave this optimization for future, if somebody will * spot that this place cause bottleneck. * * Note that it isn't possible for there to be a GXACT with a @@ -1656,10 +1709,13 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) LWLockAcquire(TwoPhaseStateLock, LW_SHARED); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { + /* + * Note that we are using gxact, not pgxact, so this works in recovery + * also. + */ GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - if (gxact->valid && + if ((gxact->valid || gxact->inredo) && !gxact->ondisk && gxact->prepare_end_lsn <= redo_horizon) { @@ -1667,20 +1723,30 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) int len; XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); - RecreateTwoPhaseFile(pgxact->xid, buf, len); + RecreateTwoPhaseFile(gxact->xid, buf, len); gxact->ondisk = true; + gxact->prepare_start_lsn = InvalidXLogRecPtr; + gxact->prepare_end_lsn = InvalidXLogRecPtr; pfree(buf); serialized_xacts++; } } LWLockRelease(TwoPhaseStateLock); + /* + * Unconditionally flush the parent directory to make any information + * durable on disk. Two-phase files could have been removed and those + * removals need to be made persistent, as well as any files newly created + * since the last checkpoint. + */ + fsync_fname(TWOPHASE_DIR, true); + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE(); if (log_checkpoints && serialized_xacts > 0) ereport(LOG, (errmsg_plural("%u two-phase state file was written " - "for long-running prepared transactions", + "for a long-running prepared transaction", "%u two-phase state files were written " "for long-running prepared transactions", serialized_xacts, @@ -1688,12 +1754,48 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) } /* + * restoreTwoPhaseData + * + * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data. + * This is called once at the beginning of recovery, saving any extra + * lookups in the future. 
Two-phase files that are newer than the + * minimum XID horizon are discarded on the way. + */ +void +restoreTwoPhaseData(void) +{ + DIR *cldir; + struct dirent *clde; + + cldir = AllocateDir(TWOPHASE_DIR); + while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + { + if (strlen(clde->d_name) == 8 && + strspn(clde->d_name, "0123456789ABCDEF") == 8) + { + TransactionId xid; + char *buf; + + xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + + buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + true, false, false); + if (buf == NULL) + continue; + + PrepareRedoAdd(buf, InvalidXLogRecPtr, InvalidXLogRecPtr); + } + } + FreeDir(cldir); +} + +/* * PrescanPreparedTransactions * - * Scan the pg_twophase directory and determine the range of valid XIDs - * present. This is run during database startup, after we have completed - * reading WAL. ShmemVariableCache->nextXid has been set to one more than - * the highest XID for which evidence exists in WAL. + * Scan the shared memory entries of TwoPhaseState and determine the range + * of valid XIDs present. This is run during database startup, after we + * have completed reading WAL. ShmemVariableCache->nextXid has been set to + * one more than the highest XID for which evidence exists in WAL. * * We throw away any prepared xacts with main XID beyond nextXid --- if any * are present, it suggests that the DBA has done a PITR recovery to an @@ -1719,119 +1821,57 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) { TransactionId origNextXid = ShmemVariableCache->nextXid; TransactionId result = origNextXid; - DIR *cldir; - struct dirent *clde; TransactionId *xids = NULL; int nxids = 0; int allocsize = 0; + int i; - cldir = AllocateDir(TWOPHASE_DIR); - while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - if (strlen(clde->d_name) == 8 && - strspn(clde->d_name, "0123456789ABCDEF") == 8) - { - TransactionId xid; - char *buf; - TwoPhaseFileHeader *hdr; - TransactionId *subxids; - int i; + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + Assert(gxact->inredo); - /* Reject XID if too new */ - if (TransactionIdFollowsOrEquals(xid, origNextXid)) - { - ereport(WARNING, - (errmsg("removing future two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - continue; - } + xid = gxact->xid; - /* - * Note: we can't check if already processed because clog - * subsystem isn't up yet. - */ + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, true); - /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, true); - if (buf == NULL) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - continue; - } - - /* Deconstruct header */ - hdr = (TwoPhaseFileHeader *) buf; - if (!TransactionIdEquals(hdr->xid, xid)) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - pfree(buf); - continue; - } + if (buf == NULL) + continue; - /* - * OK, we think this file is valid. Incorporate xid into the - * running-minimum result. - */ - if (TransactionIdPrecedes(xid, result)) - result = xid; + /* + * OK, we think this file is valid. Incorporate xid into the + * running-minimum result. 
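
restoreTwoPhaseData() above recognizes state files purely by name: exactly eight uppercase hex digits, which strtoul() with base 16 turns back into the XID. A self-contained sketch of that filter (assuming the 8-character naming used for 32-bit XIDs in this tree):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint32_t TransactionId;

/* Same test as above: the name must be exactly 8 uppercase hex digits. */
static bool
parse_twophase_filename(const char *name, TransactionId *xid)
{
    if (strlen(name) != 8 || strspn(name, "0123456789ABCDEF") != 8)
        return false;
    *xid = (TransactionId) strtoul(name, NULL, 16);
    return true;
}

int
main(void)
{
    TransactionId xid;

    if (parse_twophase_filename("0000162E", &xid))
        printf("xid = %u\n", xid);      /* prints 5678 */
    return 0;
}
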
+ */ + if (TransactionIdPrecedes(xid, result)) + result = xid; - /* - * Examine subtransaction XIDs ... they should all follow main - * XID, and they may force us to advance nextXid. - * - * We don't expect anyone else to modify nextXid, hence we don't - * need to hold a lock while examining it. We still acquire the - * lock to modify it, though. - */ - subxids = (TransactionId *) - (buf + MAXALIGN(sizeof(TwoPhaseFileHeader))); - for (i = 0; i < hdr->nsubxacts; i++) + if (xids_p) + { + if (nxids == allocsize) { - TransactionId subxid = subxids[i]; - - Assert(TransactionIdFollows(subxid, xid)); - if (TransactionIdFollowsOrEquals(subxid, - ShmemVariableCache->nextXid)) + if (nxids == 0) { - LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - ShmemVariableCache->nextXid = subxid; - TransactionIdAdvance(ShmemVariableCache->nextXid); - LWLockRelease(XidGenLock); + allocsize = 10; + xids = palloc(allocsize * sizeof(TransactionId)); } - } - - - if (xids_p) - { - if (nxids == allocsize) + else { - if (nxids == 0) - { - allocsize = 10; - xids = palloc(allocsize * sizeof(TransactionId)); - } - else - { - allocsize = allocsize * 2; - xids = repalloc(xids, allocsize * sizeof(TransactionId)); - } + allocsize = allocsize * 2; + xids = repalloc(xids, allocsize * sizeof(TransactionId)); } - xids[nxids++] = xid; } - - pfree(buf); + xids[nxids++] = xid; } + + pfree(buf); } - FreeDir(cldir); + LWLockRelease(TwoPhaseStateLock); if (xids_p) { @@ -1845,211 +1885,294 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) /* * StandbyRecoverPreparedTransactions * - * Scan the pg_twophase directory and setup all the required information to - * allow standby queries to treat prepared transactions as still active. + * Scan the shared memory entries of TwoPhaseState and set up all the required + * information to allow standby queries to treat prepared transactions as still + * active. + * * This is never called at the end of recovery - we use * RecoverPreparedTransactions() at that point. * - * Currently we simply call SubTransSetParent() for any subxids of prepared - * transactions. If overwriteOK is true, it's OK if some XIDs have already - * been marked in pg_subtrans. + * The lack of SubTransSetParent() calls here is by design; + * those calls are made by RecoverPreparedTransactions() at the end of recovery + * for those xacts that need this. */ void -StandbyRecoverPreparedTransactions(bool overwriteOK) +StandbyRecoverPreparedTransactions(void) { - DIR *cldir; - struct dirent *clde; + int i; - cldir = AllocateDir(TWOPHASE_DIR); - while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - if (strlen(clde->d_name) == 8 && - strspn(clde->d_name, "0123456789ABCDEF") == 8) - { - TransactionId xid; - char *buf; - TwoPhaseFileHeader *hdr; - TransactionId *subxids; - int i; - - xid = (TransactionId) strtoul(clde->d_name, NULL, 16); - - /* Already processed? 
*/ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) - { - ereport(WARNING, - (errmsg("removing stale two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - continue; - } + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, true); - if (buf == NULL) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - continue; - } + Assert(gxact->inredo); - /* Deconstruct header */ - hdr = (TwoPhaseFileHeader *) buf; - if (!TransactionIdEquals(hdr->xid, xid)) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - pfree(buf); - continue; - } + xid = gxact->xid; - /* - * Examine subtransaction XIDs ... they should all follow main - * XID. - */ - subxids = (TransactionId *) - (buf + MAXALIGN(sizeof(TwoPhaseFileHeader))); - for (i = 0; i < hdr->nsubxacts; i++) - { - TransactionId subxid = subxids[i]; - - Assert(TransactionIdFollows(subxid, xid)); - SubTransSetParent(xid, subxid, overwriteOK); - } - } + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, false); + if (buf != NULL) + pfree(buf); } - FreeDir(cldir); + LWLockRelease(TwoPhaseStateLock); } /* * RecoverPreparedTransactions * - * Scan the pg_twophase directory and reload shared-memory state for each - * prepared transaction (reacquire locks, etc). This is run during database - * startup. + * Scan the shared memory entries of TwoPhaseState and reload the state for + * each prepared transaction (reacquire locks, etc). + * + * This is run during database startup. + * + * At the end of recovery the way we take snapshots will change. We now need + * to mark all running transactions with their full SubTransSetParent() info + * to allow normal snapshots to work correctly if snapshots overflow. + * We do this here because by definition prepared transactions are the only + * type of write transaction still running, so this is necessary and + * complete. */ void RecoverPreparedTransactions(void) { - char dir[MAXPGPATH]; - DIR *cldir; - struct dirent *clde; - bool overwriteOK = false; - - snprintf(dir, MAXPGPATH, "%s", TWOPHASE_DIR); + int i; - cldir = AllocateDir(dir); - while ((clde = ReadDir(cldir, dir)) != NULL) + /* + * Don't need a lock in the recovery phase. + */ + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - if (strlen(clde->d_name) == 8 && - strspn(clde->d_name, "0123456789ABCDEF") == 8) - { - TransactionId xid; - char *buf; - char *bufptr; - TwoPhaseFileHeader *hdr; - TransactionId *subxids; - GlobalTransaction gxact; - const char *gid; - int i; + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId *subxids; + const char *gid; - xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + xid = gxact->xid; - /* Already processed? */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) - { - ereport(WARNING, - (errmsg("removing stale two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - continue; - } + /* + * Reconstruct subtrans state for the transaction --- needed because + * pg_subtrans is not preserved over a restart. 
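
The comment continued below makes the key simplification explicit: recovery links every subtransaction directly to the top-level XID instead of rebuilding the original nesting, because visibility checks only ever ask for the topmost ancestor. A toy sketch of the flattening (not PostgreSQL source):

#include <stdint.h>

typedef uint32_t TransactionId;

/*
 * parent_of[] plays the role of pg_subtrans.  If 102 was originally a
 * child of 101, itself a child of top-level 100, both entries point
 * straight at 100 afterwards -- and topmost-ancestor lookups still
 * return 100, exactly as they would for the original hierarchy.
 */
static void
flatten_subxids(TransactionId top_xid,
                const TransactionId *subxids, int nsubxacts,
                TransactionId *parent_of)
{
    int         i;

    for (i = 0; i < nsubxacts; i++)
        parent_of[subxids[i]] = top_xid;
}
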
Note that we are + * linking all the subtransactions directly to the top-level XID; + * there may originally have been a more complex hierarchy, but + * there's no need to restore that exactly. It's possible that + * SubTransSetParent has been set before, if the prepared transaction + * generated xid assignment records. + */ + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, true, false); + if (buf == NULL) + continue; - /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, true); - if (buf == NULL) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file \"%s\"", - clde->d_name))); - RemoveTwoPhaseFile(xid, true); - continue; - } + ereport(LOG, + (errmsg("recovering prepared transaction %u from shared memory", xid))); + + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + bufptr += MAXALIGN(hdr->gidlen); + subxids = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); - ereport(LOG, - (errmsg("recovering prepared transaction %u", xid))); - - /* Deconstruct header */ - hdr = (TwoPhaseFileHeader *) buf; - Assert(TransactionIdEquals(hdr->xid, xid)); - bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); - gid = (const char *) bufptr; - bufptr += MAXALIGN(hdr->gidlen); - subxids = (TransactionId *) bufptr; - bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); - bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); - bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); - bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + /* + * Recreate its GXACT and dummy PGPROC. But, check whether it was + * added in redo and already has a shmem entry for it. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + MarkAsPreparingGuts(gxact, xid, gid, + hdr->prepared_at, + hdr->owner, hdr->database); - /* - * It's possible that SubTransSetParent has been set before, if - * the prepared transaction generated xid assignment records. Test - * here must match one used in AssignTransactionId(). - */ - if (InHotStandby && (hdr->nsubxacts >= PGPROC_MAX_CACHED_SUBXIDS || - XLogLogicalInfoActive())) - overwriteOK = true; + /* recovered, so reset the flag for entries generated by redo */ + gxact->inredo = false; - /* - * Reconstruct subtrans state for the transaction --- needed - * because pg_subtrans is not preserved over a restart. Note that - * we are linking all the subtransactions directly to the - * top-level XID; there may originally have been a more complex - * hierarchy, but there's no need to restore that exactly. 
- */ - for (i = 0; i < hdr->nsubxacts; i++) - SubTransSetParent(subxids[i], xid, overwriteOK); + LWLockRelease(TwoPhaseStateLock); - /* - * Recreate its GXACT and dummy PGPROC - */ - gxact = MarkAsPreparing(xid, gid, - hdr->prepared_at, - hdr->owner, hdr->database); - gxact->ondisk = true; - GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); - MarkAsPrepared(gxact); + GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); + MarkAsPrepared(gxact); - /* - * Recover other state (notably locks) using resource managers - */ - ProcessRecords(bufptr, xid, twophase_recover_callbacks); + /* + * Recover other state (notably locks) using resource managers + */ + ProcessRecords(bufptr, xid, twophase_recover_callbacks); - /* - * Release locks held by the standby process after we process each - * prepared transaction. As a result, we don't need too many - * additional locks at any one time. - */ - if (InHotStandby) - StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + /* + * Release locks held by the standby process after we process each + * prepared transaction. As a result, we don't need too many + * additional locks at any one time. + */ + if (InHotStandby) + StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + + /* + * We're done with recovering this transaction. Clear MyLockedGxact, + * like we do in PrepareTransaction() during normal operation. + */ + PostPrepare_Twophase(); + + pfree(buf); + } +} + +/* + * ProcessTwoPhaseBuffer + * + * Given a transaction id, read it either from disk or read it directly + * via shmem xlog record pointer using the provided "prepare_start_lsn". + * + * If setParent is true, set up subtransaction parent linkages. + * + * If setNextXid is true, set ShmemVariableCache->nextXid to the newest + * value scanned. + */ +static char * +ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, + bool setParent, bool setNextXid) +{ + TransactionId origNextXid = ShmemVariableCache->nextXid; + TransactionId *subxids; + char *buf; + TwoPhaseFileHeader *hdr; + int i; + + if (!fromdisk) + Assert(prepare_start_lsn != InvalidXLogRecPtr); + + /* Already processed? 
*/ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing stale two-phase state file for \"%u\"", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing stale two-phase state from" + " shared memory for \"%u\"", xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + /* Reject XID if too new */ + if (TransactionIdFollowsOrEquals(xid, origNextXid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing future two-phase state file for \"%u\"", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing future two-phase state from memory for \"%u\"", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + if (fromdisk) + { + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, true); + if (buf == NULL) + { + ereport(WARNING, + (errmsg("removing corrupt two-phase state file for \"%u\"", + xid))); + RemoveTwoPhaseFile(xid, true); + return NULL; + } + } + else + { + /* Read xlog data */ + XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL); + } + + /* Deconstruct header */ + hdr = (TwoPhaseFileHeader *) buf; + if (!TransactionIdEquals(hdr->xid, xid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing corrupt two-phase state file for \"%u\"", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing corrupt two-phase state from memory for \"%u\"", + xid))); + PrepareRedoRemove(xid, true); + } + pfree(buf); + return NULL; + } + + /* + * Examine subtransaction XIDs ... they should all follow main XID, and + * they may force us to advance nextXid. + */ + subxids = (TransactionId *) (buf + + MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(hdr->gidlen)); + for (i = 0; i < hdr->nsubxacts; i++) + { + TransactionId subxid = subxids[i]; + + Assert(TransactionIdFollows(subxid, xid)); + /* update nextXid if needed */ + if (setNextXid && + TransactionIdFollowsOrEquals(subxid, + ShmemVariableCache->nextXid)) + { /* - * We're done with recovering this transaction. Clear - * MyLockedGxact, like we do in PrepareTransaction() during normal - * operation. + * We don't expect anyone else to modify nextXid, hence we don't + * need to hold a lock while examining it. We still acquire the + * lock to modify it, though, so we recheck. */ - PostPrepare_Twophase(); - - pfree(buf); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + if (TransactionIdFollowsOrEquals(subxid, + ShmemVariableCache->nextXid)) + { + ShmemVariableCache->nextXid = subxid; + TransactionIdAdvance(ShmemVariableCache->nextXid); + } + LWLockRelease(XidGenLock); } + + if (setParent) + SubTransSetParent(subxid, xid); } - FreeDir(cldir); + + return buf; } + /* * RecordTransactionCommitPrepared * @@ -2086,11 +2209,16 @@ RecordTransactionCommitPrepared(TransactionId xid, /* See notes in RecordTransactionCommit */ MyPgXact->delayChkpt = true; - /* Emit the XLOG commit record */ + /* + * Emit the XLOG commit record. Note that we mark 2PC commits as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. 
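
The nextXid advance just above is a classic unlocked-check/locked-recheck: the first comparison runs without XidGenLock because nobody else is expected to move nextXid during recovery, but the value is tested again under the lock before being modified. A standalone model of the pattern (pthreads stand in for LWLocks; XID wraparound is ignored):

#include <pthread.h>
#include <stdint.h>

typedef uint32_t TransactionId;

static TransactionId next_xid;          /* models ShmemVariableCache->nextXid */
static pthread_mutex_t xid_gen_lock = PTHREAD_MUTEX_INITIALIZER;

static void
maybe_advance_next_xid(TransactionId seen_xid)
{
    /* Cheap unlocked test first; a miss costs nothing. */
    if (seen_xid >= next_xid)
    {
        pthread_mutex_lock(&xid_gen_lock);
        /* Recheck under the lock before touching shared state. */
        if (seen_xid >= next_xid)
            next_xid = seen_xid + 1;
        pthread_mutex_unlock(&xid_gen_lock);
    }
}
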
+ * + */ recptr = XactLogCommitRecord(committs, nchildren, children, nrels, rels, ninvalmsgs, invalmsgs, initfileinval, false, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, xid); @@ -2123,7 +2251,7 @@ RecordTransactionCommitPrepared(TransactionId xid, /* Flush XLOG to disk */ XLogFlush(recptr); - /* Mark the transaction committed in pg_clog */ + /* Mark the transaction committed in pg_xact */ TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ @@ -2167,10 +2295,15 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); - /* Emit the XLOG abort record */ + /* + * Emit the XLOG abort record. Note that we mark 2PC aborts as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. + */ recptr = XactLogAbortRecord(GetCurrentTimestamp(), nchildren, children, nrels, rels, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, xid); /* Always flush, since we're about to remove the 2PC state file */ @@ -2192,3 +2325,113 @@ RecordTransactionAbortPrepared(TransactionId xid, */ SyncRepWaitForLSN(recptr, false); } + +/* + * PrepareRedoAdd + * + * Store pointers to the start/end of the WAL record along with the xid in + * a gxact entry in the shared memory TwoPhaseState structure. If the caller + * specifies InvalidXLogRecPtr as the WAL location to fetch the two-phase + * data, the entry is marked as located on disk. + */ +void +PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; + char *bufptr; + const char *gid; + GlobalTransaction gxact; + + Assert(RecoveryInProgress()); + + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + + /* + * Reserve the GID for the given transaction in the redo code path. + * + * This creates a gxact struct and puts it into the active array. + * + * In redo, this struct is mainly used to track PREPARE/COMMIT entries in + * shared memory. Hence, we only fill up the bare minimum contents here. + * The gxact also gets marked with gxact->inredo set to true to indicate + * that it got added in the redo phase. + */ + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + gxact->prepared_at = hdr->prepared_at; + gxact->prepare_start_lsn = start_lsn; + gxact->prepare_end_lsn = end_lsn; + gxact->xid = hdr->xid; + gxact->owner = hdr->owner; + gxact->locking_backend = InvalidBackendId; + gxact->valid = false; + gxact->ondisk = XLogRecPtrIsInvalid(start_lsn); + gxact->inredo = true; /* yes, added in redo */ + strcpy(gxact->gid, gid); + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + LWLockRelease(TwoPhaseStateLock); + + elog(DEBUG2, "Adding 2PC data to shared memory %u", gxact->xid); +} + +/* + * PrepareRedoRemove + * + * Remove the corresponding gxact entry from TwoPhaseState. Also + * remove the 2PC file if a prepared transaction was saved via + * an earlier checkpoint. 
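
PrepareRedoAdd() derives gxact->ondisk from its arguments instead of taking a separate flag: an invalid start LSN means the state was read back from pg_twophase, a valid one means it still lives only in WAL. Both call shapes appear in this patch:

/* PREPARE redo: the record body is in WAL, so real start/end LSNs are
 * passed and gxact->ondisk ends up false. */
PrepareRedoAdd(XLogRecGetData(record),
               record->ReadRecPtr,
               record->EndRecPtr);

/* Initial pg_twophase scan: the data came from disk, so invalid LSNs
 * are passed and gxact->ondisk ends up true. */
PrepareRedoAdd(buf, InvalidXLogRecPtr, InvalidXLogRecPtr);
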
+ */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + GlobalTransaction gxact = NULL; + int i; + bool found = false; + + Assert(RecoveryInProgress()); + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + Assert(gxact->inredo); + found = true; + break; + } + } + LWLockRelease(TwoPhaseStateLock); + + /* + * Just leave if there is nothing, this is expected during WAL replay. + */ + if (!found) + return; + + /* + * And now we can clean up any files we may have left. + */ + elog(DEBUG2, "Removing 2PC data from shared memory %u", xid); + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, giveWarning); + RemoveGXact(gxact); + + return; +} diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c index 9f56e61e54..cdcc382f34 100644 --- a/src/backend/access/transam/twophase_rmgr.c +++ b/src/backend/access/transam/twophase_rmgr.c @@ -3,7 +3,7 @@ * twophase_rmgr.c * Two-phase-commit resource managers tables * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index a4e67d9fc3..d94a1deeb1 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -5,7 +5,7 @@ * * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 2010-2012 Postgres-XC Development Group - * Copyright (c) 2000-2016, PostgreSQL Global Development Group + * Copyright (c) 2000-2017, PostgreSQL Global Development Group * * IDENTIFICATION * src/backend/access/transam/varsup.c @@ -536,7 +536,28 @@ ReadNewTransactionId(void) } /* - * Determine the last safe XID to allocate given the currently oldest + * Advance the cluster-wide value for the oldest valid clog entry. + * + * We must acquire CLogTruncationLock to advance the oldestClogXid. It's not + * necessary to hold the lock during the actual clog truncation, only when we + * advance the limit, as code looking up arbitrary xids is required to hold + * CLogTruncationLock from when it tests oldestClogXid through to when it + * completes the clog lookup. + */ +void +AdvanceOldestClogXid(TransactionId oldest_datfrozenxid) +{ + LWLockAcquire(CLogTruncationLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(ShmemVariableCache->oldestClogXid, + oldest_datfrozenxid)) + { + ShmemVariableCache->oldestClogXid = oldest_datfrozenxid; + } + LWLockRelease(CLogTruncationLock); +} + +/* + * Determine the last safe XID to allocate using the currently oldest * datfrozenxid (ie, the oldest XID that might exist in any database * of our cluster), and the OID of the (or a) database with that value. */ @@ -676,7 +697,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) * * We primarily check whether oldestXidDB is valid. The cases we have in * mind are that that database was dropped, or the field was reset to zero - * by pg_resetxlog. In either case we should force recalculation of the + * by pg_resetwal. In either case we should force recalculation of the * wrap limit. Also do it if oldestXid is old enough to be forcing * autovacuums or other actions; this ensures we update our state as soon * as possible once extra overhead is being incurred. 
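
The comment on AdvanceOldestClogXid() above prescribes a protocol for readers of arbitrary XIDs: hold CLogTruncationLock from the oldestClogXid test until the clog lookup completes, so a concurrent truncation cannot slip in between. A hypothetical caller following that protocol (illustrative only; it assumes the clog lookup interface TransactionIdGetStatus() from clog.h):

static bool
get_xid_status_safely(TransactionId xid, XidStatus *status)
{
    bool        ok = false;
    XLogRecPtr  lsn;

    LWLockAcquire(CLogTruncationLock, LW_SHARED);
    if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->oldestClogXid))
    {
        /* The clog page for xid cannot be truncated away while we hold
         * the lock, so this lookup is safe. */
        *status = TransactionIdGetStatus(xid, &lsn);
        ok = true;
    }
    LWLockRelease(CLogTruncationLock);
    return ok;
}
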
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 049aabc209..77666c4b80 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6,7 +6,7 @@ * See src/backend/access/transam/README for more information. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -58,9 +58,11 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/syncrep.h" #include "replication/walsender.h" +#include "storage/condition_variable.h" #include "storage/fd.h" #include "storage/lmgr.h" #include "storage/predicate.h" @@ -131,12 +133,13 @@ int nParallelCurrentXids = 0; TransactionId *ParallelCurrentXids; /* - * MyXactAccessedTempRel is set when a temporary relation is accessed. - * We don't allow PREPARE TRANSACTION in that case. (This is global - * so that it can be set from heapam.c.) + * Miscellaneous flag bits to record events which occur on the top level + * transaction. These flags are only persisted in MyXactFlags and are intended + * so we remember to do certain things later on in the transaction. This is + * globally accessible, so can be set from anywhere in the code that requires + * recording flags. */ -bool MyXactAccessedTempRel = false; - +int MyXactFlags; /* * transaction states - transaction state from server perspective @@ -430,7 +433,7 @@ static void TransactionRecordXidWait_Internal(TransactionState s, #endif static void ShowTransactionState(const char *str); -static void ShowTransactionStateRec(TransactionState state); +static void ShowTransactionStateRec(const char *str, TransactionState state); static const char *BlockStateAsString(TBlockState blockState); static const char *TransStateAsString(TransState state); static void PrepareTransaction(void); @@ -693,7 +696,7 @@ AssignTransactionId(TransactionState s) XactTopTransactionId = s->transactionId; if (isSubXact) - SubTransSetParent(s->transactionId, s->parent->transactionId, false); + SubTransSetParent(s->transactionId, s->parent->transactionId); /* * If it's a top-level transaction, the predicate locking system needs to @@ -1303,9 +1306,7 @@ AtStart_Memory(void) TopTransactionContext = AllocSetContextCreate(TopMemoryContext, "TopTransactionContext", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); /* * In a top-level transaction, CurTransactionContext is the same as @@ -1363,9 +1364,7 @@ AtSubStart_Memory(void) */ CurTransactionContext = AllocSetContextCreate(CurTransactionContext, "CurTransactionContext", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); s->curTransactionContext = CurTransactionContext; /* Make the CurTransactionContext active. */ @@ -1495,8 +1494,8 @@ RecordTransactionCommit(void) /* * Mark ourselves as within our "commit critical section". This * forces any concurrent checkpoint to wait until we've updated - * pg_clog. Without this, it is possible for the checkpoint to set - * REDO after the XLOG record but fail to flush the pg_clog update to + * pg_xact. 
Without this, it is possible for the checkpoint to set + * REDO after the XLOG record but fail to flush the pg_xact update to * disk, leading to loss of the transaction commit if the system * crashes a little later. * @@ -1518,6 +1517,7 @@ RecordTransactionCommit(void) nchildren, children, nrels, rels, nmsgs, invalMessages, RelcacheInitFileInval, forceSyncCommit, + MyXactFlags, InvalidTransactionId /* plain commit */ ); if (replorigin) @@ -1885,7 +1885,7 @@ RecordTransactionAbort(bool isSubXact) XactLogAbortRecord(xact_time, nchildren, children, nrels, rels, - InvalidTransactionId); + MyXactFlags, InvalidTransactionId); /* * Report the latest async abort LSN, so that the WAL writer knows to @@ -2158,8 +2158,8 @@ StartTransaction(void) #endif XactIsoLevel = DefaultXactIsoLevel; forceSyncCommit = false; - MyXactAccessedTempRel = false; XactLocalNodePrepared = false; + MyXactFlags = 0; /* * reinitialize within-transaction counters @@ -2498,7 +2498,7 @@ CommitTransaction(void) if (!is_parallel_worker) { /* - * We need to mark our XIDs as committed in pg_clog. This is where we + * We need to mark our XIDs as committed in pg_xact. This is where we * durably commit. */ #ifdef XCP @@ -2636,7 +2636,8 @@ CommitTransaction(void) AtEOXact_ComboCid(); AtEOXact_HashTables(true); AtEOXact_PgStat(true); - AtEOXact_Snapshot(true); + AtEOXact_Snapshot(true, false); + AtEOXact_ApplyLauncher(true); pgstat_report_xact_timestamp(0); CurrentResourceOwner = NULL; @@ -2896,7 +2897,7 @@ PrepareTransaction(void) * cases, such as a temp table created and dropped all within the * transaction. That seems to require much more bookkeeping though. */ - if (MyXactAccessedTempRel) + if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot PREPARE a transaction that has operated on temporary tables"))); @@ -3048,7 +3049,7 @@ PrepareTransaction(void) AtEOXact_ComboCid(); AtEOXact_HashTables(true); /* don't call AtEOXact_PgStat here; we fixed pgstat state above */ - AtEOXact_Snapshot(true); + AtEOXact_Snapshot(true, true); pgstat_report_xact_timestamp(0); CurrentResourceOwner = NULL; @@ -3209,6 +3210,9 @@ AbortTransaction(void) /* Reset WAL record construction state */ XLogResetInsertion(); + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + /* * Also clean up any open wait for lock, since the lock manager will choke * if we try to wait for another lock before doing this. @@ -3276,7 +3280,7 @@ AbortTransaction(void) AtAbort_Twophase(); /* - * Advertise the fact that we aborted in pg_clog (assuming that we got as + * Advertise the fact that we aborted in pg_xact (assuming that we got as * far as assigning an XID to advertise). But if we're inside a parallel * worker, skip this; the user backend must be the one to write the abort * record. @@ -3354,6 +3358,7 @@ AbortTransaction(void) AtEOXact_ComboCid(); AtEOXact_HashTables(false); AtEOXact_PgStat(false); + AtEOXact_ApplyLauncher(false); pgstat_report_xact_timestamp(0); } @@ -3391,7 +3396,8 @@ CleanupTransaction(void) * do abort cleanup processing */ AtCleanup_Portals(); /* now safe to release portal memory */ - AtEOXact_Snapshot(false); /* and release the transaction's snapshots */ + AtEOXact_Snapshot(false, true); /* and release the transaction's + * snapshots */ CurrentResourceOwner = NULL; /* and resource owner */ if (TopTransactionResourceOwner) @@ -3527,7 +3533,7 @@ CommitTransactionCommand(void) * These shouldn't happen. 
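
The MyXactFlags changes in the hunks above and the xinfo changes below form one pipeline: an event such as taking an AccessExclusiveLock is recorded in MyXactFlags, record construction folds the flag into the commit/abort record's xinfo mask, and redo consults the mask so StandbyReleaseLockTree() runs only when there is something to release. A self-contained model of that flow (the bit values here are illustrative, not this tree's definitions):

#include <stdbool.h>
#include <stdint.h>

#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK  (1U << 1)   /* illustrative */
#define XACT_XINFO_HAS_AE_LOCKS                 (1U << 6)   /* illustrative */

static uint32_t my_xact_flags;          /* models MyXactFlags */

/* Lock manager side: remember the event on the top-level transaction. */
static void
note_access_exclusive_lock(void)
{
    my_xact_flags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
}

/* Record construction: fold the flag into the WAL record's xinfo mask. */
static uint32_t
build_xinfo(uint32_t xactflags)
{
    uint32_t    xinfo = 0;

    if (xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)
        xinfo |= XACT_XINFO_HAS_AE_LOCKS;
    return xinfo;
}

/* Redo side: release standby locks only when the bit is present. */
static bool
must_release_standby_locks(uint32_t xinfo)
{
    return (xinfo & XACT_XINFO_HAS_AE_LOCKS) != 0;
}
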
TBLOCK_DEFAULT means the previous * StartTransactionCommand didn't set the STARTED state * appropriately, while TBLOCK_PARALLEL_INPROGRESS should be ended - * by EndParallelWorkerTranaction(), not this function. + * by EndParallelWorkerTransaction(), not this function. */ case TBLOCK_DEFAULT: case TBLOCK_PARALLEL_INPROGRESS: @@ -5490,7 +5496,7 @@ AbortSubTransaction(void) s->parent->subTransactionId); AtSubAbort_Notify(); - /* Advertise the fact that we aborted in pg_clog. */ + /* Advertise the fact that we aborted in pg_xact. */ (void) RecordTransactionAbort(true); /* Post-abort cleanup */ @@ -5812,11 +5818,8 @@ static void ShowTransactionState(const char *str) { /* skip work if message will definitely not be printed */ - if (log_min_messages <= DEBUG3 || client_min_messages <= DEBUG3) - { - elog(DEBUG3, "%s", str); - ShowTransactionStateRec(CurrentTransactionState); - } + if (log_min_messages <= DEBUG5 || client_min_messages <= DEBUG5) + ShowTransactionStateRec(str, CurrentTransactionState); } /* @@ -5824,7 +5827,7 @@ ShowTransactionState(const char *str) * Recursive subroutine for ShowTransactionState */ static void -ShowTransactionStateRec(TransactionState s) +ShowTransactionStateRec(const char *str, TransactionState s) { StringInfoData buf; @@ -5834,17 +5837,18 @@ ShowTransactionStateRec(TransactionState s) { int i; - appendStringInfo(&buf, "%u", s->childXids[0]); + appendStringInfo(&buf, ", children: %u", s->childXids[0]); for (i = 1; i < s->nChildXids; i++) appendStringInfo(&buf, " %u", s->childXids[i]); } if (s->parent) - ShowTransactionStateRec(s->parent); + ShowTransactionStateRec(str, s->parent); /* use ereport to suppress computation if msg will not be printed */ - ereport(DEBUG3, - (errmsg_internal("name: %s; blockState: %13s; state: %7s, xid/subid/cid: %u/%u/%u%s, nestlvl: %d, children: %s", + ereport(DEBUG5, + (errmsg_internal("%s(%d) name: %s; blockState: %s; state: %s, xid/subid/cid: %u/%u/%u%s%s", + str, s->nestingLevel, PointerIsValid(s->name) ? s->name : "unnamed", BlockStateAsString(s->blockState), TransStateAsString(s->state), @@ -5852,7 +5856,7 @@ ShowTransactionStateRec(TransactionState s) (unsigned int) s->subTransactionId, (unsigned int) currentCommandId, currentCommandIdUsed ? 
" (used)" : "", - s->nestingLevel, buf.data))); + buf.data))); pfree(buf.data); } @@ -5972,7 +5976,7 @@ XactLogCommitRecord(TimestampTz commit_time, int nrels, RelFileNode *rels, int nmsgs, SharedInvalidationMessage *msgs, bool relcacheInval, bool forceSync, - TransactionId twophase_xid) + int xactflags, TransactionId twophase_xid) { xl_xact_commit xlrec; xl_xact_xinfo xl_xinfo; @@ -6003,6 +6007,8 @@ XactLogCommitRecord(TimestampTz commit_time, xl_xinfo.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE; if (forceSyncCommit) xl_xinfo.xinfo |= XACT_COMPLETION_FORCE_SYNC_COMMIT; + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; /* * Check if the caller would like to ask standbys for immediate feedback @@ -6100,7 +6106,7 @@ XactLogCommitRecord(TimestampTz commit_time, XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); /* we allow filtering by xacts */ - XLogIncludeOrigin(); + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); return XLogInsert(RM_XACT_ID, info); } @@ -6115,7 +6121,7 @@ XLogRecPtr XactLogAbortRecord(TimestampTz abort_time, int nsubxacts, TransactionId *subxacts, int nrels, RelFileNode *rels, - TransactionId twophase_xid) + int xactflags, TransactionId twophase_xid) { xl_xact_abort xlrec; xl_xact_xinfo xl_xinfo; @@ -6140,6 +6146,9 @@ XactLogAbortRecord(TimestampTz abort_time, xlrec.xact_time = abort_time; + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + if (nsubxacts > 0) { xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; @@ -6239,7 +6248,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, if (standbyState == STANDBY_DISABLED) { /* - * Mark the transaction committed in pg_clog. + * Mark the transaction committed in pg_xact. */ TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts); } @@ -6257,7 +6266,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, RecordKnownAssignedTransactionIds(max_xid); /* - * Mark the transaction committed in pg_clog. We use async commit + * Mark the transaction committed in pg_xact. We use async commit * protocol during recovery to provide information on database * consistency for when users try to set hint bits. It is important * that we do not set hint bits until the minRecoveryPoint is past @@ -6291,7 +6300,8 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, * via their top-level xid only, so no need to provide subxact list, * which will save time when replaying commits. */ - StandbyReleaseLockTree(xid, 0, NULL); + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, 0, NULL); } if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) @@ -6394,7 +6404,7 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) if (standbyState == STANDBY_DISABLED) { - /* Mark the transaction aborted in pg_clog, no need for async stuff */ + /* Mark the transaction aborted in pg_xact, no need for async stuff */ TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); } else @@ -6410,7 +6420,7 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) */ RecordKnownAssignedTransactionIds(max_xid); - /* Mark the transaction aborted in pg_clog, no need for async stuff */ + /* Mark the transaction aborted in pg_xact, no need for async stuff */ TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); /* @@ -6427,7 +6437,8 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) /* * Release locks, if any. There are no invalidations to send. 
*/ - StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); } /* Make sure files supposed to be dropped are dropped */ @@ -6470,7 +6481,9 @@ xact_redo(XLogReaderState *record) Assert(TransactionIdIsValid(parsed.twophase_xid)); xact_redo_commit(&parsed, parsed.twophase_xid, record->EndRecPtr, XLogRecGetOrigin(record)); - RemoveTwoPhaseFile(parsed.twophase_xid, false); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. */ + PrepareRedoRemove(parsed.twophase_xid, false); } } else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) @@ -6490,14 +6503,20 @@ xact_redo(XLogReaderState *record) { Assert(TransactionIdIsValid(parsed.twophase_xid)); xact_redo_abort(&parsed, parsed.twophase_xid); - RemoveTwoPhaseFile(parsed.twophase_xid, false); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. */ + PrepareRedoRemove(parsed.twophase_xid, false); } } else if (info == XLOG_XACT_PREPARE) { - /* the record contents are exactly the 2PC file */ - RecreateTwoPhaseFile(XLogRecGetXid(record), - XLogRecGetData(record), XLogRecGetDataLen(record)); + /* + * Store xid and start/end pointers of the WAL record in TwoPhaseState + * gxact entry. + */ + PrepareRedoAdd(XLogRecGetData(record), + record->ReadRecPtr, + record->EndRecPtr); } else if (info == XLOG_XACT_ASSIGNMENT) { diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 19b4921075..b29f283e6a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * * xlog.c - * PostgreSQL transaction log manager + * PostgreSQL write-ahead log manager * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/xlog.c @@ -15,6 +15,7 @@ #include "postgres.h" #include <ctype.h> +#include <math.h> #include <time.h> #include <fcntl.h> #include <sys/stat.h> @@ -44,6 +45,7 @@ #include "pgxc/barrier.h" #endif #include "pgstat.h" +#include "port/atomics.h" #include "postmaster/bgwriter.h" #include "postmaster/walwriter.h" #include "postmaster/startup.h" @@ -54,7 +56,6 @@ #include "replication/snapbuild.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "storage/barrier.h" #include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -67,9 +68,11 @@ #include "storage/reinit.h" #include "storage/smgr.h" #include "storage/spin.h" +#include "utils/backend_random.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" +#include "utils/pg_lsn.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -86,8 +89,8 @@ extern uint32 bootstrap_data_checksum_version; /* User-settable parameters */ -int max_wal_size = 64; /* 1 GB */ -int min_wal_size = 5; /* 80 MB */ +int max_wal_size_mb = 1024; /* 1 GB */ +int min_wal_size_mb = 80; /* 80 MB */ int wal_keep_segments = 0; int XLOGbuffers = -1; int XLogArchiveTimeout = 0; @@ -97,6 +100,8 @@ bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; bool wal_compression = false; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; bool log_checkpoints = false; int sync_method = 
DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; @@ -237,9 +242,9 @@ static int LocalXLogInsertAllowed = -1; * valid in the startup process. * * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're - * currently performing crash recovery using only XLOG files in pg_xlog, but + * currently performing crash recovery using only XLOG files in pg_wal, but * will switch to using offline XLOG archives as soon as we reach the end of - * WAL in pg_xlog. + * WAL in pg_wal. */ bool ArchiveRecoveryRequested = false; bool InArchiveRecovery = false; @@ -247,6 +252,10 @@ bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *master_image_masked = NULL; + /* options taken from recovery.conf for archive recovery */ char *recoveryRestoreCommand = NULL; static char *recoveryEndCommand = NULL; @@ -258,6 +267,7 @@ static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static char *recoveryTargetBarrierId; static char *recoveryTargetName; +static XLogRecPtr recoveryTargetLSN; static int recovery_min_apply_delay = 0; static TimestampTz recoveryDelayUntilTime; @@ -279,6 +289,7 @@ static bool fast_promote = false; */ static TransactionId recoveryStopXid; static TimestampTz recoveryStopTime; +static XLogRecPtr recoveryStopLSN; static char recoveryStopName[MAXFNAMELEN]; static bool recoveryStopAfter; @@ -443,11 +454,21 @@ typedef struct XLogwrtResult * the WAL record is just copied to the page and the lock is released. But * to avoid the deadlock-scenario explained above, the indicator is always * updated before sleeping while holding an insertion lock. + * + * lastImportantAt contains the LSN of the last important WAL record inserted + * using a given lock. This value is used to detect if there has been + * important WAL activity since the last time some action, like a checkpoint, + * was performed, allowing the action to be skipped if there has been none. The LSN is + * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was + * set. lastImportantAt is never cleared, only overwritten by the LSN of newer + * records. Tracking the WAL activity directly in WALInsertLock has the + * advantage of not needing any additional locks to update the value. */ typedef struct { LWLock lock; XLogRecPtr insertingAt; + XLogRecPtr lastImportantAt; } WALInsertLock; /* @@ -464,6 +485,35 @@ typedef union WALInsertLockPadded } WALInsertLockPadded; /* + * State of an exclusive backup, necessary to control concurrent activities + * across sessions when working on exclusive backups. + * + * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually + * running; to be more precise, pg_start_backup() is not being executed for + * an exclusive backup and there is no exclusive backup in progress. + * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an + * exclusive backup. + * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished + * running and an exclusive backup is in progress. pg_stop_backup() is + * needed to finish it. + * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an + * exclusive backup. 
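
Given the lastImportantAt bookkeeping documented above, the consumer side is just a scan over the insertion locks: take the maximum and compare it against the previous checkpoint's position to decide whether any important WAL has appeared since. A sketch of that consumer, close in shape to what the patch enables (each slot is read under its lock so the 8-byte LSN is read consistently on every platform):

static XLogRecPtr
get_last_important_rec_ptr(void)
{
    XLogRecPtr  res = InvalidXLogRecPtr;
    int         i;

    for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    {
        XLogRecPtr  last_important;

        LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
        last_important = WALInsertLocks[i].l.lastImportantAt;
        LWLockRelease(&WALInsertLocks[i].l.lock);

        if (res < last_important)
            res = last_important;
    }
    return res;
}
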
+ */ +typedef enum ExclusiveBackupState +{ + EXCLUSIVE_BACKUP_NONE = 0, + EXCLUSIVE_BACKUP_STARTING, + EXCLUSIVE_BACKUP_IN_PROGRESS, + EXCLUSIVE_BACKUP_STOPPING +} ExclusiveBackupState; + +/* + * Session status of running backup, used for sanity checks in SQL-callable + * functions to start and stop backups. + */ +static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE; + +/* * Shared state data for WAL insertion. */ typedef struct XLogCtlInsert @@ -504,13 +554,14 @@ typedef struct XLogCtlInsert bool fullPageWrites; /* - * exclusiveBackup is true if a backup started with pg_start_backup() is - * in progress, and nonExclusiveBackups is a counter indicating the number - * of streaming base backups currently in progress. forcePageWrites is set - * to true when either of these is non-zero. lastBackupStart is the latest - * checkpoint redo location used as a starting point for an online backup. + * exclusiveBackupState indicates the state of an exclusive backup (see + * comments of ExclusiveBackupState for more details). nonExclusiveBackups + * is a counter indicating the number of streaming base backups currently + * in progress. forcePageWrites is set to true when either of these is + * non-zero. lastBackupStart is the latest checkpoint redo location used + * as a starting point for an online backup. */ - bool exclusiveBackup; + ExclusiveBackupState exclusiveBackupState; int nonExclusiveBackups; XLogRecPtr lastBackupStart; @@ -518,7 +569,6 @@ typedef struct XLogCtlInsert * WAL insertion locks. */ WALInsertLockPadded *WALInsertLocks; - LWLockTranche WALInsertLockTranche; } XLogCtlInsert; /* @@ -543,8 +593,9 @@ typedef struct XLogCtlData XLogRecPtr unloggedLSN; slock_t ulsn_lck; - /* Time of last xlog segment switch. Protected by WALWriteLock. */ + /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */ pg_time_t lastSegSwitchTime; + XLogRecPtr lastSegSwitchLSN; /* * Protected by info_lck and WALWriteLock (you must hold either lock to @@ -616,11 +667,14 @@ typedef struct XLogCtlData /* * During recovery, we keep a copy of the latest checkpoint record here. - * Used by the background writer when it wants to create a restartpoint. + * lastCheckPointRecPtr points to start of checkpoint record and + * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the + * checkpointer when it wants to create a restartpoint. * * Protected by info_lck. */ XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; CheckPoint lastCheckPoint; /* @@ -687,6 +741,10 @@ static ControlFileData *ControlFile = NULL; #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD) #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD)) +/* Convert min_wal_size_mb and max wal_size_mb to equivalent segment count */ +#define ConvertToXSegs(x) \ + (x / (XLOG_SEG_SIZE / (1024 * 1024))) + /* * Private, possibly out-of-date copy of shared LogwrtResult. * See discussion above. 
@@ -701,12 +759,12 @@ typedef enum { XLOG_FROM_ANY = 0, /* request to read WAL from any source */ XLOG_FROM_ARCHIVE, /* restored using restore_command */ - XLOG_FROM_PG_XLOG, /* existing file in pg_xlog */ + XLOG_FROM_PG_WAL, /* existing file in pg_wal */ XLOG_FROM_STREAM /* streamed from master */ } XLogSource; /* human-readable names for XLogSources, for debugging output */ -static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"}; +static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; /* * openLogFile is -1 or a kernel FD for an open log file segment. @@ -814,7 +872,7 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, bool find_free, XLogSegNo max_segno, bool use_lock); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, - int source, bool notexistOk); + int source, bool notfoundOk); static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source); static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf, @@ -846,6 +904,7 @@ static void xlog_outrec(StringInfo buf, XLogReaderState *record); #endif static void xlog_outdesc(StringInfo buf, XLogReaderState *record); static void pg_start_backup_callback(int code, Datum arg); +static void pg_stop_backup_callback(int code, Datum arg); static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby); static bool read_tablespace_map(List **tablespaces); @@ -865,6 +924,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr); static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); +static void checkXLogConsistency(XLogReaderState *record); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -883,6 +943,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * which pages need a full-page image, and retry. If fpw_lsn is invalid, the * record is always inserted. * + * 'flags' gives more in-depth control on the record being inserted. See + * XLogSetRecordFlags() for details. + * * The first XLogRecData in the chain must be for the record header, and its * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and * xl_crc fields in the header, the rest of the header must already be filled @@ -895,14 +958,17 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * WAL rule "write the log before the data".) */ XLogRecPtr -XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) +XLogInsertRecord(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags) { XLogCtlInsert *Insert = &XLogCtl->Insert; pg_crc32c rdata_crc; bool inserted; XLogRecord *rechdr = (XLogRecord *) rdata->data; + uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && - rechdr->xl_info == XLOG_SWITCH); + info == XLOG_SWITCH); XLogRecPtr StartPos; XLogRecPtr EndPos; @@ -1011,6 +1077,18 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) */ CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, StartPos, EndPos); + + /* + * Unless record is flagged as not important, update LSN of last + * important record in the current slot. When holding all locks, just + * update the first one. + */ + if ((flags & XLOG_MARK_UNIMPORTANT) == 0) + { + int lockno = holdingAllLocks ? 
0 : MyLockNo; + + WALInsertLocks[lockno].l.lastImportantAt = StartPos; + } } else { @@ -1051,7 +1129,7 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) */ if (isLogSwitch) { - TRACE_POSTGRESQL_XLOG_SWITCH(); + TRACE_POSTGRESQL_WAL_SWITCH(); XLogFlush(EndPos); /* @@ -1259,6 +1337,114 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) } /* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +checkXLogConsistency(XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. + */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + if (XLogRecBlockImageApply(record, block_id)) + { + /* + * WAL record has already applied the page, so bypass the + * consistency check as that would result in comparing the full + * page stored in the record with itself. + */ + continue; + } + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is + * restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is no need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. + */ + if (!RestoreBlockImage(record, block_id, master_image_masked)) + elog(ERROR, "failed to restore block image"); + + /* + * If masking function is defined, mask both the master and replay + * images + */ + if (RmgrTable[rmid].rm_mask != NULL) + { + RmgrTable[rmid].rm_mask(replay_image_masked, blkno); + RmgrTable[rmid].rm_mask(master_image_masked, blkno); + } + + /* Time to compare the master and replay images. */ + if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + +/* * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved * area in the WAL. 
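 * (Calling pattern, roughly: XLogInsertRecord() first reserves the space
 * with ReserveXLogInsertLocation() or ReserveXLogSwitch(), then calls
 * this routine to copy the record data into the reserved WAL bytes.)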
*/ @@ -2022,7 +2208,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) } /* - * Calculate CheckPointSegments based on max_wal_size and + * Calculate CheckPointSegments based on max_wal_size_mb and * checkpoint_completion_target. */ static void @@ -2032,14 +2218,14 @@ CalculateCheckpointSegments(void) /*------- * Calculate the distance at which to trigger a checkpoint, to avoid - * exceeding max_wal_size. This is based on two assumptions: + * exceeding max_wal_size_mb. This is based on two assumptions: * * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint. * b) during checkpoint, we consume checkpoint_completion_target * * number of segments consumed between checkpoints. *------- */ - target = (double) max_wal_size / (2.0 + CheckPointCompletionTarget); + target = (double) ConvertToXSegs(max_wal_size_mb) / (2.0 + CheckPointCompletionTarget); /* round down */ CheckPointSegments = (int) target; @@ -2051,7 +2237,7 @@ CalculateCheckpointSegments(void) void assign_max_wal_size(int newval, void *extra) { - max_wal_size = newval; + max_wal_size_mb = newval; CalculateCheckpointSegments(); } @@ -2075,12 +2261,12 @@ XLOGfileslop(XLogRecPtr PriorRedoPtr) XLogSegNo recycleSegNo; /* - * Calculate the segment numbers that min_wal_size and max_wal_size + * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb * correspond to. Always recycle enough segments to meet the minimum, and * remove enough segments to stay below the maximum. */ - minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1; - maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1; + minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + ConvertToXSegs(min_wal_size_mb) - 1; + maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + ConvertToXSegs(max_wal_size_mb) - 1; /* * Between those limits, recycle enough segments to get us through to the @@ -2284,7 +2470,9 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) do { errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); written = write(openLogFile, from, nleft); + pgstat_report_wait_end(); if (written <= 0) { if (errno == EINTR) @@ -2330,6 +2518,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) XLogArchiveNotifySeg(openLogSegNo); XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush; /* * Request a checkpoint if we've consumed too much xlog since @@ -2756,7 +2945,7 @@ XLogFlush(XLogRecPtr record) * This routine is invoked periodically by the background walwriter process. * * Returns TRUE if there was any work to do, even if we skipped flushing due - * to wal_writer_delay/wal_flush_after. + * to wal_writer_delay/wal_writer_flush_after. 
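+ *
+ * (For example, the walwriter can count consecutive "no work" results
+ * when deciding whether to hibernate between rounds.)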
*/ bool XLogBackgroundFlush(void) @@ -3034,6 +3223,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ) { errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ) { int save_errno = errno; @@ -3052,8 +3242,10 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); } + pgstat_report_wait_end(); } + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); if (pg_fsync(fd) != 0) { close(fd); @@ -3061,6 +3253,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); } + pgstat_report_wait_end(); if (close(fd)) ereport(ERROR, @@ -3187,6 +3380,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, if (nread > sizeof(buffer)) nread = sizeof(buffer); errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ); if (read(srcfd, buffer, nread) != nread) { if (errno != 0) @@ -3199,8 +3393,10 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, (errmsg("not enough data in file \"%s\"", path))); } + pgstat_report_wait_end(); } errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE); if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer)) { int save_errno = errno; @@ -3216,12 +3412,15 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath))); } + pgstat_report_wait_end(); } + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC); if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); if (CloseTransientFile(fd)) ereport(ERROR, @@ -3284,7 +3483,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, if (!find_free) { /* Force installation: get rid of any pre-existing segment file */ - unlink(path); + durable_unlink(path, DEBUG1); } else { @@ -3337,7 +3536,7 @@ XLogFileOpen(XLogSegNo segno) if (fd < 0) ereport(PANIC, (errcode_for_file_access(), - errmsg("could not open transaction log file \"%s\": %m", path))); + errmsg("could not open write-ahead log file \"%s\": %m", path))); return fd; } @@ -3346,7 +3545,7 @@ XLogFileOpen(XLogSegNo segno) * Open a logfile segment for reading (during recovery). * * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. - * Otherwise, it's assumed to be already available in pg_xlog. + * Otherwise, it's assumed to be already available in pg_wal. */ static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, @@ -3375,7 +3574,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, return -1; break; - case XLOG_FROM_PG_XLOG: + case XLOG_FROM_PG_WAL: case XLOG_FROM_STREAM: XLogFilePath(path, tli, segno); restoredFromArchive = false; @@ -3394,7 +3593,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, KeepFileRestoredFromArchive(path, xlogfname); /* - * Set path to point at the new file in pg_xlog. + * Set path to point at the new file in pg_wal. 
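+ * (This is an ordinary segment path such as
+ * "pg_wal/000000010000000000000002": timeline ID, then the segment
+ * number, all in hexadecimal.)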
*/ snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); } @@ -3482,10 +3681,10 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source) } } - if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG) + if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) { fd = XLogFileRead(segno, emode, tli, - XLOG_FROM_PG_XLOG, true); + XLOG_FROM_PG_WAL, true); if (fd != -1) { if (!expectedTLEs) @@ -3643,7 +3842,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) if (xldir == NULL) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open transaction log directory \"%s\": %m", + errmsg("could not open write-ahead log directory \"%s\": %m", XLOGDIR))); /* @@ -3694,10 +3893,10 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) * * This is called during recovery, whenever we switch to follow a new * timeline, and at the end of recovery when we create a new timeline. We - * wouldn't otherwise care about extra WAL files lying in pg_xlog, but they + * wouldn't otherwise care about extra WAL files lying in pg_wal, but they * might be leftover pre-allocated or recycled WAL segments on the old timeline * that we haven't used yet, and contain garbage. If we just leave them in - * pg_xlog, they will eventually be archived, and we can't let that happen. + * pg_wal, they will eventually be archived, and we can't let that happen. * Files that belong to our timeline history are valid, because we have * successfully replayed them, but from others we can't be sure. * @@ -3718,7 +3917,7 @@ RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI) if (xldir == NULL) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open transaction log directory \"%s\": %m", + errmsg("could not open write-ahead log directory \"%s\": %m", XLOGDIR))); /* @@ -3799,7 +3998,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) true, recycleSegNo, true)) { ereport(DEBUG2, - (errmsg("recycled transaction log file \"%s\"", + (errmsg("recycled write-ahead log file \"%s\"", segname))); CheckpointStats.ckpt_segs_recycled++; /* Needn't recheck that slot on future iterations */ @@ -3811,7 +4010,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) int rc; ereport(DEBUG2, - (errmsg("removing transaction log file \"%s\"", + (errmsg("removing write-ahead log file \"%s\"", segname))); #ifdef WIN32 @@ -3831,20 +4030,17 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) { ereport(LOG, (errcode_for_file_access(), - errmsg("could not rename old transaction log file \"%s\": %m", + errmsg("could not rename old write-ahead log file \"%s\": %m", path))); return; } - rc = unlink(newpath); + rc = durable_unlink(newpath, LOG); #else - rc = unlink(path); + rc = durable_unlink(path, LOG); #endif if (rc != 0) { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not remove old transaction log file \"%s\": %m", - path))); + /* Message already logged by durable_unlink() */ return; } CheckpointStats.ckpt_segs_removed++; @@ -3854,15 +4050,15 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) } /* - * Verify whether pg_xlog and pg_xlog/archive_status exist. + * Verify whether pg_wal and pg_wal/archive_status exist. * If the latter does not exist, recreate it. 
* * It is not the goal of this function to verify the contents of these * directories, but to help in cases where someone has performed a cluster - * copy for PITR purposes but omitted pg_xlog from the copy. + * copy for PITR purposes but omitted pg_wal from the copy. * - * We could also recreate pg_xlog if it doesn't exist, but a deliberate - * policy decision was made not to. It is fairly common for pg_xlog to be + * We could also recreate pg_wal if it doesn't exist, but a deliberate + * policy decision was made not to. It is fairly common for pg_wal to be * a symlink, and if that was the DBA's intent then automatically making a * plain directory would result in degraded performance with no notice. */ @@ -3872,7 +4068,7 @@ ValidateXLOGDirectoryStructure(void) char path[MAXPGPATH]; struct stat stat_buf; - /* Check for pg_xlog; if it doesn't exist, error out */ + /* Check for pg_wal; if it doesn't exist, error out */ if (stat(XLOGDIR, &stat_buf) != 0 || !S_ISDIR(stat_buf.st_mode)) ereport(FATAL, @@ -3910,13 +4106,13 @@ CleanupBackupHistory(void) { DIR *xldir; struct dirent *xlde; - char path[MAXPGPATH]; + char path[MAXPGPATH + sizeof(XLOGDIR)]; xldir = AllocateDir(XLOGDIR); if (xldir == NULL) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open transaction log directory \"%s\": %m", + errmsg("could not open write-ahead log directory \"%s\": %m", XLOGDIR))); while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) @@ -3925,10 +4121,9 @@ CleanupBackupHistory(void) { if (XLogArchiveCheckDone(xlde->d_name)) { - ereport(DEBUG2, - (errmsg("removing transaction log backup history file \"%s\"", - xlde->d_name))); - snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name); + elog(DEBUG2, "removing WAL backup history file \"%s\"", + xlde->d_name); + snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name); unlink(path); XLogArchiveCleanup(xlde->d_name); } @@ -4028,11 +4223,11 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, * If archive recovery was requested, but we were still doing * crash recovery, switch to archive recovery and retry using the * offline archive. We have now replayed all the valid WAL in - * pg_xlog, so we are presumably now consistent. + * pg_wal, so we are presumably now consistent. * * We require that there's at least some valid WAL present in - * pg_xlog, however (!fetch_ckpt). We could recover using the WAL - * from the archive, even if pg_xlog is completely empty, but we'd + * pg_wal, however (!fetch_ckpt). We could recover using the WAL + * from the archive, even if pg_wal is completely empty, but we'd * have no idea how far we'd have to replay to reach consistency. * So err on the safe side and give up. */ @@ -4040,7 +4235,7 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, !fetching_ckpt) { ereport(DEBUG1, - (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery"))); + (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); InArchiveRecovery = true; if (StandbyModeRequested) StandbyMode = true; @@ -4157,7 +4352,7 @@ rescanLatestTimeLine(void) /* * As in StartupXLOG(), try to ensure we have all the history files - * between the old target and new target in pg_xlog. + * between the old target and new target in pg_wal. 
*/ restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); @@ -4208,11 +4403,6 @@ WriteControlFile(void) ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE; ControlFile->loblksize = LOBLKSIZE; -#ifdef HAVE_INT64_TIMESTAMP - ControlFile->enableIntTimes = true; -#else - ControlFile->enableIntTimes = false; -#endif ControlFile->float4ByVal = FLOAT4PASSBYVAL; ControlFile->float8ByVal = FLOAT8PASSBYVAL; @@ -4246,6 +4436,7 @@ WriteControlFile(void) XLOG_CONTROL_FILE))); errno = 0; + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE); if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE) { /* if write didn't set errno, assume problem is no disk space */ @@ -4255,11 +4446,14 @@ WriteControlFile(void) (errcode_for_file_access(), errmsg("could not write to control file: %m"))); } + pgstat_report_wait_end(); + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC); if (pg_fsync(fd) != 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %m"))); + pgstat_report_wait_end(); if (close(fd)) ereport(PANIC, @@ -4285,10 +4479,12 @@ ReadControlFile(void) errmsg("could not open control file \"%s\": %m", XLOG_CONTROL_FILE))); + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ); if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData)) ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m"))); + pgstat_report_wait_end(); close(fd); @@ -4408,22 +4604,6 @@ ReadControlFile(void) ControlFile->loblksize, (int) LOBLKSIZE), errhint("It looks like you need to recompile or initdb."))); -#ifdef HAVE_INT64_TIMESTAMP - if (ControlFile->enableIntTimes != true) - ereport(FATAL, - (errmsg("database files are incompatible with server"), - errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP" - " but the server was compiled with HAVE_INT64_TIMESTAMP."), - errhint("It looks like you need to recompile or initdb."))); -#else - if (ControlFile->enableIntTimes != false) - ereport(FATAL, - (errmsg("database files are incompatible with server"), - errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP" - " but the server was compiled without HAVE_INT64_TIMESTAMP."), - errhint("It looks like you need to recompile or initdb."))); -#endif - #ifdef USE_FLOAT4_BYVAL if (ControlFile->float4ByVal != true) ereport(FATAL, @@ -4482,6 +4662,7 @@ UpdateControlFile(void) XLOG_CONTROL_FILE))); errno = 0; + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE); if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData)) { /* if write didn't set errno, assume problem is no disk space */ @@ -4491,11 +4672,14 @@ UpdateControlFile(void) (errcode_for_file_access(), errmsg("could not write to control file: %m"))); } + pgstat_report_wait_end(); + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE); if (pg_fsync(fd) != 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %m"))); + pgstat_report_wait_end(); if (close(fd)) ereport(PANIC, @@ -4514,6 +4698,16 @@ GetSystemIdentifier(void) } /* + * Returns the random nonce from control file. + */ +char * +GetMockAuthenticationNonce(void) +{ + Assert(ControlFile != NULL); + return ControlFile->mock_authentication_nonce; +} + +/* * Are checksums enabled for data pages? 
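 * (The answer comes from pg_control, so it is fixed when the cluster is
 * initialized with initdb and cannot be changed afterwards through a
 * GUC.)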
*/ bool @@ -4667,9 +4861,7 @@ XLOGShmemInit(void) { walDebugCxt = AllocSetContextCreate(TopMemoryContext, "WAL Debug", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); MemoryContextAllowInCriticalSection(walDebugCxt, true); } #endif @@ -4687,7 +4879,7 @@ XLOGShmemInit(void) /* Initialize local copy of WALInsertLocks and register the tranche */ WALInsertLocks = XLogCtl->Insert.WALInsertLocks; LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, - &XLogCtl->Insert.WALInsertLockTranche); + "wal_insert"); return; } memset(XLogCtl, 0, sizeof(XLogCtlData)); @@ -4710,15 +4902,12 @@ XLOGShmemInit(void) (WALInsertLockPadded *) allocptr; allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS; - XLogCtl->Insert.WALInsertLockTranche.name = "wal_insert"; - XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks; - XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded); - - LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, &XLogCtl->Insert.WALInsertLockTranche); + LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert"); for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) { LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; + WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr; } /* @@ -4768,6 +4957,7 @@ BootStrapXLOG(void) char *recptr; bool use_existent; uint64 sysidentifier; + char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; struct timeval tv; pg_crc32c crc; @@ -4788,6 +4978,17 @@ BootStrapXLOG(void) sysidentifier |= ((uint64) tv.tv_usec) << 12; sysidentifier |= getpid() & 0xFFF; + /* + * Generate a random nonce. This is used for authentication requests that + * will fail because the user does not exist. The nonce is used to create + * a genuine-looking password challenge for the non-existent user, in lieu + * of an actual stored password. 
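+ * This way the failure path behaves like the success path, so a client
+ * cannot probe which role names exist by watching how the
+ * authentication exchange differs.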
+ */ + if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN)) + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate secret authorization token"))); + /* First timeline ID is always 1 */ ThisTimeLineID = 1; @@ -4825,8 +5026,9 @@ BootStrapXLOG(void) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); /* Set up the XLOG page header */ @@ -4849,7 +5051,7 @@ BootStrapXLOG(void) record->xl_rmid = RM_XLOG_ID; recptr += SizeOfXLogRecord; /* fill the XLogRecordDataHeaderShort struct */ - *(recptr++) = XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT; *(recptr++) = sizeof(checkPoint); memcpy(recptr, &checkPoint, sizeof(checkPoint)); recptr += sizeof(checkPoint); @@ -4867,6 +5069,7 @@ BootStrapXLOG(void) /* Write the first page with the initial record */ errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) { /* if write didn't set errno, assume problem is no disk space */ @@ -4874,18 +5077,21 @@ BootStrapXLOG(void) errno = ENOSPC; ereport(PANIC, (errcode_for_file_access(), - errmsg("could not write bootstrap transaction log file: %m"))); + errmsg("could not write bootstrap write-ahead log file: %m"))); } + pgstat_report_wait_end(); + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC); if (pg_fsync(openLogFile) != 0) ereport(PANIC, (errcode_for_file_access(), - errmsg("could not fsync bootstrap transaction log file: %m"))); + errmsg("could not fsync bootstrap write-ahead log file: %m"))); + pgstat_report_wait_end(); if (close(openLogFile)) ereport(PANIC, (errcode_for_file_access(), - errmsg("could not close bootstrap transaction log file: %m"))); + errmsg("could not close bootstrap write-ahead log file: %m"))); openLogFile = -1; @@ -4894,6 +5100,7 @@ BootStrapXLOG(void) memset(ControlFile, 0, sizeof(ControlFileData)); /* Initialize pg_control status fields */ ControlFile->system_identifier = sysidentifier; + memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN); ControlFile->state = DB_SHUTDOWNED; ControlFile->time = checkPoint.time; ControlFile->checkPoint = checkPoint.redo; @@ -5028,7 +5235,8 @@ readRecoveryCommandFile(void) rtli = (TimeLineID) strtoul(item->value, NULL, 0); if (errno == EINVAL || errno == ERANGE) ereport(FATAL, - (errmsg("recovery_target_timeline is not a valid number: \"%s\"", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery_target_timeline is not a valid number: \"%s\"", item->value))); } if (rtli) @@ -5044,7 +5252,8 @@ readRecoveryCommandFile(void) recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0); if (errno == EINVAL || errno == ERANGE) ereport(FATAL, - (errmsg("recovery_target_xid is not a valid number: \"%s\"", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery_target_xid is not a valid number: \"%s\"", item->value))); ereport(DEBUG2, (errmsg_internal("recovery_target_xid = %u", @@ -5089,6 +5298,23 @@ readRecoveryCommandFile(void) (errmsg_internal("recovery_target_name = '%s'", recoveryTargetName))); } + else if (strcmp(item->name, 
"recovery_target_lsn") == 0) + { + recoveryTarget = RECOVERY_TARGET_LSN; + + /* + * Convert the LSN string given by the user to XLogRecPtr form. + */ + recoveryTargetLSN = + DatumGetLSN(DirectFunctionCall3(pg_lsn_in, + CStringGetDatum(item->value), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + ereport(DEBUG2, + (errmsg_internal("recovery_target_lsn = '%X/%X'", + (uint32) (recoveryTargetLSN >> 32), + (uint32) recoveryTargetLSN))); + } else if (strcmp(item->name, "recovery_target") == 0) { if (strcmp(item->value, "immediate") == 0) @@ -5166,7 +5392,8 @@ readRecoveryCommandFile(void) } else ereport(FATAL, - (errmsg("unrecognized recovery parameter \"%s\"", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized recovery parameter \"%s\"", item->name))); } @@ -5179,13 +5406,14 @@ readRecoveryCommandFile(void) ereport(WARNING, (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command", RECOVERY_COMMAND_FILE), - errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there."))); + errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); } else { if (recoveryRestoreCommand == NULL) ereport(FATAL, - (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled", RECOVERY_COMMAND_FILE))); } @@ -5199,6 +5427,15 @@ readRecoveryCommandFile(void) !EnableHotStandby) recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; + /* + * We don't support standby_mode in standalone backends; that requires + * other processes such as the WAL receiver to be alive. 
+ */ + if (StandbyModeRequested && !IsUnderPostmaster) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("standby mode is not supported by single-user servers"))); + /* Enable fetching from archive recovery area */ ArchiveRecoveryRequested = true; @@ -5215,7 +5452,8 @@ readRecoveryCommandFile(void) /* Timeline 1 does not have a history file, all else should */ if (rtli != 1 && !existsTimeLineHistory(rtli)) ereport(FATAL, - (errmsg("recovery target timeline %u does not exist", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery target timeline %u does not exist", rtli))); recoveryTargetTLI = rtli; recoveryTargetIsLatest = false; @@ -5404,8 +5642,26 @@ recoveryStopsBefore(XLogReaderState *record) recoveryStopAfter = false; recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + return true; + } + + /* Check if target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + !recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; recoveryStopTime = 0; recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", + (uint32) (recoveryStopLSN >> 32), + (uint32) recoveryStopLSN))); return true; } #ifdef PGXC @@ -5520,6 +5776,7 @@ recoveryStopsBefore(XLogReaderState *record) recoveryStopAfter = false; recoveryStopXid = recordXid; recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; recoveryStopName[0] = '\0'; if (isCommit) @@ -5584,6 +5841,7 @@ recoveryStopsAfter(XLogReaderState *record) { recoveryStopAfter = true; recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; (void) getRecordTimestamp(record, &recoveryStopTime); strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); @@ -5595,6 +5853,23 @@ recoveryStopsAfter(XLogReaderState *record) } } + /* Check if the target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", + (uint32) (recoveryStopLSN >> 32), + (uint32) recoveryStopLSN))); + return true; + } + if (rmid != RM_XACT_ID) return false; @@ -5650,6 +5925,7 @@ recoveryStopsAfter(XLogReaderState *record) recoveryStopAfter = true; recoveryStopXid = recordXid; recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; recoveryStopName[0] = '\0'; if (xact_info == XLOG_XACT_COMMIT || @@ -5681,6 +5957,7 @@ recoveryStopsAfter(XLogReaderState *record) recoveryStopAfter = true; recoveryStopXid = InvalidTransactionId; recoveryStopTime = 0; + recoveryStopLSN = InvalidXLogRecPtr; recoveryStopName[0] = '\0'; return true; } @@ -5704,7 +5981,7 @@ recoveryPausesHere(void) ereport(LOG, (errmsg("recovery has paused"), - errhint("Execute pg_xlog_replay_resume() to continue."))); + errhint("Execute pg_wal_replay_resume() to continue."))); while (RecoveryIsPaused()) { @@ -5820,7 +6097,8 @@ recoveryApplyDelay(XLogReaderState *record) WaitLatch(&XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - secs * 1000L + microsecs / 1000); + secs * 1000L + microsecs / 1000, + 
WAIT_EVENT_RECOVERY_APPLY_DELAY); } return true; } @@ -6048,7 +6326,7 @@ StartupXLOG(void) #endif /* - * Verify that pg_xlog and pg_xlog/archive_status exist. In cases where + * Verify that pg_wal and pg_wal/archive_status exist. In cases where * someone has performed a copy for PITR, these directories may have been * excluded and need to be re-created. */ @@ -6113,6 +6391,11 @@ StartupXLOG(void) ereport(LOG, (errmsg("starting point-in-time recovery to \"%s\"", recoveryTargetName))); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + ereport(LOG, + (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", + (uint32) (recoveryTargetLSN >> 32), + (uint32) recoveryTargetLSN))); else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); @@ -6135,9 +6418,16 @@ StartupXLOG(void) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"), - errdetail("Failed while allocating an XLog reading processor."))); + errdetail("Failed while allocating a WAL reading processor."))); xlogreader->system_identifier = ControlFile->system_identifier; + /* + * Allocate pages dedicated to WAL consistency checks, those had better be + * aligned. + */ + replay_image_masked = (char *) palloc(BLCKSZ); + master_image_masked = (char *) palloc(BLCKSZ); + if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)) { @@ -6160,7 +6450,7 @@ StartupXLOG(void) if (record != NULL) { memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); - wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); ereport(DEBUG1, (errmsg("checkpoint record is at %X/%X", (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc))); @@ -6262,7 +6552,7 @@ StartupXLOG(void) * and put it into archive recovery by creating a recovery.conf file. * * Our strategy in that case is to perform crash recovery first, - * replaying all the WAL present in pg_xlog, and only enter archive + * replaying all the WAL present in pg_wal, and only enter archive * recovery after that. * * But usually we already know how far we need to replay the WAL (up @@ -6318,7 +6608,7 @@ StartupXLOG(void) (errmsg("could not locate a valid checkpoint record"))); } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); - wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); } /* @@ -6409,8 +6699,9 @@ StartupXLOG(void) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; @@ -6466,18 +6757,28 @@ StartupXLOG(void) /* * Copy any missing timeline history files between 'now' and the recovery - * target timeline from archive to pg_xlog. While we don't need those - * files ourselves - the history file of the recovery target timeline - * covers all the previous timelines in the history too - a cascading - * standby server might be interested in them. 
Or, if you archive the WAL
- from this server to a different archive than the master, it'd be good
- for all the history files to get archived there after failover, so that
- you can use one of the old timelines as a PITR target. Timeline history
- files are small, so it's better to copy them unnecessarily than not
- copy them and regret later.
+ * target timeline from archive to pg_wal. While we don't need those files
+ * ourselves - the history file of the recovery target timeline covers all
+ * the previous timelines in the history too - a cascading standby server
+ * might be interested in them. Or, if you archive the WAL from this
+ * server to a different archive than the master, it'd be good for all the
+ * history files to get archived there after failover, so that you can use
+ * one of the old timelines as a PITR target. Timeline history files are
+ * small, so it's better to copy them unnecessarily than not copy them and
+ * regret later.
 */
 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);

+ /*
+ * Before running in recovery, scan pg_twophase and fill in its status so
+ * that we can work on entries generated by redo. Doing a scan before
+ * taking any recovery action has the merit of discarding any 2PC files
+ * that are newer than the first record to replay, avoiding conflicts at
+ * replay. This also avoids any subsequent scans when doing recovery of
+ * the on-disk two-phase data.
+ */
+ restoreTwoPhaseData();
+
 lastFullPageWrites = checkPoint.fullPageWrites;

 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
@@ -6698,7 +6999,7 @@ StartupXLOG(void)

 ProcArrayApplyRecoveryInfo(&running);

- StandbyRecoverPreparedTransactions(false);
+ StandbyRecoverPreparedTransactions();
 }
 }

@@ -6938,6 +7239,15 @@ StartupXLOG(void)
 /* Now apply the WAL record itself */
 RmgrTable[record->xl_rmid].rm_redo(xlogreader);

+ /*
+ * After redo, check whether the backup pages associated with
+ * the WAL record are consistent with the existing pages. This
+ * check is done only if consistency checking is enabled for
+ * this record.
+ */
+ if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
+ checkXLogConsistency(xlogreader);
+
 /* Pop the error context stack */
 error_context_stack = errcallback.previous;

@@ -7087,7 +7397,7 @@ StartupXLOG(void)
 /*
 * We are now done reading the xlog from stream. Turn off streaming
 * recovery to force fetching the files (which would be required at end of
- * recovery, e.g., timeline history file) from archive or pg_xlog.
+ * recovery, e.g., timeline history file) from archive or pg_wal.
 */
 StandbyMode = false;

@@ -7182,6 +7492,12 @@ StartupXLOG(void)
 "%s %s\n",
 recoveryStopAfter ? "after" : "before",
 timestamptz_to_str(recoveryStopTime));
+ else if (recoveryTarget == RECOVERY_TARGET_LSN)
+ snprintf(reason, sizeof(reason),
+ "%s LSN %X/%X\n",
+ recoveryStopAfter ? "after" : "before",
+ (uint32) (recoveryStopLSN >> 32),
+ (uint32) recoveryStopLSN);
 else if (recoveryTarget == RECOVERY_TARGET_NAME)
 snprintf(reason, sizeof(reason),
 "at restore point \"%s\"",
@@ -7216,7 +7532,7 @@ StartupXLOG(void)
 exitArchiveRecovery(EndOfLogTLI, EndOfLog);

 /*
- * Prepare to write WAL starting at EndOfLog position, and init xlog
+ * Prepare to write WAL starting at EndOfLog location, and init xlog
 * buffer cache using the block containing the last record from the
 * previous incarnation.
 */
@@ -7376,7 +7692,7 @@ StartupXLOG(void)
 * As a compromise, we rename the last segment with the .partial
 * suffix, and archive it.
Archive recovery will never try to read
 * .partial segments, so they will normally go unused. But in the odd
- * PITR case, the administrator can copy them manually to the pg_xlog
+ * PITR case, the administrator can copy them manually to the pg_wal
 * directory (removing the suffix). They can be useful in debugging,
 * too.
 *
@@ -7426,14 +7742,9 @@ StartupXLOG(void)
 */
 InRecovery = false;

- LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- ControlFile->state = DB_IN_PRODUCTION;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
- LWLockRelease(ControlFileLock);
-
- /* start the archive_timeout timer running */
+ /* start the archive_timeout timer and last-switch LSN tracking */
 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchLSN = EndOfLog;

 /* also initialize latestCompletedXid, to nextXid - 1 */
 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
@@ -7489,15 +7800,32 @@ StartupXLOG(void)
 CompleteCommitTsInitialization();

 /*
- * All done. Allow backends to write WAL. (Although the bool flag is
- * probably atomic in itself, we use the info_lck here to ensure that
- * there are no race conditions concerning visibility of other recent
- * updates to shared memory.)
+ * All done with end-of-recovery actions.
+ *
+ * Now allow backends to write WAL and update the control file status
+ * accordingly. The boolean flag allowing backends to write WAL is
+ * updated while holding ControlFileLock, to prevent other backends from
+ * looking at an inconsistent state of the control file in shared memory.
+ * There is still a small window during which backends can write WAL
+ * while the on-disk control file still shows a state other than
+ * DB_IN_PRODUCTION.
+ *
+ * Also, although the boolean flag to allow WAL is probably atomic in
+ * itself, we use the info_lck here to ensure that there are no race
+ * conditions concerning visibility of other recent updates to shared
+ * memory.
 */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_IN_PRODUCTION;
+ ControlFile->time = (pg_time_t) time(NULL);
+
 SpinLockAcquire(&XLogCtl->info_lck);
 XLogCtl->SharedRecoveryInProgress = false;
 SpinLockRelease(&XLogCtl->info_lck);

+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
 /*
 * If there were cascading standby servers connected to us, nudge any wal
 * sender processes to notice that we've been promoted.
@@ -7765,6 +8093,7 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 int whichChkpt, bool report)
 {
 XLogRecord *record;
+ uint8 info;

 if (!XRecOffIsValid(RecPtr))
 {
@@ -7832,8 +8161,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 }
 return NULL;
 }
- if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
- record->xl_info != XLOG_CHECKPOINT_ONLINE)
+ info = record->xl_info & ~XLR_INFO_MASK;
+ if (info != XLOG_CHECKPOINT_SHUTDOWN &&
+ info != XLOG_CHECKPOINT_ONLINE)
 {
 switch (whichChkpt)
 {
@@ -7976,16 +8306,51 @@ GetFlushRecPtr(void)
 }

 /*
- * Get the time of the last xlog segment switch
+ * GetLastImportantRecPtr -- Returns the LSN of the last important record
+ * inserted. All records not explicitly marked as unimportant are considered
+ * important.
+ *
+ * The LSN is determined by computing the maximum of
+ * WALInsertLocks[i].lastImportantAt.
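+ *
+ * (For example, CreateCheckPoint() compares this value against
+ * ControlFile->checkPoint to decide whether any WAL activity worth a
+ * checkpoint has happened since the last one; if not, the checkpoint is
+ * skipped entirely.)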
+ */ +XLogRecPtr +GetLastImportantRecPtr(void) +{ + XLogRecPtr res = InvalidXLogRecPtr; + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr last_important; + + /* + * Need to take a lock to prevent torn reads of the LSN, which are + * possible on some of the supported platforms. WAL insert locks only + * support exclusive mode, so we have to use that. + */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + last_important = WALInsertLocks[i].l.lastImportantAt; + LWLockRelease(&WALInsertLocks[i].l.lock); + + if (res < last_important) + res = last_important; + } + + return res; +} + +/* + * Get the time and LSN of the last xlog segment switch */ pg_time_t -GetLastSegSwitchTime(void) +GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) { pg_time_t result; /* Need WALWriteLock, but shared lock is sufficient */ LWLockAcquire(WALWriteLock, LW_SHARED); result = XLogCtl->lastSegSwitchTime; + *lastSwitchLSN = XLogCtl->lastSegSwitchLSN; LWLockRelease(WALWriteLock); return result; @@ -8036,6 +8401,12 @@ ShutdownXLOG(int code, Datum arg) ereport(IsPostmasterEnvironment ? LOG : NOTICE, (errmsg("shutting down"))); + /* + * Wait for WAL senders to be in stopping state. This prevents commands + * from writing new WAL. + */ + WalSndWaitStopping(); + if (RecoveryInProgress()) CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); else @@ -8047,7 +8418,7 @@ ShutdownXLOG(int code, Datum arg) * record will go to the next XLOG file and won't be archived (yet). */ if (XLogArchivingActive() && XLogArchiveCommandSet()) - RequestXLogSwitch(); + RequestXLogSwitch(false); CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } @@ -8137,7 +8508,7 @@ LogCheckpointEnd(bool restartpoint) average_usecs = average_sync_time - (uint64) average_secs *1000000; elog(LOG, "%s complete: wrote %d buffers (%.1f%%); " - "%d transaction log file(s) added, %d removed, %d recycled; " + "%d WAL file(s) added, %d removed, %d recycled; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB", @@ -8235,7 +8606,7 @@ CreateCheckPoint(int flags) uint32 freespace; XLogRecPtr PriorRedoPtr; XLogRecPtr curInsert; - XLogRecPtr prevPtr; + XLogRecPtr last_important_lsn; VirtualTransactionId *vxids; int nvxids; @@ -8316,38 +8687,33 @@ CreateCheckPoint(int flags) checkPoint.oldestActiveXid = InvalidTransactionId; /* + * Get location of last important record before acquiring insert locks (as + * GetLastImportantRecPtr() also locks WAL locks). + */ + last_important_lsn = GetLastImportantRecPtr(); + + /* * We must block concurrent insertions while examining insert state to * determine the checkpoint REDO pointer. */ WALInsertLockAcquireExclusive(); curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); - prevPtr = XLogBytePosToRecPtr(Insert->PrevBytePos); /* - * If this isn't a shutdown or forced checkpoint, and we have not inserted - * any XLOG records since the start of the last checkpoint, skip the - * checkpoint. The idea here is to avoid inserting duplicate checkpoints - * when the system is idle. That wastes log space, and more importantly it - * exposes us to possible loss of both current and previous checkpoint - * records if the machine crashes just as we're writing the update. - * (Perhaps it'd make even more sense to checkpoint only when the previous - * checkpoint record is in a different xlog page?) 
- * - * If the previous checkpoint crossed a WAL segment, however, we create - * the checkpoint anyway, to have the latest checkpoint fully contained in - * the new segment. This is for a little bit of extra robustness: it's - * better if you don't need to keep two WAL segments around to recover the - * checkpoint. + * If this isn't a shutdown or forced checkpoint, and if there has been no + * WAL activity requiring a checkpoint, skip it. The idea here is to + * avoid inserting duplicate checkpoints when the system is idle. */ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) { - if (prevPtr == ControlFile->checkPointCopy.redo && - prevPtr / XLOG_SEG_SIZE == curInsert / XLOG_SEG_SIZE) + if (last_important_lsn == ControlFile->checkPoint) { WALInsertLockRelease(); LWLockRelease(CheckpointLock); END_CRIT_SECTION(); + ereport(DEBUG1, + (errmsg("checkpoint skipped due to an idle system"))); return; } } @@ -8422,6 +8788,11 @@ CreateCheckPoint(int flags) /* * Get the other info we need for the checkpoint record. + * + * We don't need to save oldestClogXid in the checkpoint, it only matters + * for the short period in which clog is being truncated, and if we crash + * during that we'll redo the clog truncation and fix up oldestClogXid + * there. */ LWLockAcquire(XidGenLock, LW_SHARED); checkPoint.nextXid = ShmemVariableCache->nextXid; @@ -8471,7 +8842,7 @@ CreateCheckPoint(int flags) * that are currently in commit critical sections. If an xact inserted * its commit record into XLOG just before the REDO point, then a crash * restart from the REDO point would not replay that record, which means - * that our flushing had better include the xact's update of pg_clog. So + * that our flushing had better include the xact's update of pg_xact. So * we wait till he's out of his commit critical section before proceeding. * See notes in RecordTransactionCommit(). * @@ -8547,7 +8918,7 @@ CreateCheckPoint(int flags) */ if (shutdown && checkPoint.redo != ProcLastRecPtr) ereport(PANIC, - (errmsg("concurrent transaction log activity while database system is shutting down"))); + (errmsg("concurrent write-ahead log activity while database system is shutting down"))); /* * Remember the prior checkpoint's redo pointer, used later to determine @@ -8630,7 +9001,7 @@ CreateCheckPoint(int flags) * StartupSUBTRANS hasn't been called yet. */ if (!RecoveryInProgress()) - TruncateSUBTRANS(GetOldestXmin(NULL, false)); + TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -8756,6 +9127,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint) */ SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->lastCheckPointRecPtr = ReadRecPtr; + XLogCtl->lastCheckPointEndPtr = EndRecPtr; XLogCtl->lastCheckPoint = *checkPoint; SpinLockRelease(&XLogCtl->info_lck); } @@ -8775,6 +9147,7 @@ bool CreateRestartPoint(int flags) { XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; CheckPoint lastCheckPoint; XLogRecPtr PriorRedoPtr; TimestampTz xtime; @@ -8788,6 +9161,7 @@ CreateRestartPoint(int flags) /* Get a local copy of the last safe checkpoint record. 
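 * (Copied under info_lck so that the record pointer, the end pointer and
 * the checkpoint data are read as one consistent snapshot.)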
*/ SpinLockAcquire(&XLogCtl->info_lck); lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr; + lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr; lastCheckPoint = XLogCtl->lastCheckPoint; SpinLockRelease(&XLogCtl->info_lck); @@ -8891,6 +9265,27 @@ CreateRestartPoint(int flags) ControlFile->checkPoint = lastCheckPointRecPtr; ControlFile->checkPointCopy = lastCheckPoint; ControlFile->time = (pg_time_t) time(NULL); + + /* + * Ensure minRecoveryPoint is past the checkpoint record. Normally, + * this will have happened already while writing out dirty buffers, + * but not necessarily - e.g. because no buffers were dirtied. We do + * this because a non-exclusive base backup uses minRecoveryPoint to + * determine which WAL files must be included in the backup, and the + * file (or files) containing the checkpoint record must be included, + * at a minimum. Note that for an ordinary restart of recovery there's + * no value in having the minimum recovery point any earlier than this + * anyway, because redo will begin just after the checkpoint record. + */ + if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr) + { + ControlFile->minRecoveryPoint = lastCheckPointEndPtr; + ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID; + + /* update local copy */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; UpdateControlFile(); @@ -8969,7 +9364,7 @@ CreateRestartPoint(int flags) * this because StartupSUBTRANS hasn't been called yet. */ if (EnableHotStandby) - TruncateSUBTRANS(GetOldestXmin(NULL, false)); + TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); /* Real work is done, but log and update before releasing lock. */ LogCheckpointEnd(true); @@ -9024,7 +9419,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) /* then check whether slots limit removal further */ if (max_replication_slots > 0 && keep != InvalidXLogRecPtr) { - XLogRecPtr slotSegNo; + XLogSegNo slotSegNo; XLByteToSeg(keep, slotSegNo); @@ -9080,12 +9475,15 @@ XLogPutNextOid(Oid nextOid) * write a switch record because we are already at segment start. */ XLogRecPtr -RequestXLogSwitch(void) +RequestXLogSwitch(bool mark_unimportant) { XLogRecPtr RecPtr; /* XLOG SWITCH has no data */ XLogBeginInsert(); + + if (mark_unimportant) + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH); return RecPtr; @@ -9324,6 +9722,11 @@ xlog_redo(XLogReaderState *record) MultiXactAdvanceOldest(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + + /* + * No need to set oldestClogXid here as well; it'll be set when we + * redo an xl_clog_truncate if it changed since initialization. 
+ */ SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); /* @@ -9372,7 +9775,7 @@ xlog_redo(XLogReaderState *record) ProcArrayApplyRecoveryInfo(&running); - StandbyRecoverPreparedTransactions(true); + StandbyRecoverPreparedTransactions(); } /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ @@ -9733,11 +10136,13 @@ assign_xlog_sync_method(int new_sync_method, void *extra) */ if (openLogFile >= 0) { + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN); if (pg_fsync(openLogFile) != 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync log segment %s: %m", XLogFileNameP(ThisTimeLineID, openLogSegNo)))); + pgstat_report_wait_end(); if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) XLogFileClose(); } @@ -9832,7 +10237,7 @@ XLogFileNameP(TimeLineID tli, XLogSegNo segno) * when backup needs to generate tablespace_map file, it is used to * embed escape character before newline character in tablespace path. * - * Returns the minimum WAL position that must be present to restore from this + * Returns the minimum WAL location that must be present to restore from this * backup, and the corresponding timeline ID in *starttli_p. * * Every successfully started non-exclusive backup must be stopped by calling @@ -9910,7 +10315,12 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, WALInsertLockAcquireExclusive(); if (exclusive) { - if (XLogCtl->Insert.exclusiveBackup) + /* + * At first, mark that we're now starting an exclusive backup, to + * ensure that there are no other sessions currently running + * pg_start_backup() or pg_stop_backup(). + */ + if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE) { WALInsertLockRelease(); ereport(ERROR, @@ -9918,7 +10328,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, errmsg("a backup is already in progress"), errhint("Run pg_stop_backup() and try again."))); } - XLogCtl->Insert.exclusiveBackup = true; + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING; } else XLogCtl->Insert.nonExclusiveBackups++; @@ -9941,7 +10351,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, * first WAL segment containing the startup checkpoint has pages in * the beginning with the old timeline ID. That can cause trouble at * recovery: we won't have a history file covering the old timeline if - * pg_xlog directory was not included in the base backup and the WAL + * pg_wal directory was not included in the base backup and the WAL * archive was cleared too before starting the backup. * * This also ensures that we have emitted a WAL page header that has @@ -9955,7 +10365,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, * recovery case described above. */ if (!backup_started_in_recovery) - RequestXLogSwitch(); + RequestXLogSwitch(false); do { @@ -10062,7 +10472,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, /* Collect information about all tablespaces */ while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) { - char fullpath[MAXPGPATH]; + char fullpath[MAXPGPATH + 10]; char linkpath[MAXPGPATH]; char *relpath = NULL; int rllen; @@ -10173,8 +10583,9 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, { /* * Check for existing backup label --- implies a backup is already - * running. (XXX given that we checked exclusiveBackup above, - * maybe it would be OK to just unlink any such label file?) 
+ * running. (XXX given that we checked exclusiveBackupState + * above, maybe it would be OK to just unlink any such label + * file?) */ if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0) { @@ -10255,6 +10666,20 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive)); /* + * Mark that start phase has correctly finished for an exclusive backup. + * Session-level locks are updated as well to reflect that state. + */ + if (exclusive) + { + WALInsertLockAcquireExclusive(); + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS; + WALInsertLockRelease(); + sessionBackupState = SESSION_BACKUP_EXCLUSIVE; + } + else + sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE; + + /* * We're done. As a convenience, return the starting WAL location. */ if (starttli_p) @@ -10272,8 +10697,8 @@ pg_start_backup_callback(int code, Datum arg) WALInsertLockAcquireExclusive(); if (exclusive) { - Assert(XLogCtl->Insert.exclusiveBackup); - XLogCtl->Insert.exclusiveBackup = false; + Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING); + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE; } else { @@ -10281,7 +10706,7 @@ pg_start_backup_callback(int code, Datum arg) XLogCtl->Insert.nonExclusiveBackups--; } - if (!XLogCtl->Insert.exclusiveBackup && + if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && XLogCtl->Insert.nonExclusiveBackups == 0) { XLogCtl->Insert.forcePageWrites = false; @@ -10290,13 +10715,40 @@ pg_start_backup_callback(int code, Datum arg) } /* + * Error cleanup callback for pg_stop_backup + */ +static void +pg_stop_backup_callback(int code, Datum arg) +{ + bool exclusive = DatumGetBool(arg); + + /* Update backup status on failure */ + WALInsertLockAcquireExclusive(); + if (exclusive) + { + Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING); + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS; + } + WALInsertLockRelease(); +} + +/* + * Utility routine to fetch the session-level status of a backup running. + */ +SessionBackupState +get_backup_status(void) +{ + return sessionBackupState; +} + +/* * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup() * function. - + * * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops * the non-exclusive backup specified by 'labelfile'. * - * Returns the last WAL position that must be present to restore from this + * Returns the last WAL location that must be present to restore from this * backup, and the corresponding timeline ID in *stoptli_p. * * It is the responsibility of the caller of this function to verify the @@ -10351,20 +10803,87 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) errmsg("WAL level not sufficient for making an online backup"), errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); - /* - * OK to update backup counters and forcePageWrites - */ - WALInsertLockAcquireExclusive(); if (exclusive) { - if (!XLogCtl->Insert.exclusiveBackup) + /* + * At first, mark that we're now stopping an exclusive backup, to + * ensure that there are no other sessions currently running + * pg_start_backup() or pg_stop_backup(). 
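
The exclusiveBackupState field that replaces the old boolean is a four-state machine: NONE to STARTING to IN_PROGRESS to STOPPING and back to NONE, with STOPPING falling back to IN_PROGRESS on error. A standalone sketch of just those transitions, assuming the caller provides the mutual exclusion that WALInsertLockAcquireExclusive() provides in the server; only the enum names mirror the patch.

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum
    {
        EXCLUSIVE_BACKUP_NONE,
        EXCLUSIVE_BACKUP_STARTING,
        EXCLUSIVE_BACKUP_IN_PROGRESS,
        EXCLUSIVE_BACKUP_STOPPING
    } ExclusiveBackupState;

    static ExclusiveBackupState state = EXCLUSIVE_BACKUP_NONE;

    /* pg_start_backup: refuse to start unless no backup is running at all */
    static bool
    start_backup_begin(void)
    {
        if (state != EXCLUSIVE_BACKUP_NONE)
            return false;           /* "a backup is already in progress" */
        state = EXCLUSIVE_BACKUP_STARTING;
        return true;
    }

    /* ...and mark the start phase finished once backup_label is written */
    static void
    start_backup_end(void)
    {
        state = EXCLUSIVE_BACKUP_IN_PROGRESS;
    }

    /* pg_stop_backup: refuse unless the start phase completed */
    static bool
    stop_backup_begin(void)
    {
        if (state != EXCLUSIVE_BACKUP_IN_PROGRESS)
            return false;           /* "exclusive backup not in progress" */
        state = EXCLUSIVE_BACKUP_STOPPING;
        return true;
    }

    /* error path, cf. pg_stop_backup_callback(): back to in-progress */
    static void
    stop_backup_abort(void)
    {
        state = EXCLUSIVE_BACKUP_IN_PROGRESS;
    }

    /* success path: backup_label removed, no backup running anymore */
    static void
    stop_backup_end(void)
    {
        state = EXCLUSIVE_BACKUP_NONE;
    }

    int
    main(void)
    {
        if (start_backup_begin())
            start_backup_end();
        if (!start_backup_begin())
            puts("second pg_start_backup correctly rejected");
        if (stop_backup_begin())
            stop_backup_end();
        (void) stop_backup_abort;   /* unused on the happy path */
        return 0;
    }
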
+ */ + WALInsertLockAcquireExclusive(); + if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS) { WALInsertLockRelease(); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("exclusive backup not in progress"))); } - XLogCtl->Insert.exclusiveBackup = false; + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING; + WALInsertLockRelease(); + + /* + * Remove backup_label. In case of failure, the state for an exclusive + * backup is switched back to in-progress. + */ + PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive)); + { + /* + * Read the existing label file into memory. + */ + struct stat statbuf; + int r; + + if (stat(BACKUP_LABEL_FILE, &statbuf)) + { + /* should not happen per the upper checks */ + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + BACKUP_LABEL_FILE))); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is not in progress"))); + } + + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + } + labelfile = palloc(statbuf.st_size + 1); + r = fread(labelfile, statbuf.st_size, 1, lfp); + labelfile[statbuf.st_size] = '\0'; + + /* + * Close and remove the backup label file + */ + if (r != 1 || ferror(lfp) || FreeFile(lfp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + durable_unlink(BACKUP_LABEL_FILE, ERROR); + + /* + * Remove tablespace_map file if present, it is created only if + * there are tablespaces. + */ + durable_unlink(TABLESPACE_MAP, DEBUG1); + } + PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive)); + } + + /* + * OK to update backup counters and forcePageWrites + */ + WALInsertLockAcquireExclusive(); + if (exclusive) + { + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE; } else { @@ -10378,65 +10897,15 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) XLogCtl->Insert.nonExclusiveBackups--; } - if (!XLogCtl->Insert.exclusiveBackup && + if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && XLogCtl->Insert.nonExclusiveBackups == 0) { XLogCtl->Insert.forcePageWrites = false; } WALInsertLockRelease(); - if (exclusive) - { - /* - * Read the existing label file into memory. 
- */ - struct stat statbuf; - int r; - - if (stat(BACKUP_LABEL_FILE, &statbuf)) - { - if (errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", - BACKUP_LABEL_FILE))); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("a backup is not in progress"))); - } - - lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); - if (!lfp) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - BACKUP_LABEL_FILE))); - } - labelfile = palloc(statbuf.st_size + 1); - r = fread(labelfile, statbuf.st_size, 1, lfp); - labelfile[statbuf.st_size] = '\0'; - - /* - * Close and remove the backup label file - */ - if (r != 1 || ferror(lfp) || FreeFile(lfp)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - BACKUP_LABEL_FILE))); - if (unlink(BACKUP_LABEL_FILE) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", - BACKUP_LABEL_FILE))); - - /* - * Remove tablespace_map file if present, it is created only if there - * are tablespaces. - */ - unlink(TABLESPACE_MAP); - } + /* Clean up session-level lock */ + sessionBackupState = SESSION_BACKUP_NONE; /* * Read and parse the START WAL LOCATION line (this code is pretty crude, @@ -10540,7 +11009,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) * Force a switch to a new xlog segment file, so that the backup is valid * as soon as archiver moves out the current segment file. */ - RequestXLogSwitch(); + RequestXLogSwitch(false); XLByteToPrevSeg(stoppoint, _logSegNo); XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo); @@ -10588,9 +11057,9 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) * archived before returning. If archiving isn't enabled, the required WAL * needs to be transported via streaming replication (hopefully with * wal_keep_segments set high enough), or some more exotic mechanism like - * polling and copying files from pg_xlog with script. We have no - * knowledge of those mechanisms, so it's up to the user to ensure that he - * gets all the required WAL. + * polling and copying files from pg_wal with script. We have no knowledge + * of those mechanisms, so it's up to the user to ensure that he gets all + * the required WAL. * * We wait until both the last WAL file filled during backup and the * history file have been archived, and assume that the alphabetic sorting @@ -10599,8 +11068,9 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) * * We wait forever, since archive_command is supposed to work and we * assume the admin wanted his backup to work completely. If you don't - * wish to wait, you can set statement_timeout. Also, some notices are - * issued to clue in anyone who might be doing this interactively. + * wish to wait, then either waitforarchive should be passed in as false, + * or you can set statement_timeout. Also, some notices are issued to + * clue in anyone who might be doing this interactively. 
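
The wait-for-archive policy described above (wait indefinitely, but keep an interactive user informed) reduces to a simple polling loop. In this sketch the archived-yet test is a deliberately fake stub; the server's real check inspects archive status files, which is not modeled here.

    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    /* stub: simulate an archiver that needs a few polls to catch up */
    static bool
    wal_file_archived(const char *fname)
    {
        static int polls = 0;

        (void) fname;
        return ++polls > 3;
    }

    /* poll until the file is archived; warn periodically rather than fail */
    static void
    wait_for_archive(const char *fname)
    {
        int waited = 0;

        while (!wal_file_archived(fname))
        {
            sleep(1);
            if (++waited % 60 == 0)
                fprintf(stderr,
                        "still waiting for %s to be archived (%d seconds elapsed)\n",
                        fname, waited);
        }
        printf("%s archived\n", fname);
    }

    int
    main(void)
    {
        wait_for_archive("000000010000000000000001");
        return 0;
    }
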
*/ if (waitforarchive && XLogArchivingActive()) { @@ -10674,7 +11144,7 @@ do_pg_abort_backup(void) Assert(XLogCtl->Insert.nonExclusiveBackups > 0); XLogCtl->Insert.nonExclusiveBackups--; - if (!XLogCtl->Insert.exclusiveBackup && + if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && XLogCtl->Insert.nonExclusiveBackups == 0) { XLogCtl->Insert.forcePageWrites = false; @@ -10929,8 +11399,8 @@ rm_redo_error_callback(void *arg) initStringInfo(&buf); xlog_outdesc(&buf, record); - /* translator: %s is an XLog record description */ - errcontext("xlog redo at %X/%X for %s", + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", (uint32) (record->ReadRecPtr >> 32), (uint32) record->ReadRecPtr, buf.data); @@ -11139,10 +11609,12 @@ retry: goto next_record_is_invalid; } + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ) { char fname[MAXFNAMELEN]; + pgstat_report_wait_end(); XLogFileName(fname, curFileTLI, readSegNo); ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode_for_file_access(), @@ -11150,6 +11622,7 @@ retry: fname, readOff))); goto next_record_is_invalid; } + pgstat_report_wait_end(); Assert(targetSegNo == readSegNo); Assert(targetPageOff == readOff); @@ -11175,12 +11648,12 @@ next_record_is_invalid: } /* - * Open the WAL segment containing WAL position 'RecPtr'. + * Open the WAL segment containing WAL location 'RecPtr'. * * The segment can be fetched via restore_command, or via walreceiver having - * streamed the record, or it can already be present in pg_xlog. Checking - * pg_xlog is mainly for crash recovery, but it will be polled in standby mode - * too, in case someone copies a new segment directly to pg_xlog. That is not + * streamed the record, or it can already be present in pg_wal. Checking + * pg_wal is mainly for crash recovery, but it will be polled in standby mode + * too, in case someone copies a new segment directly to pg_wal. That is not * documented or recommended, though. * * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should @@ -11206,12 +11679,13 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, { static TimestampTz last_fail_time = 0; TimestampTz now; + bool streaming_reply_sent = false; /*------- * Standby mode is implemented by a state machine: * - * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just - * pg_xlog (XLOG_FROM_XLOG) + * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just + * pg_wal (XLOG_FROM_PG_WAL) * 2. Check trigger file * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) * 4. Rescan timelines @@ -11227,7 +11701,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, *------- */ if (!InArchiveRecovery) - currentSource = XLOG_FROM_PG_XLOG; + currentSource = XLOG_FROM_PG_WAL; else if (currentSource == 0) currentSource = XLOG_FROM_ARCHIVE; @@ -11246,13 +11720,13 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, switch (currentSource) { case XLOG_FROM_ARCHIVE: - case XLOG_FROM_PG_XLOG: + case XLOG_FROM_PG_WAL: /* * Check to see if the trigger file exists. Note that we * do this only after failure, so when you create the * trigger file, we still finish replaying as much as we - * can from archive and pg_xlog before failover. + * can from archive and pg_wal before failover. 
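
The four-step state machine in the comment above can be condensed into a single transition function. A deliberately simplified standalone sketch; the real loop also handles trigger-file checks, timeline rescans, and failure pacing, none of which is modeled here.

    #include <stdbool.h>
    #include <stdio.h>

    /* WAL sources, using the patch's renamed XLOG_FROM_PG_WAL */
    typedef enum
    {
        XLOG_FROM_ARCHIVE,
        XLOG_FROM_PG_WAL,
        XLOG_FROM_STREAM
    } WALSource;

    /*
     * Choose the next source to try. On failure, archive/pg_wal fall
     * through to streaming and streaming falls back to the archive; after
     * a successful read from pg_wal we go back to preferring the archive
     * for the next file.
     */
    static WALSource
    next_source(WALSource cur, bool failed, bool in_archive_recovery)
    {
        if (failed)
            return (cur == XLOG_FROM_STREAM) ?
                XLOG_FROM_ARCHIVE : XLOG_FROM_STREAM;

        if (cur == XLOG_FROM_PG_WAL && in_archive_recovery)
            return XLOG_FROM_ARCHIVE;

        return cur;
    }

    int
    main(void)
    {
        WALSource s = XLOG_FROM_ARCHIVE;

        s = next_source(s, true, true);  /* archive and pg_wal exhausted */
        printf("now trying source %d (stream)\n", (int) s);
        s = next_source(s, true, true);  /* stream dropped: back to archive */
        printf("now trying source %d (archive)\n", (int) s);
        return 0;
    }
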
*/ if (StandbyMode && CheckForStandbyTrigger()) { @@ -11262,7 +11736,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, /* * Not in standby mode, and we've now tried the archive - * and pg_xlog. + * and pg_wal. */ if (!StandbyMode) return false; @@ -11322,8 +11796,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * little chance that the problem will just go away, but * PANIC is not good for availability either, especially * in hot standby mode. So, we treat that the same as - * disconnection, and retry from archive/pg_xlog again. - * The WAL in the archive should be identical to what was + * disconnection, and retry from archive/pg_wal again. The + * WAL in the archive should be identical to what was * streamed, so it's unlikely that it helps, but one can * hope... */ @@ -11371,7 +11845,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, WaitLatch(&XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - wait_time); + wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM); ResetLatch(&XLogCtl->recoveryWakeupLatch); now = GetCurrentTimestamp(); } @@ -11383,11 +11857,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, elog(ERROR, "unexpected WAL source %d", currentSource); } } - else if (currentSource == XLOG_FROM_PG_XLOG) + else if (currentSource == XLOG_FROM_PG_WAL) { /* - * We just successfully read a file in pg_xlog. We prefer files in - * the archive over ones in pg_xlog, so try the next file again + * We just successfully read a file in pg_wal. We prefer files in + * the archive over ones in pg_wal, so try the next file again * from the archive first. */ if (InArchiveRecovery) @@ -11408,7 +11882,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, switch (currentSource) { case XLOG_FROM_ARCHIVE: - case XLOG_FROM_PG_XLOG: + case XLOG_FROM_PG_WAL: /* Close any old file we might have open. */ if (readFile >= 0) { @@ -11421,7 +11895,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, /* * Try to restore the file from archive, or read an existing - * file from pg_xlog. + * file from pg_wal. */ readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : @@ -11430,7 +11904,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, return true; /* success! */ /* - * Nope, not found in archive or pg_xlog. + * Nope, not found in archive or pg_wal. */ lastSourceFailed = true; break; @@ -11486,9 +11960,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * not open already. Also read the timeline history * file if we haven't initialized timeline history * yet; it should be streamed over and present in - * pg_xlog by now. Use XLOG_FROM_STREAM so that - * source info is set correctly and XLogReceiptTime - * isn't changed. + * pg_wal by now. Use XLOG_FROM_STREAM so that source + * info is set correctly and XLogReceiptTime isn't + * changed. */ if (readFile < 0) { @@ -11518,10 +11992,10 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, /* * Note that we don't "return false" immediately here. * After being triggered, we still want to replay all - * the WAL that was already streamed. It's in pg_xlog + * the WAL that was already streamed. It's in pg_wal * now, so we just treat this as a failure, and the * state machine will move on to replay the streamed - * WAL from pg_xlog, and then recheck the trigger and + * WAL from pg_wal, and then recheck the trigger and * exit replay. 
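
The streaming_reply_sent logic above is a "report once before the first blocking wait" pattern. A standalone sketch with stubbed-out calls; force_reply() and wait_for_wal_ms() are placeholders standing in for WalRcvForceReply() and the latch wait, not server APIs.

    #include <stdbool.h>
    #include <stdio.h>

    /* placeholder: nudge the walreceiver to report the replay position */
    static void
    force_reply(void)
    {
        puts("reply forced");
    }

    /* placeholder for the latch wait; pretend WAL arrived immediately */
    static bool
    wait_for_wal_ms(long timeout)
    {
        (void) timeout;
        return true;
    }

    /*
     * One pass of "caught up, about to wait": send the reply exactly once
     * so pg_stat_replication upstream stops showing stale values.
     */
    static void
    wait_for_more_wal(bool *streaming_reply_sent)
    {
        if (!*streaming_reply_sent)
        {
            force_reply();
            *streaming_reply_sent = true;
        }
        /* 5s timeout so a trigger file is noticed promptly */
        (void) wait_for_wal_ms(5000L);
    }

    int
    main(void)
    {
        bool sent = false;

        wait_for_more_wal(&sent);   /* forces a reply */
        wait_for_more_wal(&sent);   /* does not repeat it */
        return 0;
    }
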
*/ lastSourceFailed = true; @@ -11529,12 +12003,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, } /* + * Since we have replayed everything we have received so + * far and are about to start waiting for more WAL, let's + * tell the upstream server our replay location now so + * that pg_stat_replication doesn't show stale + * information. + */ + if (!streaming_reply_sent) + { + WalRcvForceReply(); + streaming_reply_sent = true; + } + + /* * Wait for more WAL to arrive. Time out after 5 seconds * to react to a trigger file promptly. */ WaitLatch(&XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - 5000L); + 5000L, WAIT_EVENT_RECOVERY_WAL_ALL); ResetLatch(&XLogCtl->recoveryWakeupLatch); break; } @@ -11561,7 +12048,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * or legitimate end-of-WAL situation. Generally, we use it as-is, but if * we're retrying the exact same record that we've tried previously, only * complain the first time to keep the noise down. However, we only do when - * reading from pg_xlog, because we don't expect any invalid records in archive + * reading from pg_wal, because we don't expect any invalid records in archive * or in records streamed from master. Files in the archive should be complete, * and we should never hit the end of WAL because we stop and wait for more WAL * to arrive before replaying it. @@ -11576,7 +12063,7 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) { static XLogRecPtr lastComplaint = 0; - if (readSource == XLOG_FROM_PG_XLOG && emode == LOG) + if (readSource == XLOG_FROM_PG_WAL && emode == LOG) { if (RecPtr == lastComplaint) emode = DEBUG1; diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index d153a44ea9..7afb73579b 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -4,7 +4,7 @@ * Functions for archiving WAL files and restoring from the archive. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/xlogarchive.c @@ -14,7 +14,6 @@ #include "postgres.h" -#include <sys/types.h> #include <sys/stat.h> #include <sys/wait.h> #include <signal.h> @@ -421,7 +420,7 @@ ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal) /* * A file was restored from the archive under a temporary filename (path), * and now we want to keep it. Rename it under the permanent filename in - * in pg_xlog (xlogfname), replacing any existing file with the same name. + * in pg_wal (xlogfname), replacing any existing file with the same name. */ void KeepFileRestoredFromArchive(char *path, char *xlogfname) diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 33383b4dcc..b3223d691d 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -2,12 +2,12 @@ * * xlogfuncs.c * - * PostgreSQL transaction log manager user interface functions + * PostgreSQL write-ahead log manager user interface functions * * This file contains WAL control and information functions. 
* * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/xlogfuncs.c @@ -18,7 +18,6 @@ #include "access/htup_details.h" #include "access/xlog.h" -#include "access/xlog_fn.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "catalog/catalog.h" @@ -43,8 +42,6 @@ */ static StringInfo label_file; static StringInfo tblspc_map_file; -static bool exclusive_backup_running = false; -static bool nonexclusive_backup_running = false; /* * Called when the backend exits with a running non-exclusive base backup, * @@ -73,16 +70,17 @@ nonexclusive_base_backup_cleanup(int code, Datum arg) Datum pg_start_backup(PG_FUNCTION_ARGS) { - text *backupid = PG_GETARG_TEXT_P(0); + text *backupid = PG_GETARG_TEXT_PP(0); bool fast = PG_GETARG_BOOL(1); bool exclusive = PG_GETARG_BOOL(2); char *backupidstr; XLogRecPtr startpoint; DIR *dir; + SessionBackupState status = get_backup_status(); backupidstr = text_to_cstring(backupid); - if (exclusive_backup_running || nonexclusive_backup_running) + if (status == SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is already in progress in this session"))); @@ -97,7 +95,6 @@ pg_start_backup(PG_FUNCTION_ARGS) { startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL, dir, NULL, NULL, false, true); - exclusive_backup_running = true; } else { @@ -114,7 +111,6 @@ pg_start_backup(PG_FUNCTION_ARGS) startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file, dir, NULL, tblspc_map_file, false, true); - nonexclusive_backup_running = true; before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0); } @@ -128,7 +124,7 @@ pg_start_backup(PG_FUNCTION_ARGS) * pg_stop_backup: finish taking an on-line backup dump * * We write an end-of-backup WAL record, and remove the backup label file - * created by pg_start_backup, creating a backup history file in pg_xlog + * created by pg_start_backup, creating a backup history file in pg_wal * instead (whence it will immediately be archived). The backup history file * contains the same info found in the label file, plus the backup-end time * and WAL location. Before 9.0, the backup-end time was read from the backup @@ -148,8 +144,9 @@ Datum pg_stop_backup(PG_FUNCTION_ARGS) { XLogRecPtr stoppoint; + SessionBackupState status = get_backup_status(); - if (nonexclusive_backup_running) + if (status == SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("non-exclusive backup in progress"), @@ -157,14 +154,13 @@ pg_stop_backup(PG_FUNCTION_ARGS) /* * Exclusive backups were typically started in a different connection, so - * don't try to verify that exclusive_backup_running is set in this one. - * Actual verification that an exclusive backup is in fact running is - * handled inside do_pg_stop_backup. + * don't try to verify that the backup status is set to + * SESSION_BACKUP_EXCLUSIVE in this function. Actual verification that an + * exclusive backup is in fact running is handled inside + * do_pg_stop_backup. */ stoppoint = do_pg_stop_backup(NULL, true, NULL); - exclusive_backup_running = false; - PG_RETURN_LSN(stoppoint); } @@ -176,6 +172,13 @@ pg_stop_backup(PG_FUNCTION_ARGS) * the backup label and tablespace map files as text fields as part of the * resultset.
* + * The first parameter (variable 'exclusive') allows the user to tell us if + * this is an exclusive or a non-exclusive backup. + * + * The second parameter (variable 'waitforarchive'), which is optional, + * allows the user to choose whether they want to wait for the WAL to be + * archived or whether we should just return as soon as the WAL record is + * written. + * * Permission checking for this function is managed through the normal * GRANT system. */ @@ -191,7 +194,9 @@ pg_stop_backup_v2(PG_FUNCTION_ARGS) bool nulls[3]; bool exclusive = PG_GETARG_BOOL(0); + bool waitforarchive = PG_GETARG_BOOL(1); XLogRecPtr stoppoint; + SessionBackupState status = get_backup_status(); /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) @@ -223,7 +228,7 @@ if (exclusive) { - if (nonexclusive_backup_running) + if (status == SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("non-exclusive backup in progress"), @@ -233,15 +238,14 @@ * Stop the exclusive backup, and since we're in an exclusive backup * return NULL for both backup_label and tablespace_map. */ - stoppoint = do_pg_stop_backup(NULL, true, NULL); - exclusive_backup_running = false; + stoppoint = do_pg_stop_backup(NULL, waitforarchive, NULL); nulls[1] = true; nulls[2] = true; } else { - if (!nonexclusive_backup_running) + if (status != SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("non-exclusive backup is not in progress"), @@ -251,8 +255,7 @@ * Stop the non-exclusive backup. Return a copy of the backup label * and tablespace map so they can be written to disk by the caller. */ - stoppoint = do_pg_stop_backup(label_file->data, true, NULL); - nonexclusive_backup_running = false; + stoppoint = do_pg_stop_backup(label_file->data, waitforarchive, NULL); cancel_before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0); values[1] = CStringGetTextDatum(label_file->data); @@ -277,13 +280,13 @@ } /* - * pg_switch_xlog: switch to next xlog file + * pg_switch_wal: switch to next xlog file * * Permission checking for this function is managed through the normal * GRANT system. */ Datum -pg_switch_xlog(PG_FUNCTION_ARGS) +pg_switch_wal(PG_FUNCTION_ARGS) { XLogRecPtr switchpoint; @@ -293,7 +296,7 @@ errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); - switchpoint = RequestXLogSwitch(); + switchpoint = RequestXLogSwitch(false); /* * As a convenience, return the WAL location of the switch record @@ -310,7 +313,7 @@ Datum pg_create_restore_point(PG_FUNCTION_ARGS) { - text *restore_name = PG_GETARG_TEXT_P(0); + text *restore_name = PG_GETARG_TEXT_PP(0); char *restore_name_str; XLogRecPtr restorepoint; @@ -349,7 +352,7 @@ * to the kernel, but is not necessarily synced to disk. */ Datum -pg_current_xlog_location(PG_FUNCTION_ARGS) +pg_current_wal_lsn(PG_FUNCTION_ARGS) { XLogRecPtr current_recptr; @@ -370,7 +373,7 @@ * This function is mostly for debugging purposes.
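
Seen from a client, the new optional argument means pg_stop_backup can now return as soon as the stop record is written. A minimal libpq sketch (build with -lpq), assuming the two-boolean SQL signature this patch introduces, a server that has it, and a non-exclusive backup started earlier in the same session; the connection string and the resultset column names are assumptions, not guaranteed by this diff alone.

    #include <stdio.h>
    #include <stdlib.h>
    #include <libpq-fe.h>

    int
    main(void)
    {
        /* connection parameters are illustrative */
        PGconn     *conn = PQconnectdb("dbname=postgres");
        PGresult   *res;

        if (PQstatus(conn) != CONNECTION_OK)
        {
            fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
            PQfinish(conn);
            return EXIT_FAILURE;
        }

        /*
         * Stop a non-exclusive backup without waiting for WAL archiving:
         * first argument (exclusive) = false, second (wait for archive)
         * = false.
         */
        res = PQexec(conn,
                     "SELECT lsn, labelfile, spcmapfile "
                     "FROM pg_stop_backup(false, false)");
        if (PQresultStatus(res) == PGRES_TUPLES_OK)
            printf("stop LSN: %s\n", PQgetvalue(res, 0, 0));
        else
            fprintf(stderr, "pg_stop_backup failed: %s",
                    PQerrorMessage(conn));

        PQclear(res);
        PQfinish(conn);
        return EXIT_SUCCESS;
    }
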
*/ Datum -pg_current_xlog_insert_location(PG_FUNCTION_ARGS) +pg_current_wal_insert_lsn(PG_FUNCTION_ARGS) { XLogRecPtr current_recptr; @@ -391,7 +394,7 @@ pg_current_xlog_insert_location(PG_FUNCTION_ARGS) * This function is mostly for debugging purposes. */ Datum -pg_current_xlog_flush_location(PG_FUNCTION_ARGS) +pg_current_wal_flush_lsn(PG_FUNCTION_ARGS) { XLogRecPtr current_recptr; @@ -413,7 +416,7 @@ pg_current_xlog_flush_location(PG_FUNCTION_ARGS) * and synced to disk by walreceiver. */ Datum -pg_last_xlog_receive_location(PG_FUNCTION_ARGS) +pg_last_wal_receive_lsn(PG_FUNCTION_ARGS) { XLogRecPtr recptr; @@ -432,7 +435,7 @@ pg_last_xlog_receive_location(PG_FUNCTION_ARGS) * connections during recovery. */ Datum -pg_last_xlog_replay_location(PG_FUNCTION_ARGS) +pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) { XLogRecPtr recptr; @@ -453,7 +456,7 @@ pg_last_xlog_replay_location(PG_FUNCTION_ARGS) * expected usage is to determine which xlog file(s) are ready to archive. */ Datum -pg_xlogfile_name_offset(PG_FUNCTION_ARGS) +pg_walfile_name_offset(PG_FUNCTION_ARGS) { XLogSegNo xlogsegno; uint32 xrecoff; @@ -469,7 +472,7 @@ pg_xlogfile_name_offset(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), - errhint("pg_xlogfile_name_offset() cannot be executed during recovery."))); + errhint("pg_walfile_name_offset() cannot be executed during recovery."))); /* * Construct a tuple descriptor for the result row. This must match this @@ -515,7 +518,7 @@ pg_xlogfile_name_offset(PG_FUNCTION_ARGS) * such as is returned by pg_stop_backup() or pg_xlog_switch(). */ Datum -pg_xlogfile_name(PG_FUNCTION_ARGS) +pg_walfile_name(PG_FUNCTION_ARGS) { XLogSegNo xlogsegno; XLogRecPtr locationpoint = PG_GETARG_LSN(0); @@ -525,7 +528,7 @@ pg_xlogfile_name(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), - errhint("pg_xlogfile_name() cannot be executed during recovery."))); + errhint("pg_walfile_name() cannot be executed during recovery."))); XLByteToPrevSeg(locationpoint, xlogsegno); XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno); @@ -534,13 +537,13 @@ pg_xlogfile_name(PG_FUNCTION_ARGS) } /* - * pg_xlog_replay_pause - pause recovery now + * pg_wal_replay_pause - pause recovery now * * Permission checking for this function is managed through the normal * GRANT system. */ Datum -pg_xlog_replay_pause(PG_FUNCTION_ARGS) +pg_wal_replay_pause(PG_FUNCTION_ARGS) { if (!RecoveryInProgress()) ereport(ERROR, @@ -554,13 +557,13 @@ pg_xlog_replay_pause(PG_FUNCTION_ARGS) } /* - * pg_xlog_replay_resume - resume recovery now + * pg_wal_replay_resume - resume recovery now * * Permission checking for this function is managed through the normal * GRANT system. */ Datum -pg_xlog_replay_resume(PG_FUNCTION_ARGS) +pg_wal_replay_resume(PG_FUNCTION_ARGS) { if (!RecoveryInProgress()) ereport(ERROR, @@ -574,10 +577,10 @@ pg_xlog_replay_resume(PG_FUNCTION_ARGS) } /* - * pg_is_xlog_replay_paused + * pg_is_wal_replay_paused */ Datum -pg_is_xlog_replay_paused(PG_FUNCTION_ARGS) +pg_is_wal_replay_paused(PG_FUNCTION_ARGS) { if (!RecoveryInProgress()) ereport(ERROR, @@ -619,7 +622,7 @@ pg_is_in_recovery(PG_FUNCTION_ARGS) * Compute the difference in bytes between two WAL locations. 
*/ Datum -pg_xlog_location_diff(PG_FUNCTION_ARGS) +pg_wal_lsn_diff(PG_FUNCTION_ARGS) { Datum result; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index c37003a24c..6a02738479 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -9,7 +9,7 @@ * of XLogRecData structs by a call to XLogRecordAssemble(). See * access/transam/README for details. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/xloginsert.c @@ -73,8 +73,8 @@ static XLogRecData *mainrdata_head; static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head; static uint32 mainrdata_len; /* total # of bytes in chain */ -/* Should the in-progress insertion log the origin? */ -static bool include_origin = false; +/* flags for the in-progress insertion */ +static uint8 curinsert_flags = 0; /* * These are used to hold the record header while constructing a record. @@ -201,7 +201,7 @@ XLogResetInsertion(void) max_registered_block_id = 0; mainrdata_len = 0; mainrdata_last = (XLogRecData *) &mainrdata_head; - include_origin = false; + curinsert_flags = 0; begininsert_called = false; } @@ -384,13 +384,20 @@ XLogRegisterBufData(uint8 block_id, char *data, int len) } /* - * Should this record include the replication origin if one is set up? + * Set insert status flags for the upcoming WAL record. + * + * The flags that can be used here are: + * - XLOG_INCLUDE_ORIGIN, to determine if the replication origin should be + * included in the record. + * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for + * durability, which makes it possible to avoid triggering WAL archiving + * and other background activity. */ void -XLogIncludeOrigin(void) +XLogSetRecordFlags(uint8 flags) { Assert(begininsert_called); - include_origin = true; + curinsert_flags = flags; } /* @@ -414,13 +421,15 @@ XLogInsert(RmgrId rmid, uint8 info) elog(ERROR, "XLogBeginInsert was not called"); /* - * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are - * reserved for use by me. + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me. */ - if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0) + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) elog(PANIC, "invalid xlog info mask %02X", info); - TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); + TRACE_POSTGRESQL_WAL_INSERT(rmid, info); /* * In bootstrap mode, we don't actually log anything but XLOG resources; @@ -450,7 +459,7 @@ XLogInsert(RmgrId rmid, uint8 info) rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, &fpw_lsn); - EndPos = XLogInsertRecord(rdt, fpw_lsn); + EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags); } while (EndPos == InvalidXLogRecPtr); XLogResetInsertion(); @@ -498,6 +507,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, hdr_rdt.data = hdr_scratch; /* + * Enforce consistency checks for this record if the user has asked for + * them. Do this at the beginning of this routine so that callers of + * XLogInsert() can pass XLR_CHECK_CONSISTENCY directly for + * a record.
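
XLogSetRecordFlags() turns the old single-purpose boolean into a small bitmask protocol: set flags between XLogBeginInsert() and the insertion, then reset. A standalone model of that life cycle; the SIM_ flag values below are illustrative, only the names mirror the patch.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* flag bits modeled on the patch; numeric values are illustrative */
    #define SIM_XLOG_INCLUDE_ORIGIN   0x01
    #define SIM_XLOG_MARK_UNIMPORTANT 0x02

    static bool    begininsert_called = false;
    static uint8_t curinsert_flags = 0;

    /* cf. XLogBeginInsert(): flags may only be set after this call */
    static void
    begin_insert(void)
    {
        begininsert_called = true;
    }

    /* cf. XLogSetRecordFlags(): overwrite, not OR, as in the patch */
    static void
    set_record_flags(uint8_t flags)
    {
        assert(begininsert_called);
        curinsert_flags = flags;
    }

    /* cf. XLogResetInsertion(): flags do not leak into the next record */
    static void
    reset_insertion(void)
    {
        curinsert_flags = 0;
        begininsert_called = false;
    }

    int
    main(void)
    {
        begin_insert();
        /* a record that carries its origin but should not, by itself,
         * trigger archiving or other background activity */
        set_record_flags(SIM_XLOG_INCLUDE_ORIGIN | SIM_XLOG_MARK_UNIMPORTANT);
        printf("flags = 0x%02x\n", (unsigned) curinsert_flags);
        reset_insertion();
        return 0;
    }
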
+ */ + if (wal_consistency_checking[rmid]) + info |= XLR_CHECK_CONSISTENCY; + + /* * Make an rdata chain containing all the data portions of all block * references. This includes the data for full-page images. Also append * the headers for the block references in the scratch buffer. @@ -513,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecordBlockCompressHeader cbimg = {0}; bool samerel; bool is_compressed = false; + bool include_image; if (!regbuf->in_use) continue; @@ -556,7 +575,13 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) bkpb.fork_flags |= BKPBLOCK_WILL_INIT; - if (needs_backup) + /* + * If needs_backup is true or WAL checking is enabled for current + * resource manager, log a full-page write for the current block. + */ + include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) { Page page = regbuf->page; uint16 compressed_len; @@ -618,6 +643,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; + /* + * If WAL consistency checking is enabled for the resource manager + * of this WAL record, a full-page image is included in the record + * for the block modified. During redo, the full-page is replayed + * only if BKPIMAGE_APPLY is set. + */ + if (needs_backup) + bimg.bimg_info |= BKPIMAGE_APPLY; + if (is_compressed) { bimg.length = compressed_len; @@ -680,7 +714,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* Ok, copy the header to the scratch buffer */ memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); scratch += SizeOfXLogRecordBlockHeader; - if (needs_backup) + if (include_image) { memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); scratch += SizeOfXLogRecordBlockImageHeader; @@ -701,9 +735,10 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } /* followed by the record's origin, if any */ - if (include_origin && replorigin_session_origin != InvalidRepOriginId) + if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) && + replorigin_session_origin != InvalidRepOriginId) { - *(scratch++) = XLR_BLOCK_ID_ORIGIN; + *(scratch++) = (char) XLR_BLOCK_ID_ORIGIN; memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin)); scratch += sizeof(replorigin_session_origin); } @@ -713,13 +748,13 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, { if (mainrdata_len > 255) { - *(scratch++) = XLR_BLOCK_ID_DATA_LONG; + *(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG; memcpy(scratch, &mainrdata_len, sizeof(uint32)); scratch += sizeof(uint32); } else { - *(scratch++) = XLR_BLOCK_ID_DATA_SHORT; + *(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT; *(scratch++) = (uint8) mainrdata_len; } rdt_datas_last->next = mainrdata_head; @@ -997,9 +1032,7 @@ InitXLogInsert(void) { xloginsert_cxt = AllocSetContextCreate(TopMemoryContext, "WAL record construction", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); } if (registered_buffers == NULL) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index dcf747c633..c3b1371764 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -3,7 +3,7 @@ * xlogreader.c * Generic XLog reading facility * - * Portions Copyright (c) 2013-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2013-2017, PostgreSQL Global Development Group * * IDENTIFICATION * src/backend/access/transam/xlogreader.c @@ -462,7 +462,8 @@ 
XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) /* * Special processing if it's an XLOG SWITCH record */ - if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) + if (record->xl_rmid == RM_XLOG_ID && + (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) { /* Pretend it extends to end of segment */ state->EndRecPtr += XLogSegSize - 1; @@ -866,46 +867,83 @@ XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) { XLogReaderState saved_state = *state; - XLogRecPtr targetPagePtr; XLogRecPtr tmpRecPtr; - int targetRecOff; XLogRecPtr found = InvalidXLogRecPtr; - uint32 pageHeaderSize; XLogPageHeader header; - int readLen; char *errormsg; Assert(!XLogRecPtrIsInvalid(RecPtr)); - targetRecOff = RecPtr % XLOG_BLCKSZ; + /* + * skip over potential continuation data, keeping in mind that it may span + * multiple pages + */ + tmpRecPtr = RecPtr; + while (true) + { + XLogRecPtr targetPagePtr; + int targetRecOff; + uint32 pageHeaderSize; + int readLen; - /* scroll back to page boundary */ - targetPagePtr = RecPtr - targetRecOff; + /* + * Compute targetRecOff. It should typically be equal or greater than + * short page-header since a valid record can't start anywhere before + * that, except when caller has explicitly specified the offset that + * falls somewhere there or when we are skipping multi-page + * continuation record. It doesn't matter though because + * ReadPageInternal() is prepared to handle that and will read at + * least short page-header worth of data + */ + targetRecOff = tmpRecPtr % XLOG_BLCKSZ; - /* Read the page containing the record */ - readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); - if (readLen < 0) - goto err; + /* scroll back to page boundary */ + targetPagePtr = tmpRecPtr - targetRecOff; - header = (XLogPageHeader) state->readBuf; + /* Read the page containing the record */ + readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); + if (readLen < 0) + goto err; - pageHeaderSize = XLogPageHeaderSize(header); + header = (XLogPageHeader) state->readBuf; - /* make sure we have enough data for the page header */ - readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); - if (readLen < 0) - goto err; + pageHeaderSize = XLogPageHeaderSize(header); - /* skip over potential continuation data */ - if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) - { - /* record headers are MAXALIGN'ed */ - tmpRecPtr = targetPagePtr + pageHeaderSize - + MAXALIGN(header->xlp_rem_len); - } - else - { - tmpRecPtr = targetPagePtr + pageHeaderSize; + /* make sure we have enough data for the page header */ + readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); + if (readLen < 0) + goto err; + + /* skip over potential continuation data */ + if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) + { + /* + * If the length of the remaining continuation data is more than + * what can fit in this page, the continuation record crosses over + * this page. Read the next page and try again. xlp_rem_len in the + * next page header will contain the remaining length of the + * continuation data + * + * Note that record headers are MAXALIGN'ed + */ + if (MAXALIGN(header->xlp_rem_len) > (XLOG_BLCKSZ - pageHeaderSize)) + tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; + else + { + /* + * The previous continuation record ends in this page. 
Set + * tmpRecPtr to point to the first valid record + */ + tmpRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(header->xlp_rem_len); + break; + } + } + else + { + tmpRecPtr = targetPagePtr + pageHeaderSize; + break; + } } /* @@ -959,6 +997,7 @@ ResetDecoder(XLogReaderState *state) state->blocks[block_id].in_use = false; state->blocks[block_id].has_image = false; state->blocks[block_id].has_data = false; + state->blocks[block_id].apply_image = false; } state->max_block_id = -1; } @@ -1051,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) blk = &state->blocks[block_id]; blk->in_use = true; + blk->apply_image = false; COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; @@ -1082,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); + + blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0); + if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) { if (blk->bimg_info & BKPIMAGE_HAS_HOLE) @@ -1205,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (!blk->in_use) continue; + + Assert(blk->has_image || !blk->apply_image); + if (blk->has_image) { blk->bkp_image = ptr; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 1bdbea655b..4f67dc62fb 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -2,13 +2,13 @@ * * xlogutils.c * - * PostgreSQL transaction log manager utility routines + * PostgreSQL write-ahead log manager utility routines * * This file contains support routines that are used by XLOG replay functions. * None of this code is used during normal system operation. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/xlogutils.c @@ -20,11 +20,13 @@ #include <unistd.h> #include "miscadmin.h" +#include "access/timeline.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "miscadmin.h" +#include "pgstat.h" #include "storage/smgr.h" #include "utils/guc.h" #include "utils/hsearch.h" @@ -276,9 +278,9 @@ XLogCheckInvalidPages(void) * will complain if we don't have the lock. In hot standby mode it's * definitely necessary.) * - * Note: when a backup block is available in XLOG, we restore it - * unconditionally, even if the page in the database appears newer. This is - * to protect ourselves against database pages that were partially or + * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag + * set, we restore it, even if the page in the database appears newer. This + * is to protect ourselves against database pages that were partially or * incorrectly written during a crash. We assume that the XLOG data must be * good because it has passed a CRC check, while the database page might not * be. 
This will force us to replay all subsequent modifications of the page @@ -353,9 +355,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, if (!willinit && zeromode) elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); - /* If it's a full-page image, restore it. */ - if (XLogRecHasBlockImage(record, block_id)) + /* If it has a full-page image and it should be restored, do it. */ + if (XLogRecBlockImageApply(record, block_id)) { + Assert(XLogRecHasBlockImage(record, block_id)); *buf = XLogReadBufferExtended(rnode, forknum, blkno, get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); page = BufferGetPage(*buf); @@ -647,7 +650,7 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, * always be one descriptor left open until the process ends, but never * more than one. * - * XXX This is very similar to pg_xlogdump's XLogDumpXLogRead and to XLogRead + * XXX This is very similar to pg_waldump's XLogDumpXLogRead and to XLogRead * in walsender.c but for small differences (such as lack of elog() in * frontend). Probably these should be merged at some point. */ @@ -661,6 +664,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) /* state maintained across calls */ static int sendFile = -1; static XLogSegNo sendSegNo = 0; + static TimeLineID sendTLI = 0; static uint32 sendOff = 0; p = buf; @@ -676,7 +680,8 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) startoff = recptr % XLogSegSize; /* Do we need to switch to a different xlog segment? */ - if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) + if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) || + sendTLI != tli) { char path[MAXPGPATH]; @@ -703,6 +708,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) path))); } sendOff = 0; + sendTLI = tli; } /* Need to seek in the file? */ @@ -728,7 +734,9 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) else segbytes = nbytes; + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); readbytes = read(sendFile, p, segbytes); + pgstat_report_wait_end(); if (readbytes <= 0) { char path[MAXPGPATH]; @@ -751,6 +759,137 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) } /* + * Determine which timeline to read an xlog page from and set the + * XLogReaderState's currTLI to that timeline ID. + * + * We care about timelines in xlogreader when we might be reading xlog + * generated prior to a promotion, either if we're currently a standby in + * recovery or if we're a promoted master reading xlogs generated by the old + * master before our promotion. + * + * wantPage must be set to the start address of the page to read and + * wantLength to the amount of the page that will be read, up to + * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ. + * + * We switch to an xlog segment from the new timeline eagerly when on a + * historical timeline, as soon as we reach the start of the xlog segment + * containing the timeline switch. The server copied the segment to the new + * timeline so all the data up to the switch point is the same, but there's no + * guarantee the old segment will still exist. It may have been deleted or + * renamed with a .partial suffix so we can't necessarily keep reading from + * the old TLI even though tliSwitchPoint says it's OK. + * + * We can't just check the timeline when we read a page on a different segment + * to the last page. 
We could've received a timeline switch from a cascading + * upstream, so the current segment ends abruptly (possibly getting renamed to + * .partial) and we have to switch to a new one. Even in the middle of reading + * a page we could have to dump the cached page and switch to a new TLI. + * + * Because of this, callers MAY NOT assume that currTLI is the timeline that + * will be in a page's xlp_tli; the page may begin on an older timeline or we + * might be reading from historical timeline data on a segment that's been + * copied to a new timeline. + * + * The caller must also make sure it doesn't read past the current replay + * position (using GetWalRcvWriteRecPtr) if executing in recovery, so it + * doesn't fail to notice that the current timeline became historical. The + * caller must also update ThisTimeLineID with the result of + * GetWalRcvWriteRecPtr and must check RecoveryInProgress(). + */ +void +XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength) +{ + const XLogRecPtr lastReadPage = state->readSegNo * XLogSegSize + state->readOff; + + Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); + Assert(wantLength <= XLOG_BLCKSZ); + Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); + + /* + * If the desired page is currently read in and valid, we have nothing to + * do. + * + * The caller should've ensured that it didn't previously advance readOff + * past the valid limit of this timeline, so it doesn't matter if the + * current TLI has since become historical. + */ + if (lastReadPage == wantPage && + state->readLen != 0 && + lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) + return; + + /* + * If we're reading from the current timeline, it hasn't become historical + * and the page we're reading is after the last page read, we can again + * just carry on. (Seeking backwards requires a check to make sure the + * older page isn't on a prior timeline). + * + * ThisTimeLineID might've become historical since we last looked, but the + * caller is required not to read past the flush limit it saw at the time + * it looked up the timeline. There's nothing we can do about it if + * StartupXLOG() renames it to .partial concurrently. + */ + if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage) + { + Assert(state->currTLIValidUntil == InvalidXLogRecPtr); + return; + } + + /* + * If we're just reading pages from a previously validated historical + * timeline and the timeline we're reading from is valid until the end of + * the current segment we can just keep reading. + */ + if (state->currTLIValidUntil != InvalidXLogRecPtr && + state->currTLI != ThisTimeLineID && + state->currTLI != 0 && + (wantPage + wantLength) / XLogSegSize < state->currTLIValidUntil / XLogSegSize) + return; + + /* + * If we reach this point we're either looking up a page for random + * access, the current timeline just became historical, or we're reading + * from a new segment containing a timeline switch. In all cases we need + * to determine the newest timeline on the segment. + * + * If it's the current timeline we can just keep reading from here unless + * we detect a timeline switch that makes the current timeline historical. + * If it's a historical timeline we can read all the segment on the newest + * timeline because it contains all the old timelines' data too. So only + * one switch check is required. 
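
The decision above ultimately hinges on one computation: the last LSN of the segment containing the wanted page, whose timeline is then looked up in the history. A standalone sketch with a fixed 16 MB segment size standing in for XLogSegSize:

    #include <stdint.h>
    #include <stdio.h>

    /* segment size is configurable in reality; 16 MB is the usual default */
    #define SIM_XLOG_SEG_SIZE (16 * 1024 * 1024)

    typedef uint64_t XLogRecPtrSim;

    /*
     * Last LSN of the segment containing wantPage: the point whose timeline
     * XLogReadDetermineTimeline() looks up, since the newest timeline on a
     * segment also contains all older timelines' data for that segment.
     */
    static XLogRecPtrSim
    end_of_segment(XLogRecPtrSim wantPage)
    {
        return ((wantPage / SIM_XLOG_SEG_SIZE) + 1) * SIM_XLOG_SEG_SIZE - 1;
    }

    int
    main(void)
    {
        XLogRecPtrSim wantPage = 0x3D0E000;   /* arbitrary page start */

        printf("segment of page ends at %llX\n",
               (unsigned long long) end_of_segment(wantPage));
        return 0;
    }
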
+ */ + { + /* + * We need to re-read the timeline history in case it's been changed + * by a promotion or replay from a cascaded replica. + */ + List *timelineHistory = readTimeLineHistory(ThisTimeLineID); + + XLogRecPtr endOfSegment = (((wantPage / XLogSegSize) + 1) * XLogSegSize) - 1; + + Assert(wantPage / XLogSegSize == endOfSegment / XLogSegSize); + + /* + * Find the timeline of the last LSN on the segment containing + * wantPage. + */ + state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); + state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, + &state->nextTLI); + + Assert(state->currTLIValidUntil == InvalidXLogRecPtr || + wantPage + wantLength < state->currTLIValidUntil); + + list_free_deep(timelineHistory); + + elog(DEBUG3, "switched to timeline %u valid until %X/%X", + state->currTLI, + (uint32) (state->currTLIValidUntil >> 32), + (uint32) (state->currTLIValidUntil)); + } +} + +/* * read_page callback for reading local xlog files * * Public because it would likely be very helpful for someone writing another @@ -771,28 +910,85 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int count; loc = targetPagePtr + reqLen; + + /* Loop waiting for xlog to be available if necessary */ while (1) { /* - * TODO: we're going to have to do something more intelligent about - * timelines on standbys. Use readTimeLineHistory() and - * tliOfPointInHistory() to get the proper LSN? For now we'll catch - * that case earlier, but the code and TODO is left in here for when - * that changes. + * Determine the limit of xlog we can currently read to, and what the + * most recent timeline is. + * + * RecoveryInProgress() will update ThisTimeLineID when it first + * notices recovery finishes, so we only have to maintain it for the + * local process until recovery ends. */ if (!RecoveryInProgress()) - { - *pageTLI = ThisTimeLineID; read_upto = GetFlushRecPtr(); - } else - read_upto = GetXLogReplayRecPtr(pageTLI); + read_upto = GetXLogReplayRecPtr(&ThisTimeLineID); - if (loc <= read_upto) - break; + *pageTLI = ThisTimeLineID; + + /* + * Check which timeline to get the record from. + * + * We have to do it each time through the loop because if we're in + * recovery as a cascading standby, the current timeline might've + * become historical. We can't rely on RecoveryInProgress() because in + * a standby configuration like + * + * A => B => C + * + * if we're a logical decoding session on C, and B gets promoted, our + * timeline will change while we remain in recovery. + * + * We can't just keep reading from the old timeline as the last WAL + * archive in the timeline will get renamed to .partial by + * StartupXLOG(). + * + * If that happens after our caller updated ThisTimeLineID but before + * we actually read the xlog page, we might still try to read from the + * old (now renamed) segment and fail. There's not much we can do + * about this, but it can only happen when we're a leaf of a cascading + * standby whose master gets promoted while we're decoding, so a + * one-off ERROR isn't too bad. + */ + XLogReadDetermineTimeline(state, targetPagePtr, reqLen); + + if (state->currTLI == ThisTimeLineID) + { - CHECK_FOR_INTERRUPTS(); - pg_usleep(1000L); + if (loc <= read_upto) + break; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + } + else + { + /* + * We're on a historical timeline, so limit reading to the switch + * point where we moved to the next timeline. + * + * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. 
We know + * about the new timeline, so we must've received past the end of + * it. + */ + read_upto = state->currTLIValidUntil; + + /* + * Setting pageTLI to our wanted record's TLI is slightly wrong; + * the page might begin on an older timeline if it contains a + * timeline switch, since its xlog segment will have been copied + * from the prior timeline. This is pretty harmless though, as + * nothing cares so long as the timeline doesn't go backwards. We + * should read the page header instead; FIXME someday. + */ + *pageTLI = state->currTLI; + + /* No need to wait on a historical timeline */ + break; + } } if (targetPagePtr + XLOG_BLCKSZ <= read_upto) diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 41d2fd4a5f..de3695c7e0 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -4,7 +4,7 @@ * bootparse.y * yacc grammar for the "bootstrap" mode (BKI file format) * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -22,7 +22,6 @@ #include "access/htup.h" #include "access/itup.h" #include "access/tupdesc.h" -#include "access/xact.h" #include "bootstrap/bootstrap.h" #include "catalog/catalog.h" #include "catalog/heap.h" @@ -49,10 +48,9 @@ #include "storage/off.h" #include "storage/smgr.h" #include "tcop/dest.h" +#include "utils/memutils.h" #include "utils/rel.h" -#define atooid(x) ((Oid) strtoul((x), NULL, 10)) - /* * Bison doesn't allocate anything that needs to live across parser calls, @@ -65,19 +63,27 @@ #define YYMALLOC palloc #define YYFREE pfree +static MemoryContext per_line_ctx = NULL; + static void do_start(void) { - StartTransactionCommand(); - elog(DEBUG4, "start transaction"); + Assert(CurrentMemoryContext == CurTransactionContext); + /* First time through, create the per-line working context */ + if (per_line_ctx == NULL) + per_line_ctx = AllocSetContextCreate(CurTransactionContext, + "bootstrap per-line processing", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(per_line_ctx); } static void do_end(void) { - CommitTransactionCommand(); - elog(DEBUG4, "commit transaction"); + /* Reclaim memory allocated while processing this line */ + MemoryContextSwitchTo(CurTransactionContext); + MemoryContextReset(per_line_ctx); CHECK_FOR_INTERRUPTS(); /* allow SIGINT to kill bootstrap run */ if (isatty(0)) { @@ -105,11 +111,11 @@ static int num_columns_read = 0; %type <list> boot_index_params %type <ielem> boot_index_param -%type <str> boot_const boot_ident +%type <str> boot_ident %type <ival> optbootstrap optsharedrelation optwithoutoids boot_column_nullness %type <oidval> oidspec optoideq optrowtypeoid -%token <str> CONST_P ID +%token <str> ID %token OPEN XCLOSE XCREATE INSERT_TUPLE %token XDECLARE INDEX ON USING XBUILD INDICES UNIQUE XTOAST %token COMMA EQUALS LPAREN RPAREN @@ -464,16 +470,10 @@ boot_column_val_list: boot_column_val: boot_ident { InsertOneValue($1, num_columns_read++); } - | boot_const - { InsertOneValue($1, num_columns_read++); } | NULLVAL { InsertOneNull(num_columns_read++); } ; -boot_const : - CONST_P { $$ = yylval.str; } - ; - boot_ident : ID { $$ = yylval.str; } ; diff --git a/src/backend/bootstrap/bootscanner.l b/src/backend/bootstrap/bootscanner.l index 0e1413bff9..6467882fa3 100644 --- a/src/backend/bootstrap/bootscanner.l +++ b/src/backend/bootstrap/bootscanner.l @@ -4,7 +4,7 @@ * bootscanner.l * a lexical scanner 
for the bootstrap parser * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -66,7 +66,6 @@ static int yyline = 1; /* line number for error reporting */ D [0-9] oct \\{D}{D}{D} -Exp [Ee][-+]?{D}+ id ([A-Za-z0-9_]|{oct}|\-)+ sid \"([^\"])*\" arrayid [A-Za-z0-9_]+\[{D}*\] @@ -127,13 +126,6 @@ insert { return(INSERT_TUPLE); } return(ID); } -(-)?{D}+"."{D}*({Exp})? | -(-)?{D}*"."{D}+({Exp})? | -(-)?{D}+{Exp} { - yylval.str = pstrdup(yytext); - return(CONST_P); - } - . { elog(ERROR, "syntax error at line %d: unexpected character \"%s\"", yyline, yytext); } diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 86732f73d8..c2274ae2ff 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -5,7 +5,7 @@ * bootstrap mode is used to create the initial template database * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -20,6 +20,7 @@ #include <signal.h> #include "access/htup_details.h" +#include "access/xact.h" #include "bootstrap/bootstrap.h" #include "catalog/index.h" #include "catalog/pg_collation.h" @@ -36,6 +37,7 @@ #include "replication/walreceiver.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" +#include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/proc.h" #include "tcop/tcopprot.h" @@ -56,7 +58,8 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ -#define ALLOC(t, c) ((t *) calloc((unsigned)(c), sizeof(t))) +#define ALLOC(t, c) \ + ((t *) MemoryContextAllocZero(TopMemoryContext, (unsigned)(c) * sizeof(t))) static void CheckerModeMain(void); static void BootstrapModeMain(void); @@ -236,7 +239,7 @@ AuxiliaryProcessMain(int argc, char *argv[]) SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 'D': - userDoption = strdup(optarg); + userDoption = pstrdup(optarg); break; case 'd': { @@ -408,6 +411,10 @@ AuxiliaryProcessMain(int argc, char *argv[]) /* finish setting up bufmgr.c */ InitBufferPoolBackend(); + /* Initialize backend status information */ + pgstat_initialize(); + pgstat_bestart(); + /* register a before-shutdown callback for LWLock cleanup */ before_shmem_exit(ShutdownAuxiliaryProcess, 0); } @@ -525,7 +532,9 @@ BootstrapModeMain(void) /* * Process bootstrap input. 
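
The bootparse.y hunk above trades a transaction per BKI line for one memory context that is reset after every line. The same reset-per-item pattern, modeled with a toy bump allocator in place of AllocSetContextCreate()/MemoryContextReset(); everything here is illustrative, not server code.

    #include <stdio.h>
    #include <string.h>

    /* toy arena: one fixed buffer, freed all at once by resetting 'used' */
    typedef struct
    {
        char   buf[4096];
        size_t used;
    } Arena;

    static void *
    arena_alloc(Arena *a, size_t n)
    {
        void *p;

        if (a->used + n > sizeof(a->buf))
            return NULL;            /* out of arena space */
        p = a->buf + a->used;
        a->used += n;
        return p;
    }

    static void
    arena_reset(Arena *a)
    {
        a->used = 0;                /* reclaim every per-line allocation */
    }

    int
    main(void)
    {
        Arena       per_line = {{0}, 0};
        const char *lines[] = {"insert ( 1 foo )", "insert ( 2 bar )"};
        int         i;

        for (i = 0; i < 2; i++)
        {
            /* do_start(): per-line allocations come from the arena */
            char *copy = arena_alloc(&per_line, strlen(lines[i]) + 1);

            if (copy == NULL)
                return 1;
            strcpy(copy, lines[i]);
            printf("processed: %s\n", copy);

            /* do_end(): drop everything allocated for this line at once */
            arena_reset(&per_line);
        }
        return 0;
    }
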
*/ + StartTransactionCommand(); boot_yyparse(); + CommitTransactionCommand(); /* * We should now know about all mapped relations, so it's okay to write @@ -570,6 +579,7 @@ static void ShutdownAuxiliaryProcess(int code, Datum arg) { LWLockReleaseAll(); + ConditionVariableCancelSleep(); pgstat_report_wait_end(); } @@ -1037,13 +1047,8 @@ boot_get_type_io_data(Oid typid, static Form_pg_attribute AllocateAttribute(void) { - Form_pg_attribute attribute = (Form_pg_attribute) malloc(ATTRIBUTE_FIXED_PART_SIZE); - - if (!PointerIsValid(attribute)) - elog(FATAL, "out of memory"); - MemSet(attribute, 0, ATTRIBUTE_FIXED_PART_SIZE); - - return attribute; + return (Form_pg_attribute) + MemoryContextAllocZero(TopMemoryContext, ATTRIBUTE_FIXED_PART_SIZE); } /* @@ -1104,9 +1109,7 @@ index_register(Oid heap, if (nogc == NULL) nogc = AllocSetContextCreate(NULL, "BootstrapNoGC", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(nogc); @@ -1117,13 +1120,13 @@ index_register(Oid heap, memcpy(newind->il_info, indexInfo, sizeof(IndexInfo)); /* expressions will likely be null, but may as well copy it */ - newind->il_info->ii_Expressions = (List *) + newind->il_info->ii_Expressions = copyObject(indexInfo->ii_Expressions); newind->il_info->ii_ExpressionsState = NIL; /* predicate will likely be null, but may as well copy it */ - newind->il_info->ii_Predicate = (List *) + newind->il_info->ii_Predicate = copyObject(indexInfo->ii_Predicate); - newind->il_info->ii_PredicateState = NIL; + newind->il_info->ii_PredicateState = NULL; /* no exclusion constraints at bootstrap time, so no need to copy */ Assert(indexInfo->ii_ExclusionOps == NULL); Assert(indexInfo->ii_ExclusionProcs == NULL); diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm index f411b970e5..7abfda3d3a 100644 --- a/src/backend/catalog/Catalog.pm +++ b/src/backend/catalog/Catalog.pm @@ -4,7 +4,7 @@ # Perl module that extracts info from catalog headers into Perl # data structures # -# Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group +# Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California # # src/backend/catalog/Catalog.pm @@ -19,7 +19,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); our @EXPORT = (); -our @EXPORT_OK = qw(Catalogs RenameTempFile); +our @EXPORT_OK = qw(Catalogs SplitDataLine RenameTempFile); # Call this function with an array of names of header files to parse. # Returns a nested data structure describing the data in the headers. @@ -44,10 +44,13 @@ sub Catalogs $catalog{columns} = []; $catalog{data} = []; - open(INPUT_FILE, '<', $input_file) || die "$input_file: $!"; + open(my $ifh, '<', $input_file) || die "$input_file: $!"; + + my ($filename) = ($input_file =~ m/(\w+)\.h$/); + my $natts_pat = "Natts_$filename"; # Scan the input file. - while (<INPUT_FILE>) + while (<$ifh>) { # Strip C-style comments. @@ -56,13 +59,16 @@ sub Catalogs { # handle multi-line comments properly. - my $next_line = <INPUT_FILE>; + my $next_line = <$ifh>; die "$input_file: ends within C-style comment\n" if !defined $next_line; $_ .= $next_line; redo; } + # Remember input line number for later. + my $input_line_number = $.; + # Strip useless whitespace and trailing semicolons. chomp; s/^\s+//; @@ -70,8 +76,16 @@ sub Catalogs s/\s+/ /g; # Push the data into the appropriate data structure. 
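# For illustration (editor's sketch, not part of the patch): the DATA()
# pattern matched below accepts catalog header lines of the form
#
#   DATA(insert OID = 1 ( 42 "two words" _null_ ));
#
# (values hypothetical; the trailing semicolon is stripped above). The
# SplitDataLine() routine added further down tokenizes the captured field
# list the same way bootscanner.l would, keeping any quotes:
#
#   my @fields = Catalog::SplitDataLine(q(42 "two words" _null_));
#   # @fields is now ('42', '"two words"', '_null_')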
- if (/^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/) + if (/$natts_pat\s+(\d+)/) + { + $catalog{natts} = $1; + } + elsif ( + /^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/) { + check_natts($filename, $catalog{natts}, $3, $input_file, + $input_line_number); + push @{ $catalog{data} }, { oid => $2, bki_values => $3 }; } elsif (/^DESCR\(\"(.*)\"\)$/) @@ -198,11 +212,33 @@ sub Catalogs } } $catalogs{$catname} = \%catalog; - close INPUT_FILE; + close $ifh; } return \%catalogs; } +# Split a DATA line into fields. +# Call this on the bki_values element of a DATA item returned by Catalogs(); +# it returns a list of field values. We don't strip quoting from the fields. +# Note: it should be safe to assign the result to a list of length equal to +# the nominal number of catalog fields, because check_natts already checked +# the number of fields. +sub SplitDataLine +{ + my $bki_values = shift; + + # This handling of quoted strings might look too simplistic, but it + # matches what bootscanner.l does: that has no provision for quote marks + # inside quoted strings, either. If we don't have a quoted string, just + # snarf everything till next whitespace. That will accept some things + # that bootscanner.l will see as erroneous tokens; but it seems wiser + # to do that and let bootscanner.l complain than to silently drop + # non-whitespace characters. + my @result = $bki_values =~ /"[^"]*"|\S+/g; + + return @result; +} + # Rename temporary files to final names. # Call this function with the final file name and the .tmp extension # Note: recommended extension is ".tmp$$", so that parallel make steps @@ -216,4 +252,21 @@ sub RenameTempFile rename($temp_name, $final_name) || die "rename: $temp_name: $!"; } +# verify the number of fields in the passed-in DATA line +sub check_natts +{ + my ($catname, $natts, $bki_val, $file, $line) = @_; + + die +"Could not find definition for Natts_${catname} before start of DATA() in $file\n" + unless defined $natts; + + my $nfields = scalar(SplitDataLine($bki_val)); + + die sprintf +"Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n", + $file, $line, $natts, $nfields + unless $natts == $nfields; +} + 1; diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 240c44d0f0..52bc63c788 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -11,11 +11,12 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
- objectaccess.o objectaddress.o pg_aggregate.o pg_collation.o \
+ objectaccess.o objectaddress.o partition.o pg_aggregate.o pg_collation.o \
 pg_constraint.o pg_conversion.o \ pg_depend.o pg_enum.o pg_inherits.o pg_largeobject.o pg_namespace.o \
- pg_operator.o pg_proc.o pg_range.o pg_db_role_setting.o pg_shdepend.o \
- pg_type.o pgxc_class.o storage.o toasting.o
+ pg_operator.o pg_proc.o pg_publication.o pg_range.o \
+ pg_db_role_setting.o pg_shdepend.o pg_subscription.o pg_type.o \
+ pgxc_class.o storage.o toasting.o
BKIFILES = postgres.bki postgres.description postgres.shdescription @@ -32,6 +33,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \ pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \ pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
+ pg_statistic_ext.h \
 pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \ pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \ pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \ @@ -42,7 +44,9 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pgxc_class.h pgxc_node.h pgxc_group.h \ pg_foreign_table.h pg_policy.h pg_replication_origin.h \ pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \
- pg_collation.h pg_range.h pg_transform.h \
+ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \
+ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \
+ pg_subscription_rel.h \
 toasting.h indexing.h \ ) diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index a585c3ad19..387a3be701 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -3,7 +3,7 @@ * aclchk.c * Routines to check access control permissions.
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -27,7 +27,10 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_am.h" #include "catalog/pg_authid.h" +#include "catalog/pg_cast.h" #include "catalog/pg_collation.h" #include "catalog/pg_conversion.h" #include "catalog/pg_database.h" @@ -45,10 +48,15 @@ #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" +#include "catalog/pg_ts_parser.h" +#include "catalog/pg_ts_template.h" +#include "catalog/pg_transform.h" #include "commands/dbcommands.h" #include "commands/event_trigger.h" #include "commands/extension.h" @@ -130,6 +138,8 @@ static AclMode pg_aclmask(AclObjectKind objkind, Oid table_oid, AttrNumber attnu Oid roleid, AclMode mask, AclMaskHow how); static void recordExtensionInitPriv(Oid objoid, Oid classoid, int objsubid, Acl *new_acl); +static void recordExtensionInitPrivWorker(Oid objoid, Oid classoid, int objsubid, + Acl *new_acl); #ifdef ACLDEBUG @@ -423,7 +433,7 @@ ExecuteGrantStmt(GrantStmt *stmt) grantee_uid = ACL_ID_PUBLIC; break; default: - grantee_uid = get_rolespec_oid((Node *) grantee, false); + grantee_uid = get_rolespec_oid(grantee, false); break; } istmt.grantees = lappend_oid(istmt.grantees, grantee_uid); @@ -658,11 +668,10 @@ objectNamesToOids(GrantObjectType objtype, List *objnames) case ACL_OBJECT_FUNCTION: foreach(cell, objnames) { - FuncWithArgs *func = (FuncWithArgs *) lfirst(cell); + ObjectWithArgs *func = (ObjectWithArgs *) lfirst(cell); Oid funcid; - funcid = LookupFuncNameTypeNames(func->funcname, - func->funcargs, false); + funcid = LookupFuncWithArgs(func, false); objects = lappend_oid(objects, funcid); } break; @@ -768,6 +777,8 @@ objectsInSchemaToOids(GrantObjectType objtype, List *nspnames) objects = list_concat(objects, objs); objs = getRelationsInNamespace(namespaceId, RELKIND_FOREIGN_TABLE); objects = list_concat(objects, objs); + objs = getRelationsInNamespace(namespaceId, RELKIND_PARTITIONED_TABLE); + objects = list_concat(objects, objs); break; case ACL_OBJECT_SEQUENCE: objs = getRelationsInNamespace(namespaceId, RELKIND_SEQUENCE); @@ -849,7 +860,7 @@ getRelationsInNamespace(Oid namespaceId, char relkind) * ALTER DEFAULT PRIVILEGES statement */ void -ExecAlterDefaultPrivilegesStmt(AlterDefaultPrivilegesStmt *stmt) +ExecAlterDefaultPrivilegesStmt(ParseState *pstate, AlterDefaultPrivilegesStmt *stmt) { GrantStmt *action = stmt->action; InternalDefaultACL iacls; @@ -871,7 +882,8 @@ ExecAlterDefaultPrivilegesStmt(AlterDefaultPrivilegesStmt *stmt) if (dnspnames) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dnspnames = defel; } else if (strcmp(defel->defname, "roles") == 0) @@ -879,7 +891,8 @@ ExecAlterDefaultPrivilegesStmt(AlterDefaultPrivilegesStmt *stmt) if (drolespecs) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, 
defel->location))); drolespecs = defel; } else @@ -918,7 +931,7 @@ ExecAlterDefaultPrivilegesStmt(AlterDefaultPrivilegesStmt *stmt) grantee_uid = ACL_ID_PUBLIC; break; default: - grantee_uid = get_rolespec_oid((Node *) grantee, false); + grantee_uid = get_rolespec_oid(grantee, false); break; } iacls.grantees = lappend_oid(iacls.grantees, grantee_uid); @@ -946,6 +959,10 @@ ExecAlterDefaultPrivilegesStmt(AlterDefaultPrivilegesStmt *stmt) all_privileges = ACL_ALL_RIGHTS_TYPE; errormsg = gettext_noop("invalid privilege type %s for type"); break; + case ACL_OBJECT_NAMESPACE: + all_privileges = ACL_ALL_RIGHTS_NAMESPACE; + errormsg = gettext_noop("invalid privilege type %s for schema"); + break; default: elog(ERROR, "unrecognized GrantStmt.objtype: %d", (int) action->objtype); @@ -1008,7 +1025,7 @@ ExecAlterDefaultPrivilegesStmt(AlterDefaultPrivilegesStmt *stmt) { RoleSpec *rolespec = lfirst(rolecell); - iacls.roleid = get_rolespec_oid((Node *) rolespec, false); + iacls.roleid = get_rolespec_oid(rolespec, false); /* * We insist that calling user be a member of each target role. If @@ -1133,6 +1150,16 @@ SetDefaultACL(InternalDefaultACL *iacls) this_privileges = ACL_ALL_RIGHTS_TYPE; break; + case ACL_OBJECT_NAMESPACE: + if (OidIsValid(iacls->nspid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_GRANT_OPERATION), + errmsg("cannot use IN SCHEMA clause when using GRANT/REVOKE ON SCHEMAS"))); + objtype = DEFACLOBJ_NAMESPACE; + if (iacls->all_privs && this_privileges == ACL_NO_RIGHTS) + this_privileges = ACL_ALL_RIGHTS_NAMESPACE; + break; + default: elog(ERROR, "unrecognized objtype: %d", (int) iacls->objtype); @@ -1239,7 +1266,7 @@ SetDefaultACL(InternalDefaultACL *iacls) values[Anum_pg_default_acl_defaclacl - 1] = PointerGetDatum(new_acl); newtuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); - simple_heap_insert(rel, newtuple); + CatalogTupleInsert(rel, newtuple); } else { @@ -1249,12 +1276,9 @@ SetDefaultACL(InternalDefaultACL *iacls) newtuple = heap_modify_tuple(tuple, RelationGetDescr(rel), values, nulls, replaces); - simple_heap_update(rel, &newtuple->t_self, newtuple); + CatalogTupleUpdate(rel, &newtuple->t_self, newtuple); } - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(rel, newtuple); - /* these dependencies don't change in an update */ if (isNew) { @@ -1359,6 +1383,9 @@ RemoveRoleFromObjectACL(Oid roleid, Oid classid, Oid objid) case DEFACLOBJ_TYPE: iacls.objtype = ACL_OBJECT_TYPE; break; + case DEFACLOBJ_NAMESPACE: + iacls.objtype = ACL_OBJECT_NAMESPACE; + break; default: /* Shouldn't get here */ elog(ERROR, "unexpected default ACL type: %d", @@ -1460,7 +1487,7 @@ RemoveDefaultACLById(Oid defaclOid) if (!HeapTupleIsValid(tuple)) elog(ERROR, "could not find tuple for default ACL %u", defaclOid); - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); systable_endscan(scan); heap_close(rel, RowExclusiveLock); @@ -1684,10 +1711,7 @@ ExecGrant_Attribute(InternalGrant *istmt, Oid relOid, const char *relname, newtuple = heap_modify_tuple(attr_tuple, RelationGetDescr(attRelation), values, nulls, replaces); - simple_heap_update(attRelation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(attRelation, newtuple); + CatalogTupleUpdate(attRelation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(relOid, RelationRelationId, attnum, @@ -1815,7 +1839,8 @@ ExecGrant_Relation(InternalGrant *istmt) */ ereport(ERROR, 
(errcode(ERRCODE_INVALID_GRANT_OPERATION), - errmsg("invalid privilege type USAGE for table"))); + errmsg("invalid privilege type %s for table", + "USAGE"))); } } } @@ -1949,10 +1974,7 @@ ExecGrant_Relation(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(relOid, RelationRelationId, 0, new_acl); @@ -2142,10 +2164,7 @@ ExecGrant_Database(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update the shared dependency ACL info */ updateAclDependencies(DatabaseRelationId, HeapTupleGetOid(tuple), 0, @@ -2267,10 +2286,7 @@ ExecGrant_Fdw(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(fdwid, ForeignDataWrapperRelationId, 0, @@ -2396,10 +2412,7 @@ ExecGrant_ForeignServer(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(srvid, ForeignServerRelationId, 0, new_acl); @@ -2523,10 +2536,7 @@ ExecGrant_Function(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(funcId, ProcedureRelationId, 0, new_acl); @@ -2657,10 +2667,7 @@ ExecGrant_Language(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(langId, LanguageRelationId, 0, new_acl); @@ -2799,10 +2806,7 @@ ExecGrant_Largeobject(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(loid, LargeObjectRelationId, 0, new_acl); @@ -2927,10 +2931,7 @@ 
ExecGrant_Namespace(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(nspid, NamespaceRelationId, 0, new_acl); @@ -3054,10 +3055,7 @@ ExecGrant_Tablespace(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update the shared dependency ACL info */ updateAclDependencies(TableSpaceRelationId, tblId, 0, @@ -3191,10 +3189,7 @@ ExecGrant_Type(InternalGrant *istmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); - simple_heap_update(relation, &newtuple->t_self, newtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, newtuple); + CatalogTupleUpdate(relation, &newtuple->t_self, newtuple); /* Update initial privileges for extensions */ recordExtensionInitPriv(typId, TypeRelationId, 0, new_acl); @@ -3325,6 +3320,8 @@ static const char *const no_priv_msg[MAX_ACL_KIND] = gettext_noop("permission denied for collation %s"), /* ACL_KIND_CONVERSION */ gettext_noop("permission denied for conversion %s"), + /* ACL_KIND_STATISTICS */ + gettext_noop("permission denied for statistics object %s"), /* ACL_KIND_TABLESPACE */ gettext_noop("permission denied for tablespace %s"), /* ACL_KIND_TSDICTIONARY */ @@ -3339,6 +3336,10 @@ static const char *const no_priv_msg[MAX_ACL_KIND] = gettext_noop("permission denied for event trigger %s"), /* ACL_KIND_EXTENSION */ gettext_noop("permission denied for extension %s"), + /* ACL_KIND_PUBLICATION */ + gettext_noop("permission denied for publication %s"), + /* ACL_KIND_SUBSCRIPTION */ + gettext_noop("permission denied for subscription %s"), }; static const char *const not_owner_msg[MAX_ACL_KIND] = @@ -3371,6 +3372,8 @@ static const char *const not_owner_msg[MAX_ACL_KIND] = gettext_noop("must be owner of collation %s"), /* ACL_KIND_CONVERSION */ gettext_noop("must be owner of conversion %s"), + /* ACL_KIND_STATISTICS */ + gettext_noop("must be owner of statistics object %s"), /* ACL_KIND_TABLESPACE */ gettext_noop("must be owner of tablespace %s"), /* ACL_KIND_TSDICTIONARY */ @@ -3385,6 +3388,10 @@ static const char *const not_owner_msg[MAX_ACL_KIND] = gettext_noop("must be owner of event trigger %s"), /* ACL_KIND_EXTENSION */ gettext_noop("must be owner of extension %s"), + /* ACL_KIND_PUBLICATION */ + gettext_noop("must be owner of publication %s"), + /* ACL_KIND_SUBSCRIPTION */ + gettext_noop("must be owner of subscription %s"), }; @@ -3482,6 +3489,10 @@ pg_aclmask(AclObjectKind objkind, Oid table_oid, AttrNumber attnum, Oid roleid, mask, how, NULL); case ACL_KIND_NAMESPACE: return pg_namespace_aclmask(table_oid, roleid, mask, how); + case ACL_KIND_STATISTICS: + elog(ERROR, "grantable rights not supported for statistics objects"); + /* not reached, but keep compiler quiet */ + return ACL_NO_RIGHTS; case ACL_KIND_TABLESPACE: return pg_tablespace_aclmask(table_oid, roleid, mask, how); case ACL_KIND_FDW: @@ -5067,6 +5078,85 @@ pg_extension_ownercheck(Oid ext_oid, Oid roleid) } /* + * Ownership 
check for a publication (specified by OID). + */ +bool +pg_publication_ownercheck(Oid pub_oid, Oid roleid) +{ + HeapTuple tuple; + Oid ownerId; + + /* Superusers bypass all permission checking. */ + if (superuser_arg(roleid)) + return true; + + tuple = SearchSysCache1(PUBLICATIONOID, ObjectIdGetDatum(pub_oid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("publication with OID %u does not exist", pub_oid))); + + ownerId = ((Form_pg_publication) GETSTRUCT(tuple))->pubowner; + + ReleaseSysCache(tuple); + + return has_privs_of_role(roleid, ownerId); +} + +/* + * Ownership check for a subscription (specified by OID). + */ +bool +pg_subscription_ownercheck(Oid sub_oid, Oid roleid) +{ + HeapTuple tuple; + Oid ownerId; + + /* Superusers bypass all permission checking. */ + if (superuser_arg(roleid)) + return true; + + tuple = SearchSysCache1(SUBSCRIPTIONOID, ObjectIdGetDatum(sub_oid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("subscription with OID %u does not exist", sub_oid))); + + ownerId = ((Form_pg_subscription) GETSTRUCT(tuple))->subowner; + + ReleaseSysCache(tuple); + + return has_privs_of_role(roleid, ownerId); +} + +/* + * Ownership check for a statistics object (specified by OID). + */ +bool +pg_statistics_object_ownercheck(Oid stat_oid, Oid roleid) +{ + HeapTuple tuple; + Oid ownerId; + + /* Superusers bypass all permission checking. */ + if (superuser_arg(roleid)) + return true; + + tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stat_oid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("statistics object with OID %u does not exist", + stat_oid))); + + ownerId = ((Form_pg_statistic_ext) GETSTRUCT(tuple))->stxowner; + + ReleaseSysCache(tuple); + + return has_privs_of_role(roleid, ownerId); +} + +/* + * Check whether specified role has CREATEROLE privilege (or is a superuser) * * Note: roles do not have owners per se; instead we use this test in @@ -5187,6 +5277,10 @@ get_user_default_acl(GrantObjectType objtype, Oid ownerId, Oid nsp_oid) defaclobjtype = DEFACLOBJ_TYPE; break; + case ACL_OBJECT_NAMESPACE: + defaclobjtype = DEFACLOBJ_NAMESPACE; + break; + default: return NULL; } @@ -5222,10 +5316,367 @@ get_user_default_acl(GrantObjectType objtype, Oid ownerId, Oid nsp_oid) } /* - * Record initial ACL for an extension object + * Record initial privileges for the top-level object passed in. * - * This will perform a wholesale replacement of the entire ACL for the object - * passed in, therefore be sure to pass in the complete new ACL to use. + * For the object passed in, this will record its ACL (if any) and the ACLs of + * any sub-objects (eg: columns) into pg_init_privs. + * + * Any new kinds of objects which have ACLs associated with them and can be + * added to an extension should be added to the if-else tree below. + */ +void +recordExtObjInitPriv(Oid objoid, Oid classoid) +{ + /* + * pg_class / pg_attribute + * + * If this is a relation then we need to see if there are any sub-objects + * (eg: columns) for it and, if so, be sure to call + * recordExtensionInitPrivWorker() for each one.
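+ * For illustration (editor's sketch, not part of the patch): an
+ * ALTER EXTENSION ... ADD TABLE arrives here roughly as
+ *
+ *		recordExtObjInitPriv(table_oid, RelationRelationId);
+ *
+ * where table_oid (a hypothetical variable) is the table's pg_class OID,
+ * so that both the relation's ACL and any per-column ACLs get captured.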
+ */ + if (classoid == RelationRelationId) + { + Form_pg_class pg_class_tuple; + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", objoid); + pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); + + /* Indexes don't have permissions */ + if (pg_class_tuple->relkind == RELKIND_INDEX) + return; + + /* Composite types don't have permissions either */ + if (pg_class_tuple->relkind == RELKIND_COMPOSITE_TYPE) + return; + + /* + * If this isn't a sequence, index, or composite type then it's + * possibly going to have columns associated with it that might have + * ACLs. + */ + if (pg_class_tuple->relkind != RELKIND_SEQUENCE) + { + AttrNumber curr_att; + AttrNumber nattrs = pg_class_tuple->relnatts; + + for (curr_att = 1; curr_att <= nattrs; curr_att++) + { + HeapTuple attTuple; + Datum attaclDatum; + + attTuple = SearchSysCache2(ATTNUM, + ObjectIdGetDatum(objoid), + Int16GetDatum(curr_att)); + + if (!HeapTupleIsValid(attTuple)) + continue; + + /* ignore dropped columns */ + if (((Form_pg_attribute) GETSTRUCT(attTuple))->attisdropped) + { + ReleaseSysCache(attTuple); + continue; + } + + attaclDatum = SysCacheGetAttr(ATTNUM, attTuple, + Anum_pg_attribute_attacl, + &isNull); + + /* no need to do anything for a NULL ACL */ + if (isNull) + { + ReleaseSysCache(attTuple); + continue; + } + + recordExtensionInitPrivWorker(objoid, classoid, curr_att, + DatumGetAclP(attaclDatum)); + + ReleaseSysCache(attTuple); + } + } + + aclDatum = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relacl, + &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + /* pg_foreign_data_wrapper */ + else if (classoid == ForeignDataWrapperRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(FOREIGNDATAWRAPPEROID, + ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for foreign data wrapper %u", + objoid); + + aclDatum = SysCacheGetAttr(FOREIGNDATAWRAPPEROID, tuple, + Anum_pg_foreign_data_wrapper_fdwacl, + &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + /* pg_foreign_server */ + else if (classoid == ForeignServerRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(FOREIGNSERVEROID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for foreign data wrapper %u", + objoid); + + aclDatum = SysCacheGetAttr(FOREIGNSERVEROID, tuple, + Anum_pg_foreign_server_srvacl, + &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + /* pg_language */ + else if (classoid == LanguageRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(LANGOID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for language %u", objoid); + + aclDatum = SysCacheGetAttr(LANGOID, tuple, Anum_pg_language_lanacl, + &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + 
DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + /* pg_largeobject_metadata */ + else if (classoid == LargeObjectMetadataRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + ScanKeyData entry[1]; + SysScanDesc scan; + Relation relation; + + relation = heap_open(LargeObjectMetadataRelationId, RowExclusiveLock); + + /* There's no syscache for pg_largeobject_metadata */ + ScanKeyInit(&entry[0], + ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(objoid)); + + scan = systable_beginscan(relation, + LargeObjectMetadataOidIndexId, true, + NULL, 1, entry); + + tuple = systable_getnext(scan); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for large object %u", objoid); + + aclDatum = heap_getattr(tuple, + Anum_pg_largeobject_metadata_lomacl, + RelationGetDescr(relation), &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + systable_endscan(scan); + } + /* pg_namespace */ + else if (classoid == NamespaceRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(NAMESPACEOID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for function %u", objoid); + + aclDatum = SysCacheGetAttr(NAMESPACEOID, tuple, + Anum_pg_namespace_nspacl, &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + /* pg_proc */ + else if (classoid == ProcedureRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(PROCOID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for function %u", objoid); + + aclDatum = SysCacheGetAttr(PROCOID, tuple, Anum_pg_proc_proacl, + &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + /* pg_type */ + else if (classoid == TypeRelationId) + { + Datum aclDatum; + bool isNull; + HeapTuple tuple; + + tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for function %u", objoid); + + aclDatum = SysCacheGetAttr(TYPEOID, tuple, Anum_pg_type_typacl, + &isNull); + + /* Add the record, if any, for the top-level object */ + if (!isNull) + recordExtensionInitPrivWorker(objoid, classoid, 0, + DatumGetAclP(aclDatum)); + + ReleaseSysCache(tuple); + } + else if (classoid == AccessMethodRelationId || + classoid == AggregateRelationId || + classoid == CastRelationId || + classoid == CollationRelationId || + classoid == ConversionRelationId || + classoid == EventTriggerRelationId || + classoid == OperatorRelationId || + classoid == OperatorClassRelationId || + classoid == OperatorFamilyRelationId || + classoid == NamespaceRelationId || + classoid == TSConfigRelationId || + classoid == TSDictionaryRelationId || + classoid == TSParserRelationId || + classoid == TSTemplateRelationId || + classoid == TransformRelationId + ) + { + /* no ACL for these object types, so do nothing. */ + } + + /* + * complain if we are given a class OID for a class that extensions don't + * support or that we don't recognize. 
+ */ + else + { + elog(ERROR, "unrecognized or unsupported class OID: %u", classoid); + } +} + +/* + * For the object passed in, remove its ACL and the ACLs of any object subIds + * from pg_init_privs (via recordExtensionInitPrivWorker()). + */ +void +removeExtObjInitPriv(Oid objoid, Oid classoid) +{ + /* + * If this is a relation then we need to see if there are any sub-objects + * (eg: columns) for it and, if so, be sure to call + * recordExtensionInitPrivWorker() for each one. + */ + if (classoid == RelationRelationId) + { + Form_pg_class pg_class_tuple; + HeapTuple tuple; + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(objoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", objoid); + pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); + + /* Indexes don't have permissions */ + if (pg_class_tuple->relkind == RELKIND_INDEX) + return; + + /* Composite types don't have permissions either */ + if (pg_class_tuple->relkind == RELKIND_COMPOSITE_TYPE) + return; + + /* + * If this isn't a sequence, index, or composite type then it's + * possibly going to have columns associated with it that might have + * ACLs. + */ + if (pg_class_tuple->relkind != RELKIND_SEQUENCE) + { + AttrNumber curr_att; + AttrNumber nattrs = pg_class_tuple->relnatts; + + for (curr_att = 1; curr_att <= nattrs; curr_att++) + { + HeapTuple attTuple; + + attTuple = SearchSysCache2(ATTNUM, + ObjectIdGetDatum(objoid), + Int16GetDatum(curr_att)); + + if (!HeapTupleIsValid(attTuple)) + continue; + + /* when removing, remove all entries, even dropped columns */ + + recordExtensionInitPrivWorker(objoid, classoid, curr_att, NULL); + + ReleaseSysCache(attTuple); + } + } + + ReleaseSysCache(tuple); + } + + /* Remove the record, if any, for the top-level object */ + recordExtensionInitPrivWorker(objoid, classoid, 0, NULL); +} + +/* + * Record initial ACL for an extension object * * Can be called at any time; we check if 'creating_extension' is set and, if * not, exit immediately. @@ -5244,12 +5695,6 @@ get_user_default_acl(GrantObjectType objtype, Oid ownerId, Oid nsp_oid) static void recordExtensionInitPriv(Oid objoid, Oid classoid, int objsubid, Acl *new_acl) { - Relation relation; - ScanKeyData key[3]; - SysScanDesc scan; - HeapTuple tuple; - HeapTuple oldtuple; - /* * Generally, we only record the initial privileges when an extension is * being created, but because we don't actually use CREATE EXTENSION @@ -5261,6 +5706,30 @@ recordExtensionInitPriv(Oid objoid, Oid classoid, int objsubid, Acl *new_acl) if (!creating_extension && !binary_upgrade_record_init_privs) return; + recordExtensionInitPrivWorker(objoid, classoid, objsubid, new_acl); +} + +/* + * Record initial ACL for an extension object, worker. + * + * This will perform a wholesale replacement of the entire ACL for the object + * passed in, therefore be sure to pass in the complete new ACL to use. + * + * Generally speaking, do *not* use this function directly but instead use + * recordExtensionInitPriv(), which checks if 'creating_extension' is set. + * This function does *not* check if 'creating_extension' is set as it is also + * used when an object is added to or removed from an extension via ALTER + * EXTENSION ... ADD/DROP.
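+ * For reference (editor's sketch, not part of the patch): the
+ * CatalogTuple* helpers used below replace the older two-step idiom;
+ * CatalogTupleUpdate(rel, tid, tup) behaves roughly like
+ *
+ *		simple_heap_update(rel, tid, tup);
+ *		CatalogUpdateIndexes(rel, tup);
+ *
+ * while CatalogTupleInsert() and CatalogTupleDelete() wrap the
+ * corresponding insert and delete calls, keeping all catalog DML behind
+ * one API so callers cannot forget the index maintenance.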
+ */ +static void +recordExtensionInitPrivWorker(Oid objoid, Oid classoid, int objsubid, Acl *new_acl) +{ + Relation relation; + ScanKeyData key[3]; + SysScanDesc scan; + HeapTuple tuple; + HeapTuple oldtuple; + relation = heap_open(InitPrivsRelationId, RowExclusiveLock); ScanKeyInit(&key[0], @@ -5302,39 +5771,44 @@ recordExtensionInitPriv(Oid objoid, Oid classoid, int objsubid, Acl *new_acl) oldtuple = heap_modify_tuple(oldtuple, RelationGetDescr(relation), values, nulls, replace); - simple_heap_update(relation, &oldtuple->t_self, oldtuple); - - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, oldtuple); + CatalogTupleUpdate(relation, &oldtuple->t_self, oldtuple); } else + { /* new_acl is NULL, so delete the entry we found. */ - simple_heap_delete(relation, &oldtuple->t_self); + CatalogTupleDelete(relation, &oldtuple->t_self); + } } else { - /* No entry found, so add it. */ Datum values[Natts_pg_init_privs]; bool nulls[Natts_pg_init_privs]; - MemSet(nulls, false, sizeof(nulls)); - - values[Anum_pg_init_privs_objoid - 1] = ObjectIdGetDatum(objoid); - values[Anum_pg_init_privs_classoid - 1] = ObjectIdGetDatum(classoid); - values[Anum_pg_init_privs_objsubid - 1] = Int32GetDatum(objsubid); + /* + * Only add a new entry if the new ACL is non-NULL. + * + * If we are passed in a NULL ACL and no entry exists, we can just + * fall through and do nothing. + */ + if (new_acl) + { + /* No entry found, so add it. */ + MemSet(nulls, false, sizeof(nulls)); - /* This function only handles initial privileges of extensions */ - values[Anum_pg_init_privs_privtype - 1] = - CharGetDatum(INITPRIVS_EXTENSION); + values[Anum_pg_init_privs_objoid - 1] = ObjectIdGetDatum(objoid); + values[Anum_pg_init_privs_classoid - 1] = ObjectIdGetDatum(classoid); + values[Anum_pg_init_privs_objsubid - 1] = Int32GetDatum(objsubid); - values[Anum_pg_init_privs_privs - 1] = PointerGetDatum(new_acl); + /* This function only handles initial privileges of extensions */ + values[Anum_pg_init_privs_privtype - 1] = + CharGetDatum(INITPRIVS_EXTENSION); - tuple = heap_form_tuple(RelationGetDescr(relation), values, nulls); + values[Anum_pg_init_privs_privs - 1] = PointerGetDatum(new_acl); - simple_heap_insert(relation, tuple); + tuple = heap_form_tuple(RelationGetDescr(relation), values, nulls); - /* keep the catalog indexes up to date */ - CatalogUpdateIndexes(relation, tuple); + CatalogTupleInsert(relation, tuple); + } } systable_endscan(scan); diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 9bb937aa4c..2e8cd10ebb 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -6,7 +6,7 @@ * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -37,6 +37,7 @@ #include "catalog/pg_shdepend.h" #include "catalog/pg_shdescription.h" #include "catalog/pg_shseclabel.h" +#include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/toasting.h" #include "catalog/pgxc_node.h" @@ -237,7 +238,8 @@ IsSharedRelation(Oid relationId) relationId == PgxcNodeRelationId || #endif relationId == DbRoleSettingRelationId || - relationId == ReplicationOriginRelationId) + relationId == ReplicationOriginRelationId || + relationId == SubscriptionRelationId) return true; /* These are their indexes (see indexing.h) */ if (relationId == AuthIdRolnameIndexId || @@ -262,7 +264,9 @@ IsSharedRelation(Oid relationId) #endif relationId == DbRoleSettingDatidRolidIndexId || relationId == ReplicationOriginIdentIndex || - relationId == ReplicationOriginNameIndex) + relationId == ReplicationOriginNameIndex || + relationId == SubscriptionObjectIndexId || + relationId == SubscriptionNameIndexId) return true; /* These are their toast tables and toast indexes (see toasting.h) */ if (relationId == PgShdescriptionToastTable || diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 467d9ead0e..f8e560a8d4 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -5,7 +5,7 @@ * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -50,7 +50,11 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_policy.h" #include "catalog/pg_proc.h" +#include "catalog/pg_publication.h" +#include "catalog/pg_publication_rel.h" #include "catalog/pg_rewrite.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_trigger.h" @@ -77,8 +81,10 @@ #include "commands/extension.h" #include "commands/policy.h" #include "commands/proclang.h" +#include "commands/publicationcmds.h" #include "commands/schemacmds.h" #include "commands/seclabel.h" +#include "commands/sequence.h" #include "commands/trigger.h" #include "commands/typecmds.h" #include "nodes/nodeFuncs.h" @@ -162,6 +168,7 @@ static const Oid object_classes[] = { RewriteRelationId, /* OCLASS_REWRITE */ TriggerRelationId, /* OCLASS_TRIGGER */ NamespaceRelationId, /* OCLASS_SCHEMA */ + StatisticExtRelationId, /* OCLASS_STATISTIC_EXT */ TSParserRelationId, /* OCLASS_TSPARSER */ TSDictionaryRelationId, /* OCLASS_TSDICT */ TSTemplateRelationId, /* OCLASS_TSTEMPLATE */ @@ -181,10 +188,14 @@ static const Oid object_classes[] = { #endif EventTriggerRelationId, /* OCLASS_EVENT_TRIGGER */ PolicyRelationId, /* OCLASS_POLICY */ + PublicationRelationId, /* OCLASS_PUBLICATION */ + PublicationRelRelationId, /* OCLASS_PUBLICATION_REL */ + SubscriptionRelationId, /* OCLASS_SUBSCRIPTION */ TransformRelationId /* OCLASS_TRANSFORM */ }; static void findDependentObjects(const ObjectAddress *object, + int objflags, int flags, ObjectAddressStack *stack, ObjectAddresses *targetObjects, @@ -192,7 +203,7 @@ static void findDependentObjects(const ObjectAddress 
*object, Relation *depRel); static void reportDependentObjects(const ObjectAddresses *targetObjects, DropBehavior behavior, - int msglevel, + int flags, const ObjectAddress *origObject); static void deleteOneObject(const ObjectAddress *object, Relation *depRel, int32 flags); @@ -254,11 +265,17 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, } /* - * Delete all the objects in the proper order. + * Delete all the objects in the proper order, except that if told to, we + * should skip the original object(s). */ for (i = 0; i < targetObjects->numrefs; i++) { ObjectAddress *thisobj = targetObjects->refs + i; + ObjectAddressExtra *thisextra = targetObjects->extras + i; + + if ((flags & PERFORM_DELETION_SKIP_ORIGINAL) && + (thisextra->flags & DEPFLAG_ORIGINAL)) + continue; deleteOneObject(thisobj, depRel, flags); } @@ -272,16 +289,32 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, * according to the dependency type. * * This is the outer control routine for all forms of DROP that drop objects - * that can participate in dependencies. Note that the next two routines - * are variants on the same theme; if you change anything here you'll likely - * need to fix them too. + * that can participate in dependencies. Note that performMultipleDeletions + * is a variant on the same theme; if you change anything here you'll likely + * need to fix that too. + * + * Bits in the flags argument can include: + * + * PERFORM_DELETION_INTERNAL: indicates that the drop operation is not the + * direct result of a user-initiated action. For example, when a temporary + * schema is cleaned out so that a new backend can use it, or when a column + * default is dropped as an intermediate step while adding a new one, that's + * an internal operation. On the other hand, when we drop something because + * the user issued a DROP statement against it, that's not internal. Currently + * this suppresses calling event triggers and making some permissions checks. + * + * PERFORM_DELETION_CONCURRENTLY: perform the drop concurrently. This does + * not currently work for anything except dropping indexes; don't set it for + * other object types or you may get strange results. + * + * PERFORM_DELETION_QUIETLY: reduce message level from NOTICE to DEBUG2. * - * flags should include PERFORM_DELETION_INTERNAL when the drop operation is - * not the direct result of a user-initiated action. For example, when a - * temporary schema is cleaned out so that a new backend can use it, or when - * a column default is dropped as an intermediate step while adding a new one, - * that's an internal operation. On the other hand, when we drop something - * because the user issued a DROP statement against it, that's not internal. + * PERFORM_DELETION_SKIP_ORIGINAL: do not delete the specified object(s), + * but only what depends on it/them. + * + * PERFORM_DELETION_SKIP_EXTENSIONS: do not delete extensions, even when + * deleting objects that are part of an extension. This should generally + * be used only when dropping temporary objects. 
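+ *
+ * For illustration (editor's sketch, not part of the patch): to drop
+ * everything that depends on a relation while keeping the relation itself
+ * (the job the removed deleteWhatDependsOn() used to do), a caller can now
+ * write
+ *
+ *		ObjectAddress obj;
+ *
+ *		obj.classId = RelationRelationId;
+ *		obj.objectId = rel_oid;		(a hypothetical OID variable)
+ *		obj.objectSubId = 0;
+ *		performDeletion(&obj, DROP_CASCADE,
+ *						PERFORM_DELETION_INTERNAL |
+ *						PERFORM_DELETION_QUIETLY |
+ *						PERFORM_DELETION_SKIP_ORIGINAL);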
*/ void performDeletion(const ObjectAddress *object, @@ -310,6 +343,7 @@ performDeletion(const ObjectAddress *object, findDependentObjects(object, DEPFLAG_ORIGINAL, + flags, NULL, /* empty stack */ targetObjects, NULL, /* no pendingObjects */ @@ -320,7 +354,7 @@ performDeletion(const ObjectAddress *object, */ reportDependentObjects(targetObjects, behavior, - NOTICE, + flags, object); /* do the deed */ @@ -381,6 +415,7 @@ performMultipleDeletions(const ObjectAddresses *objects, findDependentObjects(thisobj, DEPFLAG_ORIGINAL, + flags, NULL, /* empty stack */ targetObjects, objects, @@ -395,7 +430,7 @@ performMultipleDeletions(const ObjectAddresses *objects, */ reportDependentObjects(targetObjects, behavior, - NOTICE, + flags, (objects->numrefs == 1 ? objects->refs : NULL)); /* do the deed */ @@ -474,6 +509,8 @@ performRename(const ObjectAddress *object, const char *oldname, const char *newn findDependentObjects(object, DEPFLAG_ORIGINAL, + 0, /* XXX seems like flags are only used while + dropping objects */ NULL, /* empty stack */ targetObjects, NULL, @@ -494,88 +531,6 @@ performRename(const ObjectAddress *object, const char *oldname, const char *newn #endif /* - * deleteWhatDependsOn: attempt to drop everything that depends on the - * specified object, though not the object itself. Behavior is always - * CASCADE. - * - * This is currently used only to clean out the contents of a schema - * (namespace): the passed object is a namespace. We normally want this - * to be done silently, so there's an option to suppress NOTICE messages. - * - * Note we don't fire object drop event triggers here; it would be wrong to do - * so for the current only use of this function, but if more callers are added - * this might need to be reconsidered. - */ -void -deleteWhatDependsOn(const ObjectAddress *object, - bool showNotices) -{ - Relation depRel; - ObjectAddresses *targetObjects; - int i; - - /* - * We save some cycles by opening pg_depend just once and passing the - * Relation pointer down to all the recursive deletion steps. - */ - depRel = heap_open(DependRelationId, RowExclusiveLock); - - /* - * Acquire deletion lock on the target object. (Ideally the caller has - * done this already, but many places are sloppy about it.) - */ - AcquireDeletionLock(object, 0); - - /* - * Construct a list of objects to delete (ie, the given object plus - * everything directly or indirectly dependent on it). - */ - targetObjects = new_object_addresses(); - - findDependentObjects(object, - DEPFLAG_ORIGINAL, - NULL, /* empty stack */ - targetObjects, - NULL, /* no pendingObjects */ - &depRel); - - /* - * Check if deletion is allowed, and report about cascaded deletes. - */ - reportDependentObjects(targetObjects, - DROP_CASCADE, - showNotices ? NOTICE : DEBUG2, - object); - - /* - * Delete all the objects in the proper order, except we skip the original - * object. - */ - for (i = 0; i < targetObjects->numrefs; i++) - { - ObjectAddress *thisobj = targetObjects->refs + i; - ObjectAddressExtra *thisextra = targetObjects->extras + i; - - if (thisextra->flags & DEPFLAG_ORIGINAL) - continue; - - /* - * Since this function is currently only used to clean out temporary - * schemas, we pass PERFORM_DELETION_INTERNAL here, indicating that - * the operation is an automatic system operation rather than a user - * action. If, in the future, this function is used for other - * purposes, we might need to revisit this. 
- */ - deleteOneObject(thisobj, &depRel, PERFORM_DELETION_INTERNAL); - } - - /* And clean up */ - free_object_addresses(targetObjects); - - heap_close(depRel, RowExclusiveLock); -} - -/* * findDependentObjects - find all objects that depend on 'object' * * For every object that depends on the starting object, acquire a deletion @@ -595,16 +550,22 @@ deleteWhatDependsOn(const ObjectAddress *object, * its sub-objects too. * * object: the object to add to targetObjects and find dependencies on - * flags: flags to be ORed into the object's targetObjects entry + * objflags: flags to be ORed into the object's targetObjects entry + * flags: PERFORM_DELETION_xxx flags for the deletion operation as a whole * stack: list of objects being visited in current recursion; topmost item * is the object that we recursed from (NULL for external callers) * targetObjects: list of objects that are scheduled to be deleted * pendingObjects: list of other objects slated for destruction, but * not necessarily in targetObjects yet (can be NULL if none) * *depRel: already opened pg_depend relation + * + * Note: objflags describes the reason for visiting this particular object + * at this time, and is not passed down when recursing. The flags argument + * is passed down, since it describes what we're doing overall. */ static void findDependentObjects(const ObjectAddress *object, + int objflags, int flags, ObjectAddressStack *stack, ObjectAddresses *targetObjects, @@ -621,8 +582,8 @@ findDependentObjects(const ObjectAddress *object, /* * If the target object is already being visited in an outer recursion - * level, just report the current flags back to that level and exit. This - * is needed to avoid infinite recursion in the face of circular + * level, just report the current objflags back to that level and exit. + * This is needed to avoid infinite recursion in the face of circular * dependencies. * * The stack check alone would result in dependency loops being broken at @@ -635,19 +596,19 @@ findDependentObjects(const ObjectAddress *object, * auto dependency, too, if we had to. However there are no known cases * where that would be necessary. */ - if (stack_address_present_add_flags(object, flags, stack)) + if (stack_address_present_add_flags(object, objflags, stack)) return; /* * It's also possible that the target object has already been completely * processed and put into targetObjects. If so, again we just add the - * specified flags to its entry and return. + * specified objflags to its entry and return. * * (Note: in these early-exit cases we could release the caller-taken * lock, since the object is presumably now locked multiple times; but it * seems not worth the cycles.) */ - if (object_address_present_add_flags(object, flags, targetObjects)) + if (object_address_present_add_flags(object, objflags, targetObjects)) return; /* @@ -697,29 +658,52 @@ findDependentObjects(const ObjectAddress *object, case DEPENDENCY_AUTO_EXTENSION: /* no problem */ break; - case DEPENDENCY_INTERNAL: + case DEPENDENCY_EXTENSION: /* + * If told to, ignore EXTENSION dependencies altogether. This + * flag is normally used to prevent dropping extensions during + * temporary-object cleanup, even if a temp object was created + * during an extension script. + */ + if (flags & PERFORM_DELETION_SKIP_EXTENSIONS) + break; + + /* + * If the other object is the extension currently being + * created/altered, ignore this dependency and continue with + * the deletion. 
This allows dropping of an extension's + * objects within the extension's scripts, as well as corner + * cases such as dropping a transient object created within + * such a script. + */ + if (creating_extension && + otherObject.classId == ExtensionRelationId && + otherObject.objectId == CurrentExtensionObject) + break; + + /* Otherwise, treat this like an internal dependency */ + /* FALL THRU */ + + case DEPENDENCY_INTERNAL: + + /* * This object is part of the internal implementation of * another object, or is part of the extension that is the * other object. We have three cases: * - * 1. At the outermost recursion level, we normally disallow - * the DROP. (We just ereport here, rather than proceeding, - * since no other dependencies are likely to be interesting.) - * However, there are exceptions. + * 1. At the outermost recursion level, disallow the DROP. (We + * just ereport here, rather than proceeding, since no other + * dependencies are likely to be interesting.) However, if + * the owning object is listed in pendingObjects, just release + * the caller's lock and return; we'll eventually complete the + * DROP when we reach that entry in the pending list. */ if (stack == NULL) { char *otherObjDesc; - /* - * Exception 1a: if the owning object is listed in - * pendingObjects, just release the caller's lock and - * return. We'll eventually complete the DROP when we - * reach that entry in the pending list. - */ if (pendingObjects && object_address_present(&otherObject, pendingObjects)) { @@ -728,21 +712,6 @@ findDependentObjects(const ObjectAddress *object, ReleaseDeletionLock(object); return; } - - /* - * Exception 1b: if the owning object is the extension - * currently being created/altered, it's okay to continue - * with the deletion. This allows dropping of an - * extension's objects within the extension's scripts, as - * well as corner cases such as dropping a transient - * object created within such a script. - */ - if (creating_extension && - otherObject.classId == ExtensionRelationId && - otherObject.objectId == CurrentExtensionObject) - break; - - /* No exception applies, so throw the error */ otherObjDesc = getObjectDescription(&otherObject); ereport(ERROR, (errcode(ERRCODE_DEPENDENT_OBJECTS_STILL_EXIST), @@ -803,6 +772,7 @@ findDependentObjects(const ObjectAddress *object, */ findDependentObjects(&otherObject, DEPFLAG_REVERSE, + flags, stack, targetObjects, pendingObjects, @@ -833,7 +803,7 @@ findDependentObjects(const ObjectAddress *object, * they have to be deleted before the current object. 
*/ mystack.object = object; /* set up a new stack level */ - mystack.flags = flags; + mystack.flags = objflags; mystack.next = stack; ScanKeyInit(&key[0], @@ -887,7 +857,7 @@ findDependentObjects(const ObjectAddress *object, continue; } - /* Recurse, passing flags indicating the dependency type */ + /* Recurse, passing objflags indicating the dependency type */ switch (foundDep->deptype) { case DEPENDENCY_NORMAL: @@ -924,6 +894,7 @@ findDependentObjects(const ObjectAddress *object, findDependentObjects(&otherObject, subflags, + flags, &mystack, targetObjects, pendingObjects, @@ -954,16 +925,17 @@ findDependentObjects(const ObjectAddress *object, * * targetObjects: list of objects that are scheduled to be deleted * behavior: RESTRICT or CASCADE - * msglevel: elog level for non-error report messages + * flags: other flags for the deletion operation * origObject: base object of deletion, or NULL if not available * (the latter case occurs in DROP OWNED) */ static void reportDependentObjects(const ObjectAddresses *targetObjects, DropBehavior behavior, - int msglevel, + int flags, const ObjectAddress *origObject) { + int msglevel = (flags & PERFORM_DELETION_QUIETLY) ? DEBUG2 : NOTICE; bool ok = true; StringInfoData clientdetail; StringInfoData logdetail; @@ -1197,7 +1169,7 @@ deleteOneObject(const ObjectAddress *object, Relation *depRel, int flags) while (HeapTupleIsValid(tup = systable_getnext(scan))) { - simple_heap_delete(*depRel, &tup->t_self); + CatalogTupleDelete(*depRel, &tup->t_self); } systable_endscan(scan); @@ -1244,8 +1216,7 @@ doDeletion(const ObjectAddress *object, int flags) if (relKind == RELKIND_INDEX) { - bool concurrent = ((flags & PERFORM_DELETION_CONCURRENTLY) - == PERFORM_DELETION_CONCURRENTLY); + bool concurrent = ((flags & PERFORM_DELETION_CONCURRENTLY) != 0); Assert(object->objectSubId == 0); index_drop(object->objectId, concurrent); @@ -1259,6 +1230,12 @@ doDeletion(const ObjectAddress *object, int flags) heap_drop_with_catalog(object->objectId); } + /* + * for a sequence, in addition to dropping the heap, also + * delete pg_sequence tuple + */ + if (relKind == RELKIND_SEQUENCE) + DeleteSequenceTuple(object->objectId); #ifdef PGXC /* * Do not do extra process if this session is connected to a remote @@ -1382,6 +1359,10 @@ doDeletion(const ObjectAddress *object, int flags) RemoveSchemaById(object->objectId); break; + case OCLASS_STATISTIC_EXT: + RemoveStatisticsById(object->objectId); + break; + case OCLASS_TSPARSER: RemoveTSParserById(object->objectId); break; @@ -1436,13 +1417,35 @@ doDeletion(const ObjectAddress *object, int flags) RemovePolicyById(object->objectId); break; + case OCLASS_PUBLICATION: + RemovePublicationById(object->objectId); + break; + + case OCLASS_PUBLICATION_REL: + RemovePublicationRelById(object->objectId); + break; + case OCLASS_TRANSFORM: DropTransformById(object->objectId); break; - default: - elog(ERROR, "unrecognized object class: %u", - object->classId); + /* + * These global object types are not supported here. + */ + case OCLASS_ROLE: + case OCLASS_DATABASE: + case OCLASS_TBLSPACE: + case OCLASS_SUBSCRIPTION: + case OCLASS_PGXC_NODE: + case OCLASS_PGXC_GROUP: + elog(ERROR, "global objects cannot be deleted by doDeletion"); + break; + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new OCLASS hasn't been handled above. 
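+ *
+ * (Illustration, editor's sketch: since the switch covers every member of
+ * the ObjectClass enum, compilers with -Wswitch will warn right here when
+ * a new OCLASS_* value is added without a matching case, e.g.
+ *
+ *		case OCLASS_NEW_THING:		(hypothetical future member)
+ *			RemoveNewThingById(object->objectId);
+ *			break;
+ *
+ * which is exactly the reminder that omitting default: buys us.)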
+ */ + } } @@ -1550,7 +1553,8 @@ void recordDependencyOnSingleRelExpr(const ObjectAddress *depender, Node *expr, Oid relId, DependencyType behavior, - DependencyType self_behavior) + DependencyType self_behavior, + bool ignore_self) { find_expr_references_context context; RangeTblEntry rte; @@ -1605,9 +1609,10 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, context.addrs->numrefs = outrefs; /* Record the self-dependencies */ - recordMultipleDependencies(depender, - self_addrs->refs, self_addrs->numrefs, - self_behavior); + if (!ignore_self) + recordMultipleDependencies(depender, + self_addrs->refs, self_addrs->numrefs, + self_behavior); free_object_addresses(self_addrs); } @@ -1788,7 +1793,8 @@ find_expr_references_walker(Node *node, case REGROLEOID: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("constant of the type \"regrole\" cannot be used here"))); + errmsg("constant of the type %s cannot be used here", + "regrole"))); break; } } @@ -2097,6 +2103,13 @@ find_expr_references_walker(Node *node, context->addrs); /* fall through to examine arguments */ } + else if (IsA(node, NextValueExpr)) + { + NextValueExpr *nve = (NextValueExpr *) node; + + add_object_address(OCLASS_CLASS, nve->seqid, 0, + context->addrs); + } return expression_tree_walker(node, find_expr_references_walker, (void *) context); @@ -2551,6 +2564,9 @@ getObjectClass(const ObjectAddress *object) case NamespaceRelationId: return OCLASS_SCHEMA; + case StatisticExtRelationId: + return OCLASS_STATISTIC_EXT; + case TSParserRelationId: return OCLASS_TSPARSER; @@ -2598,6 +2614,15 @@ getObjectClass(const ObjectAddress *object) case PolicyRelationId: return OCLASS_POLICY; + case PublicationRelationId: + return OCLASS_PUBLICATION; + + case PublicationRelRelationId: + return OCLASS_PUBLICATION_REL; + + case SubscriptionRelationId: + return OCLASS_SUBSCRIPTION; + case TransformRelationId: return OCLASS_TRANSFORM; } @@ -2637,7 +2662,7 @@ DeleteInitPrivs(const ObjectAddress *object) NULL, 3, key); while (HeapTupleIsValid(oldtuple = systable_getnext(scan))) - simple_heap_delete(relation, &oldtuple->t_self); + CatalogTupleDelete(relation, &oldtuple->t_self); systable_endscan(scan); diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index e15c2e95d6..12acedd2e0 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -7,7 +7,7 @@ # header files. The .bki files are used to initialize the postgres # template database. # -# Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group +# Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California # # src/backend/catalog/genbki.pl @@ -43,8 +43,8 @@ while (@ARGV) elsif ($arg =~ /^--set-version=(.*)$/) { $major_version = $1; - die "Version must be in format nn.nn.\n" - if !($major_version =~ /^\d+\.\d+$/); + die "Invalid version string.\n" + if !($major_version =~ /^\d+$/); } else { @@ -66,16 +66,16 @@ if ($output_path ne '' && substr($output_path, -1) ne '/') # Open temp files my $tmpext = ".tmp$$"; my $bkifile = $output_path . 'postgres.bki'; -open BKI, '>', $bkifile . $tmpext +open my $bki, '>', $bkifile . $tmpext or die "can't open $bkifile$tmpext: $!"; my $schemafile = $output_path . 'schemapg.h'; -open SCHEMAPG, '>', $schemafile . $tmpext +open my $schemapg, '>', $schemafile . $tmpext or die "can't open $schemafile$tmpext: $!"; my $descrfile = $output_path . 'postgres.description'; -open DESCR, '>', $descrfile . 
$tmpext +open my $descr, '>', $descrfile . $tmpext or die "can't open $descrfile$tmpext: $!"; my $shdescrfile = $output_path . 'postgres.shdescription'; -open SHDESCR, '>', $shdescrfile . $tmpext +open my $shdescr, '>', $shdescrfile . $tmpext or die "can't open $shdescrfile$tmpext: $!"; # Fetch some special data that we will substitute into the output file. @@ -97,11 +97,12 @@ my $catalogs = Catalog::Catalogs(@input_files); # Generate postgres.bki, postgres.description, and postgres.shdescription # version marker for .bki file -print BKI "# PostgreSQL $major_version\n"; +print $bki "# PostgreSQL $major_version\n"; # vars to hold data needed for schemapg.h my %schemapg_entries; my @tables_needing_macros; +my %regprocoids; our @types; # produce output, one catalog at a time @@ -110,7 +111,7 @@ foreach my $catname (@{ $catalogs->{names} }) # .bki CREATE command for this catalog my $catalog = $catalogs->{$catname}; - print BKI "create $catname $catalog->{relation_oid}" + print $bki "create $catname $catalog->{relation_oid}" . $catalog->{shared_relation} . $catalog->{bootstrap} . $catalog->{without_oids} @@ -120,7 +121,7 @@ foreach my $catname (@{ $catalogs->{names} }) my @attnames; my $first = 1; - print BKI " (\n"; + print $bki " (\n"; foreach my $column (@{ $catalog->{columns} }) { my $attname = $column->{name}; @@ -130,27 +131,27 @@ foreach my $catname (@{ $catalogs->{names} }) if (!$first) { - print BKI " ,\n"; + print $bki " ,\n"; } $first = 0; - print BKI " $attname = $atttype"; + print $bki " $attname = $atttype"; if (defined $column->{forcenotnull}) { - print BKI " FORCE NOT NULL"; + print $bki " FORCE NOT NULL"; } elsif (defined $column->{forcenull}) { - print BKI " FORCE NULL"; + print $bki " FORCE NULL"; } } - print BKI "\n )\n"; + print $bki "\n )\n"; # open it, unless bootstrap case (create bootstrap does this automatically) if ($catalog->{bootstrap} eq '') { - print BKI "open $catname\n"; + print $bki "open $catname\n"; } if (defined $catalog->{data}) @@ -160,32 +161,67 @@ foreach my $catname (@{ $catalogs->{names} }) foreach my $row (@{ $catalog->{data} }) { - # substitute constant values we acquired above - $row->{bki_values} =~ s/\bPGUID\b/$BOOTSTRAP_SUPERUSERID/g; - $row->{bki_values} =~ s/\bPGNSP\b/$PG_CATALOG_NAMESPACE/g; + # Split line into tokens without interpreting their meaning. + my %bki_values; + @bki_values{@attnames} = + Catalog::SplitDataLine($row->{bki_values}); + + # Perform required substitutions on fields + foreach my $att (keys %bki_values) + { + + # Substitute constant values we acquired above. + # (It's intentional that this can apply to parts of a field). + $bki_values{$att} =~ s/\bPGUID\b/$BOOTSTRAP_SUPERUSERID/g; + $bki_values{$att} =~ s/\bPGNSP\b/$PG_CATALOG_NAMESPACE/g; + + # Replace regproc columns' values with OIDs. + # If we don't have a unique value to substitute, + # just do nothing (regprocin will complain). + if ($bki_attr{$att}->{type} eq 'regproc') + { + my $procoid = $regprocoids{ $bki_values{$att} }; + $bki_values{$att} = $procoid + if defined($procoid) && $procoid ne 'MULTIPLE'; + } + } + + # Save pg_proc oids for use in later regproc substitutions. + # This relies on the order we process the files in! 
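+ # Illustrative example (the data here is hypothetical): once the
+ # pg_proc row for proname 'boolin' has been seen, $regprocoids{'boolin'}
+ # holds its OID, and any later regproc field written as 'boolin' is
+ # emitted as that number; a proname defined more than once maps to
+ # 'MULTIPLE' and the field is passed through for regprocin to handle.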
+ if ($catname eq 'pg_proc') + { + if (defined($regprocoids{ $bki_values{proname} })) + { + $regprocoids{ $bki_values{proname} } = 'MULTIPLE'; + } + else + { + $regprocoids{ $bki_values{proname} } = $row->{oid}; + } + } # Save pg_type info for pg_attribute processing below if ($catname eq 'pg_type') { - my %type; + my %type = %bki_values; $type{oid} = $row->{oid}; - @type{@attnames} = split /\s+/, $row->{bki_values}; push @types, \%type; } # Write to postgres.bki my $oid = $row->{oid} ? "OID = $row->{oid} " : ''; - printf BKI "insert %s( %s)\n", $oid, $row->{bki_values}; + printf $bki "insert %s( %s )\n", $oid, + join(' ', @bki_values{@attnames}); # Write comments to postgres.description and postgres.shdescription if (defined $row->{descr}) { - printf DESCR "%s\t%s\t0\t%s\n", $row->{oid}, $catname, + printf $descr "%s\t%s\t0\t%s\n", $row->{oid}, $catname, $row->{descr}; } if (defined $row->{shdescr}) { - printf SHDESCR "%s\t%s\t%s\n", $row->{oid}, $catname, + printf $shdescr "%s\t%s\t%s\n", $row->{oid}, $catname, $row->{shdescr}; } } @@ -271,7 +307,7 @@ foreach my $catname (@{ $catalogs->{names} }) } } - print BKI "close $catname\n"; + print $bki "close $catname\n"; } # Any information needed for the BKI that is not contained in a pg_*.h header @@ -280,25 +316,25 @@ foreach my $catname (@{ $catalogs->{names} }) # Write out declare toast/index statements foreach my $declaration (@{ $catalogs->{toasting}->{data} }) { - print BKI $declaration; + print $bki $declaration; } foreach my $declaration (@{ $catalogs->{indexing}->{data} }) { - print BKI $declaration; + print $bki $declaration; } # Now generate schemapg.h # Opening boilerplate for schemapg.h -print SCHEMAPG <<EOM; +print $schemapg <<EOM; /*------------------------------------------------------------------------- * * schemapg.h * Schema_pg_xxx macros for use by relcache.c * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * NOTES @@ -317,19 +353,19 @@ EOM # Emit schemapg declarations foreach my $table_name (@tables_needing_macros) { - print SCHEMAPG "\n#define Schema_$table_name \\\n"; - print SCHEMAPG join ", \\\n", @{ $schemapg_entries{$table_name} }; - print SCHEMAPG "\n"; + print $schemapg "\n#define Schema_$table_name \\\n"; + print $schemapg join ", \\\n", @{ $schemapg_entries{$table_name} }; + print $schemapg "\n"; } # Closing boilerplate for schemapg.h -print SCHEMAPG "\n#endif /* SCHEMAPG_H */\n"; +print $schemapg "\n#endif /* SCHEMAPG_H */\n"; # We're done emitting data -close BKI; -close SCHEMAPG; -close DESCR; -close SHDESCR; +close $bki; +close $schemapg; +close $descr; +close $shdescr; # Finally, rename the completed files into place. Catalog::RenameTempFile($bkifile, $tmpext); @@ -413,6 +449,7 @@ sub emit_pgattr_row attcacheoff => '-1', atttypmod => '-1', atthasdef => 'f', + attidentity => '', attisdropped => 'f', attislocal => 't', attinhcount => '0', @@ -428,8 +465,9 @@ sub bki_insert my $row = shift; my @attnames = @_; my $oid = $row->{oid} ? "OID = $row->{oid} " : ''; - my $bki_values = join ' ', map $row->{$_}, @attnames; - printf BKI "insert %s( %s)\n", $oid, $bki_values; + my $bki_values = join ' ', map { $_ eq '' ? 
'""' : $_ } map $row->{$_}, + @attnames; + printf $bki "insert %s( %s )\n", $oid, $bki_values; } # The field values of a Schema_pg_xxx declaration are similar, but not @@ -439,10 +477,14 @@ sub emit_schemapg_row my $row = shift; my @bool_attrs = @_; + # Replace empty string by zero char constant + $row->{attidentity} ||= '\0'; + # Supply appropriate quoting for these fields. - $row->{attname} = q|{"| . $row->{attname} . q|"}|; - $row->{attstorage} = q|'| . $row->{attstorage} . q|'|; - $row->{attalign} = q|'| . $row->{attalign} . q|'|; + $row->{attname} = q|{"| . $row->{attname} . q|"}|; + $row->{attstorage} = q|'| . $row->{attstorage} . q|'|; + $row->{attalign} = q|'| . $row->{attalign} . q|'|; + $row->{attidentity} = q|'| . $row->{attidentity} . q|'|; # We don't emit initializers for the variable length fields at all. # Only the fixed-size portions of the descriptors are ever used. @@ -476,15 +518,15 @@ sub find_defined_symbol } my $file = $path . $catalog_header; next if !-f $file; - open(FIND_DEFINED_SYMBOL, '<', $file) || die "$file: $!"; - while (<FIND_DEFINED_SYMBOL>) + open(my $find_defined_symbol, '<', $file) || die "$file: $!"; + while (<$find_defined_symbol>) { if (/^#define\s+\Q$symbol\E\s+(\S+)/) { return $1; } } - close FIND_DEFINED_SYMBOL; + close $find_defined_symbol; die "$file: no definition found for $symbol\n"; } die "$catalog_header: not found in any include directory\n"; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index a1df27d43f..ea3d2ade21 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -3,7 +3,7 @@ * heap.c * code to create and destroy POSTGRES heap relations * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -42,6 +42,7 @@ #include "catalog/heap.h" #include "catalog/index.h" #include "catalog/objectaccess.h" +#include "catalog/partition.h" #include "catalog/pg_attrdef.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" @@ -49,7 +50,10 @@ #include "catalog/pg_foreign_table.h" #include "catalog/pg_inherits.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_partitioned_table.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "catalog/pg_type_fn.h" @@ -64,6 +68,7 @@ #include "parser/parse_collate.h" #include "parser/parse_expr.h" #include "parser/parse_relation.h" +#include "storage/lmgr.h" #include "storage/predicate.h" #include "storage/smgr.h" #include "utils/acl.h" @@ -115,6 +120,7 @@ static void StoreConstraints(Relation rel, List *cooked_constraints, bool is_internal); static bool MergeWithExistingConstraint(Relation rel, char *ccname, Node *expr, bool allow_merge, bool is_local, + bool is_initially_valid, bool is_no_inherit); static void SetRelationNumChecks(Relation rel, int numchecks); static Node *cookConstraint(ParseState *pstate, @@ -148,37 +154,37 @@ static List *insert_ordered_unique_oid(List *list, Oid datum); static FormData_pg_attribute a1 = { 0, {"ctid"}, TIDOID, 0, sizeof(ItemPointerData), SelfItemPointerAttributeNumber, 0, -1, -1, - false, 'p', 's', true, false, false, true, 0 + false, 'p', 's', true, false, '\0', false, true, 0 }; static FormData_pg_attribute a2 = { 0, {"oid"}, OIDOID, 0, sizeof(Oid), 
ObjectIdAttributeNumber, 0, -1, -1, - true, 'p', 'i', true, false, false, true, 0 + true, 'p', 'i', true, false, '\0', false, true, 0 }; static FormData_pg_attribute a3 = { 0, {"xmin"}, XIDOID, 0, sizeof(TransactionId), MinTransactionIdAttributeNumber, 0, -1, -1, - true, 'p', 'i', true, false, false, true, 0 + true, 'p', 'i', true, false, '\0', false, true, 0 }; static FormData_pg_attribute a4 = { 0, {"cmin"}, CIDOID, 0, sizeof(CommandId), MinCommandIdAttributeNumber, 0, -1, -1, - true, 'p', 'i', true, false, false, true, 0 + true, 'p', 'i', true, false, '\0', false, true, 0 }; static FormData_pg_attribute a5 = { 0, {"xmax"}, XIDOID, 0, sizeof(TransactionId), MaxTransactionIdAttributeNumber, 0, -1, -1, - true, 'p', 'i', true, false, false, true, 0 + true, 'p', 'i', true, false, '\0', false, true, 0 }; static FormData_pg_attribute a6 = { 0, {"cmax"}, CIDOID, 0, sizeof(CommandId), MaxCommandIdAttributeNumber, 0, -1, -1, - true, 'p', 'i', true, false, false, true, 0 + true, 'p', 'i', true, false, '\0', false, true, 0 }; /* @@ -190,7 +196,7 @@ static FormData_pg_attribute a6 = { static FormData_pg_attribute a7 = { 0, {"tableoid"}, OIDOID, 0, sizeof(Oid), TableOidAttributeNumber, 0, -1, -1, - true, 'p', 'i', true, false, false, true, 0 + true, 'p', 'i', true, false, '\0', false, true, 0 }; #ifdef PGXC @@ -314,6 +320,7 @@ heap_create(const char *relname, case RELKIND_VIEW: case RELKIND_COMPOSITE_TYPE: case RELKIND_FOREIGN_TABLE: + case RELKIND_PARTITIONED_TABLE: create_storage = false; /* @@ -513,18 +520,7 @@ CheckAttributeType(const char *attname, char att_typtype = get_typtype(atttypid); Oid att_typelem; - if (atttypid == UNKNOWNOID) - { - /* - * Warn user, but don't fail, if column to be created has UNKNOWN type - * (usually as a result of a 'retrieve into' - jolly) - */ - ereport(WARNING, - (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("column \"%s\" has type \"unknown\"", attname), - errdetail("Proceeding with relation creation anyway."))); - } - else if (att_typtype == TYPTYPE_PSEUDO) + if (att_typtype == TYPTYPE_PSEUDO) { /* * Refuse any attempt to create a pseudo-type column, except for a @@ -620,9 +616,10 @@ CheckAttributeType(const char *attname, * attribute to insert (but we ignore attacl and attoptions, which are always * initialized to NULL). * - * indstate is the index state for CatalogIndexInsert. It can be passed as - * NULL, in which case we'll fetch the necessary info. (Don't do this when - * inserting multiple attributes, because it's a tad more expensive.) + * indstate is the index state for CatalogTupleInsertWithInfo. It can be + * passed as NULL, in which case we'll fetch the necessary info. (Don't do + * this when inserting multiple attributes, because it's a tad more + * expensive.) 
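+ *
+ * A typical multi-row caller (an illustrative sketch, not code from this
+ * patch) amortizes the work like this:
+ *		indstate = CatalogOpenIndexes(pg_attribute_rel);
+ *		(one InsertPgAttributeTuple(pg_attribute_rel, attr, indstate) per row)
+ *		CatalogCloseIndexes(indstate);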
*/ void InsertPgAttributeTuple(Relation pg_attribute_rel, @@ -651,6 +648,7 @@ InsertPgAttributeTuple(Relation pg_attribute_rel, values[Anum_pg_attribute_attalign - 1] = CharGetDatum(new_attribute->attalign); values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(new_attribute->attnotnull); values[Anum_pg_attribute_atthasdef - 1] = BoolGetDatum(new_attribute->atthasdef); + values[Anum_pg_attribute_attidentity - 1] = CharGetDatum(new_attribute->attidentity); values[Anum_pg_attribute_attisdropped - 1] = BoolGetDatum(new_attribute->attisdropped); values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(new_attribute->attislocal); values[Anum_pg_attribute_attinhcount - 1] = Int32GetDatum(new_attribute->attinhcount); @@ -664,12 +662,10 @@ InsertPgAttributeTuple(Relation pg_attribute_rel, tup = heap_form_tuple(RelationGetDescr(pg_attribute_rel), values, nulls); /* finally insert the new tuple, update the indexes, and clean up */ - simple_heap_insert(pg_attribute_rel, tup); - if (indstate != NULL) - CatalogIndexInsert(indstate, tup); + CatalogTupleInsertWithInfo(pg_attribute_rel, tup, indstate); else - CatalogUpdateIndexes(pg_attribute_rel, tup); + CatalogTupleInsert(pg_attribute_rel, tup); heap_freetuple(tup); } @@ -834,6 +830,7 @@ InsertPgClassTuple(Relation pg_class_desc, values[Anum_pg_class_relhassubclass - 1] = BoolGetDatum(rd_rel->relhassubclass); values[Anum_pg_class_relispopulated - 1] = BoolGetDatum(rd_rel->relispopulated); values[Anum_pg_class_relreplident - 1] = CharGetDatum(rd_rel->relreplident); + values[Anum_pg_class_relispartition - 1] = BoolGetDatum(rd_rel->relispartition); values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid); values[Anum_pg_class_relminmxid - 1] = MultiXactIdGetDatum(rd_rel->relminmxid); if (relacl != (Datum) 0) @@ -845,6 +842,9 @@ InsertPgClassTuple(Relation pg_class_desc, else nulls[Anum_pg_class_reloptions - 1] = true; + /* relpartbound is set by updating this tuple, if necessary */ + nulls[Anum_pg_class_relpartbound - 1] = true; + tup = heap_form_tuple(RelationGetDescr(pg_class_desc), values, nulls); /* @@ -854,9 +854,7 @@ InsertPgClassTuple(Relation pg_class_desc, HeapTupleSetOid(tup, new_rel_oid); /* finally insert the new tuple, update the indexes, and clean up */ - simple_heap_insert(pg_class_desc, tup); - - CatalogUpdateIndexes(pg_class_desc, tup); + CatalogTupleInsert(pg_class_desc, tup); heap_freetuple(tup); } @@ -950,6 +948,9 @@ AddNewRelationTuple(Relation pg_class_desc, new_rel_reltup->reltype = new_type_oid; new_rel_reltup->reloftype = reloftype; + /* relispartition is always set by updating this tuple later */ + new_rel_reltup->relispartition = false; + new_rel_desc->rd_att->tdtypeid = new_type_oid; /* Now build and insert the tuple */ @@ -1484,7 +1485,8 @@ heap_create_with_catalog(const char *relname, if (IsBinaryUpgrade && (relkind == RELKIND_RELATION || relkind == RELKIND_SEQUENCE || relkind == RELKIND_VIEW || relkind == RELKIND_MATVIEW || - relkind == RELKIND_COMPOSITE_TYPE || relkind == RELKIND_FOREIGN_TABLE)) + relkind == RELKIND_COMPOSITE_TYPE || relkind == RELKIND_FOREIGN_TABLE || + relkind == RELKIND_PARTITIONED_TABLE)) { if (!OidIsValid(binary_upgrade_next_heap_pg_class_oid)) ereport(ERROR, @@ -1518,6 +1520,7 @@ heap_create_with_catalog(const char *relname, case RELKIND_VIEW: case RELKIND_MATVIEW: case RELKIND_FOREIGN_TABLE: + case RELKIND_PARTITIONED_TABLE: relacl = get_user_default_acl(ACL_OBJECT_RELATION, ownerid, relnamespace); break; @@ -1562,7 +1565,8 @@ heap_create_with_catalog(const char *relname, 
relkind == RELKIND_VIEW || relkind == RELKIND_MATVIEW || relkind == RELKIND_FOREIGN_TABLE || - relkind == RELKIND_COMPOSITE_TYPE)) + relkind == RELKIND_COMPOSITE_TYPE || + relkind == RELKIND_PARTITIONED_TABLE)) new_array_oid = AssignTypeArrayOid(); /* @@ -1665,10 +1669,6 @@ heap_create_with_catalog(const char *relname, * should they have any ACL entries. The same applies for extension * dependencies. * - * If it's a temp table, we do not make it an extension member; this - * prevents the unintuitive result that deletion of the temp table at - * session end would make the whole extension go away. - * * Also, skip this in bootstrap mode, since we don't make dependencies * while bootstrapping. */ @@ -1689,8 +1689,7 @@ heap_create_with_catalog(const char *relname, recordDependencyOnOwner(RelationRelationId, relid, ownerid); - if (relpersistence != RELPERSISTENCE_TEMP) - recordDependencyOnCurrentExtension(&myself, false); + recordDependencyOnCurrentExtension(&myself, false); if (reloftypeid) { @@ -1731,12 +1730,13 @@ heap_create_with_catalog(const char *relname, if (oncommit != ONCOMMIT_NOOP) register_on_commit_action(relid, oncommit); - if (relpersistence == RELPERSISTENCE_UNLOGGED) - { - Assert(relkind == RELKIND_RELATION || relkind == RELKIND_MATVIEW || - relkind == RELKIND_TOASTVALUE); + /* + * Unlogged objects need an init fork, except for partitioned tables which + * have no storage at all. + */ + if (relpersistence == RELPERSISTENCE_UNLOGGED && + relkind != RELKIND_PARTITIONED_TABLE) heap_create_init_fork(new_rel_desc); - } /* * ok, the relation has been cataloged, so close our relations and return @@ -1750,18 +1750,22 @@ heap_create_with_catalog(const char *relname, /* * Set up an init fork for an unlogged table so that it can be correctly - * reinitialized on restart. Since we're going to do an immediate sync, we - * only need to xlog this if archiving or streaming is enabled. And the - * immediate sync is required, because otherwise there's no guarantee that - * this will hit the disk before the next checkpoint moves the redo pointer. + * reinitialized on restart. An immediate sync is required even if the + * page has been logged, because the write did not go through + * shared_buffers and therefore a concurrent checkpoint may have moved + * the redo pointer past our xlog record. Recovery may as well remove it + * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE + * record. Therefore, logging is necessary even if wal_level=minimal. 
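+ *
+ * (Spelling out the wal_level=minimal case: without our own WAL record,
+ * replaying, say, an XLOG_DBASE_CREATE record could remove the init fork
+ * and leave nothing behind to re-create it, so the unlogged relation
+ * could not be reinitialized after the crash.)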
*/ void heap_create_init_fork(Relation rel) { + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); RelationOpenSmgr(rel); smgrcreate(rel->rd_smgr, INIT_FORKNUM, false); - if (XLogIsNeeded()) - log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM); + log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM); smgrimmedsync(rel->rd_smgr, INIT_FORKNUM); } @@ -1793,7 +1797,7 @@ RelationRemoveInheritance(Oid relid) NULL, 1, &key); while (HeapTupleIsValid(tuple = systable_getnext(scan))) - simple_heap_delete(catalogRelation, &tuple->t_self); + CatalogTupleDelete(catalogRelation, &tuple->t_self); systable_endscan(scan); heap_close(catalogRelation, RowExclusiveLock); @@ -1821,7 +1825,7 @@ DeleteRelationTuple(Oid relid) elog(ERROR, "cache lookup failed for relation %u", relid); /* delete the relation tuple from pg_class, and finish up */ - simple_heap_delete(pg_class_desc, &tup->t_self); + CatalogTupleDelete(pg_class_desc, &tup->t_self); ReleaseSysCache(tup); @@ -1858,7 +1862,7 @@ DeleteAttributeTuples(Oid relid) /* Delete all the matching tuples */ while ((atttup = systable_getnext(scan)) != NULL) - simple_heap_delete(attrel, &atttup->t_self); + CatalogTupleDelete(attrel, &atttup->t_self); /* Clean up after the scan */ systable_endscan(scan); @@ -1899,7 +1903,7 @@ DeleteSystemAttributeTuples(Oid relid) /* Delete all the matching tuples */ while ((atttup = systable_getnext(scan)) != NULL) - simple_heap_delete(attrel, &atttup->t_self); + CatalogTupleDelete(attrel, &atttup->t_self); /* Clean up after the scan */ systable_endscan(scan); @@ -1946,7 +1950,7 @@ RemoveAttributeById(Oid relid, AttrNumber attnum) { /* System attribute (probably OID) ... just delete the row */ - simple_heap_delete(attr_rel, &tuple->t_self); + CatalogTupleDelete(attr_rel, &tuple->t_self); } else { @@ -1979,10 +1983,7 @@ RemoveAttributeById(Oid relid, AttrNumber attnum) "........pg.dropped.%d........", attnum); namestrcpy(&(attStruct->attname), newattname); - simple_heap_update(attr_rel, &tuple->t_self, tuple); - - /* keep the system catalog indexes current */ - CatalogUpdateIndexes(attr_rel, tuple); + CatalogTupleUpdate(attr_rel, &tuple->t_self, tuple); } /* @@ -2094,7 +2095,7 @@ RemoveAttrDefaultById(Oid attrdefId) myrel = relation_open(myrelid, AccessExclusiveLock); /* Now we can delete the pg_attrdef row */ - simple_heap_delete(attrdef_rel, &tuple->t_self); + CatalogTupleDelete(attrdef_rel, &tuple->t_self); systable_endscan(scan); heap_close(attrdef_rel, RowExclusiveLock); @@ -2111,10 +2112,7 @@ RemoveAttrDefaultById(Oid attrdefId) ((Form_pg_attribute) GETSTRUCT(tuple))->atthasdef = false; - simple_heap_update(attr_rel, &tuple->t_self, tuple); - - /* keep the system catalog indexes current */ - CatalogUpdateIndexes(attr_rel, tuple); + CatalogTupleUpdate(attr_rel, &tuple->t_self, tuple); /* * Our update of the pg_attribute row will force a relcache rebuild, so @@ -2139,6 +2137,26 @@ void heap_drop_with_catalog(Oid relid) { Relation rel; + HeapTuple tuple; + Oid parentOid = InvalidOid; + + /* + * To drop a partition safely, we must grab exclusive lock on its parent, + * because another backend might be about to execute a query on the parent + * table. If it relies on previously cached partition descriptor, then it + * could attempt to access the just-dropped relation as its partition. 
We + * must therefore take a table lock strong enough to prevent all queries + * on the table from proceeding until we commit and send out a + * shared-cache-inval notice that will make them update their partition + * descriptors. + */ + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (((Form_pg_class) GETSTRUCT(tuple))->relispartition) + { + parentOid = get_partition_parent(relid); + LockRelationOid(parentOid, AccessExclusiveLock); + } + + ReleaseSysCache(tuple); /* * Open and lock the relation. @@ -2174,18 +2192,25 @@ heap_drop_with_catalog(Oid relid) if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for foreign table %u", relid); - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); ReleaseSysCache(tuple); heap_close(rel, RowExclusiveLock); } /* + * If a partitioned table, delete the pg_partitioned_table tuple. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + RemovePartitionKeyByRelId(relid); + + /* * Schedule unlinking of the relation's physical files at commit. */ if (rel->rd_rel->relkind != RELKIND_VIEW && rel->rd_rel->relkind != RELKIND_COMPOSITE_TYPE && - rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE) + rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) { RelationDropStorage(rel); } @@ -2198,6 +2223,11 @@ heap_drop_with_catalog(Oid relid) relation_close(rel, NoLock); /* + * Remove any associated relation synchronization states. + */ + RemoveSubscriptionRel(InvalidOid, relid); + + /* * Forget any ON COMMIT action for the rel */ remove_on_commit_action(relid); @@ -2230,6 +2260,16 @@ heap_drop_with_catalog(Oid relid) * delete relation tuple */ DeleteRelationTuple(relid); + + if (OidIsValid(parentOid)) + { + /* + * Invalidate the parent's relcache so that the partition is no longer + * included in its partition descriptor. + */ + CacheInvalidateRelcacheByRelid(parentOid); + /* keep the lock */ + } } @@ -2279,9 +2319,7 @@ StoreAttrDefault(Relation rel, AttrNumber attnum, adrel = heap_open(AttrDefaultRelationId, RowExclusiveLock); tuple = heap_form_tuple(adrel->rd_att, values, nulls); - attrdefOid = simple_heap_insert(adrel, tuple); - - CatalogUpdateIndexes(adrel, tuple); + attrdefOid = CatalogTupleInsert(adrel, tuple); defobject.classId = AttrDefaultRelationId; defobject.objectId = attrdefOid; @@ -2311,9 +2349,7 @@ StoreAttrDefault(Relation rel, AttrNumber attnum, if (!attStruct->atthasdef) { attStruct->atthasdef = true; - simple_heap_update(attrrel, &atttup->t_self, atttup); - /* keep catalog indexes current */ - CatalogUpdateIndexes(attrrel, atttup); + CatalogTupleUpdate(attrrel, &atttup->t_self, atttup); } heap_close(attrrel, RowExclusiveLock); heap_freetuple(atttup); @@ -2413,6 +2449,17 @@ StoreRelCheck(Relation rel, char *ccname, Node *expr, attNos = NULL; /* + * Partitioned tables do not contain any rows themselves, so a NO INHERIT + * constraint makes no sense.
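+ * (Illustration: a CHECK ... NO INHERIT constraint on a partitioned
+ * parent would be checked against no rows at all, since every row lives
+ * in some leaf partition.)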
+ */ + if (is_no_inherit && + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("cannot add NO INHERIT constraint to partitioned table \"%s\"", + RelationGetRelationName(rel)))); + + /* * Create the Check Constraint */ constrOid = @@ -2682,6 +2729,7 @@ AddRelationNewConstraints(Relation rel, */ if (MergeWithExistingConstraint(rel, ccname, expr, allow_merge, is_local, + cdef->initially_valid, cdef->is_no_inherit)) continue; } @@ -2770,6 +2818,7 @@ AddRelationNewConstraints(Relation rel, static bool MergeWithExistingConstraint(Relation rel, char *ccname, Node *expr, bool allow_merge, bool is_local, + bool is_initially_valid, bool is_no_inherit) { bool found; @@ -2817,37 +2866,85 @@ MergeWithExistingConstraint(Relation rel, char *ccname, Node *expr, if (equal(expr, stringToNode(TextDatumGetCString(val)))) found = true; } + + /* + * If the existing constraint is purely inherited (no local + * definition) then interpret addition of a local constraint as a + * legal merge. This allows ALTER ADD CONSTRAINT on parent and + * child tables to be given in either order with same end state. + * However if the relation is a partition, all inherited + * constraints are always non-local, including those that were + * merged. + */ + if (is_local && !con->conislocal && !rel->rd_rel->relispartition) + allow_merge = true; + if (!found || !allow_merge) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("constraint \"%s\" for relation \"%s\" already exists", ccname, RelationGetRelationName(rel)))); - tup = heap_copytuple(tup); - con = (Form_pg_constraint) GETSTRUCT(tup); - - /* If the constraint is "no inherit" then cannot merge */ + /* If the child constraint is "no inherit" then cannot merge */ if (con->connoinherit) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("constraint \"%s\" conflicts with non-inherited constraint on relation \"%s\"", ccname, RelationGetRelationName(rel)))); - if (is_local) - con->conislocal = true; + /* + * Must not change an existing inherited constraint to "no + * inherit" status. That's because inherited constraints should + * be able to propagate to lower-level children. + */ + if (con->coninhcount > 0 && is_no_inherit) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("constraint \"%s\" conflicts with inherited constraint on relation \"%s\"", + ccname, RelationGetRelationName(rel)))); + + /* + * If the child constraint is "not valid" then cannot merge with a + * valid parent constraint + */ + if (is_initially_valid && !con->convalidated) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("constraint \"%s\" conflicts with NOT VALID constraint on relation \"%s\"", + ccname, RelationGetRelationName(rel)))); + + /* OK to update the tuple */ + ereport(NOTICE, + (errmsg("merging constraint \"%s\" with inherited definition", + ccname))); + + tup = heap_copytuple(tup); + con = (Form_pg_constraint) GETSTRUCT(tup); + + /* + * In case of partitions, an inherited constraint must be + * inherited only once since it cannot have multiple parents and + * it is never considered local. 
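+ * (Hence the partition branch below pins coninhcount at 1 and leaves
+ * conislocal false, instead of incrementing the count or setting the
+ * local flag as for ordinary inheritance children.)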
+ */ + if (rel->rd_rel->relispartition) + { + con->coninhcount = 1; + con->conislocal = false; + } else - con->coninhcount++; + { + if (is_local) + con->conislocal = true; + else + con->coninhcount++; + } + if (is_no_inherit) { Assert(is_local); con->connoinherit = true; } - /* OK to update the tuple */ - ereport(NOTICE, - (errmsg("merging constraint \"%s\" with inherited definition", - ccname))); - simple_heap_update(conDesc, &tup->t_self, tup); - CatalogUpdateIndexes(conDesc, tup); + CatalogTupleUpdate(conDesc, &tup->t_self, tup); break; } } @@ -2887,10 +2984,7 @@ SetRelationNumChecks(Relation rel, int numchecks) { relStruct->relchecks = numchecks; - simple_heap_update(relrel, &reltup->t_self, reltup); - - /* keep catalog indexes current */ - CatalogUpdateIndexes(relrel, reltup); + CatalogTupleUpdate(relrel, &reltup->t_self, reltup); } else { @@ -2941,14 +3035,9 @@ cookDefault(ParseState *pstate, /* * transformExpr() should have already rejected subqueries, aggregates, - * and window functions, based on the EXPR_KIND_ for a default expression. - * - * It can't return a set either. + * window functions, and SRFs, based on the EXPR_KIND_ for a default + * expression. */ - if (expression_returns_set(expr)) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("default expression must not return a set"))); /* * Coerce the expression to the correct type and typmod, if given. This @@ -3064,7 +3153,7 @@ RemoveStatistics(Oid relid, AttrNumber attnum) /* we must loop even when attnum != 0, in case of inherited stats */ while (HeapTupleIsValid(tuple = systable_getnext(scan))) - simple_heap_delete(pgstatistic, &tuple->t_self); + CatalogTupleDelete(pgstatistic, &tuple->t_self); systable_endscan(scan); @@ -3369,3 +3458,192 @@ insert_ordered_unique_oid(List *list, Oid datum) lappend_cell_oid(list, prev, datum); return list; } + +/* + * StorePartitionKey + * Store information about the partition key rel into the catalog + */ +void +StorePartitionKey(Relation rel, + char strategy, + int16 partnatts, + AttrNumber *partattrs, + List *partexprs, + Oid *partopclass, + Oid *partcollation) +{ + int i; + int2vector *partattrs_vec; + oidvector *partopclass_vec; + oidvector *partcollation_vec; + Datum partexprDatum; + Relation pg_partitioned_table; + HeapTuple tuple; + Datum values[Natts_pg_partitioned_table]; + bool nulls[Natts_pg_partitioned_table]; + ObjectAddress myself; + ObjectAddress referenced; + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + tuple = SearchSysCache1(PARTRELID, + ObjectIdGetDatum(RelationGetRelid(rel))); + + /* Copy the partition attribute numbers, opclass OIDs into arrays */ + partattrs_vec = buildint2vector(partattrs, partnatts); + partopclass_vec = buildoidvector(partopclass, partnatts); + partcollation_vec = buildoidvector(partcollation, partnatts); + + /* Convert the expressions (if any) to a text datum */ + if (partexprs) + { + char *exprString; + + exprString = nodeToString(partexprs); + partexprDatum = CStringGetTextDatum(exprString); + pfree(exprString); + } + else + partexprDatum = (Datum) 0; + + pg_partitioned_table = heap_open(PartitionedRelationId, RowExclusiveLock); + + MemSet(nulls, false, sizeof(nulls)); + + /* Only this can ever be NULL */ + if (!partexprDatum) + nulls[Anum_pg_partitioned_table_partexprs - 1] = true; + + values[Anum_pg_partitioned_table_partrelid - 1] = ObjectIdGetDatum(RelationGetRelid(rel)); + values[Anum_pg_partitioned_table_partstrat - 1] = CharGetDatum(strategy); + values[Anum_pg_partitioned_table_partnatts - 1] = 
Int16GetDatum(partnatts); + values[Anum_pg_partitioned_table_partattrs - 1] = PointerGetDatum(partattrs_vec); + values[Anum_pg_partitioned_table_partclass - 1] = PointerGetDatum(partopclass_vec); + values[Anum_pg_partitioned_table_partcollation - 1] = PointerGetDatum(partcollation_vec); + values[Anum_pg_partitioned_table_partexprs - 1] = partexprDatum; + + tuple = heap_form_tuple(RelationGetDescr(pg_partitioned_table), values, nulls); + + CatalogTupleInsert(pg_partitioned_table, tuple); + heap_close(pg_partitioned_table, RowExclusiveLock); + + /* Mark this relation as dependent on a few things as follows */ + myself.classId = RelationRelationId; + myself.objectId = RelationGetRelid(rel); + myself.objectSubId = 0; + + /* Operator class and collation per key column */ + for (i = 0; i < partnatts; i++) + { + referenced.classId = OperatorClassRelationId; + referenced.objectId = partopclass[i]; + referenced.objectSubId = 0; + + recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL); + + referenced.classId = CollationRelationId; + referenced.objectId = partcollation[i]; + referenced.objectSubId = 0; + + recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL); + } + + /* + * Anything mentioned in the expressions. We must ignore the column + * references, which will depend on the table itself; there is no separate + * partition key object. + */ + if (partexprs) + recordDependencyOnSingleRelExpr(&myself, + (Node *) partexprs, + RelationGetRelid(rel), + DEPENDENCY_NORMAL, + DEPENDENCY_AUTO, true); + + /* + * We must invalidate the relcache so that the next + * CommandCounterIncrement() will cause the same to be rebuilt using the + * information in the just-created catalog entry. + */ + CacheInvalidateRelcache(rel); +} + +/* + * RemovePartitionKeyByRelId + * Remove pg_partitioned_table entry for a relation + */ +void +RemovePartitionKeyByRelId(Oid relid) +{ + Relation rel; + HeapTuple tuple; + + rel = heap_open(PartitionedRelationId, RowExclusiveLock); + + tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for partition key of relation %u", + relid); + + CatalogTupleDelete(rel, &tuple->t_self); + + ReleaseSysCache(tuple); + heap_close(rel, RowExclusiveLock); +} + +/* + * StorePartitionBound + * Update pg_class tuple of rel to store the partition bound and set + * relispartition to true + * + * Also, invalidate the parent's relcache, so that the next rebuild will load + * the new partition's info into its partition descriptor.
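+ *
+ * (For context: this is called, for example, when a partition is created
+ * with CREATE TABLE ... PARTITION OF ... FOR VALUES or attached with
+ * ALTER TABLE ... ATTACH PARTITION; "bound" is the parsed FOR VALUES
+ * specification.)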
+ */ +void +StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) +{ + Relation classRel; + HeapTuple tuple, + newtuple; + Datum new_val[Natts_pg_class]; + bool new_null[Natts_pg_class], + new_repl[Natts_pg_class]; + + /* Update pg_class tuple */ + classRel = heap_open(RelationRelationId, RowExclusiveLock); + tuple = SearchSysCacheCopy1(RELOID, + ObjectIdGetDatum(RelationGetRelid(rel))); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", + RelationGetRelid(rel)); + +#ifdef USE_ASSERT_CHECKING + { + Form_pg_class classForm; + bool isnull; + + classForm = (Form_pg_class) GETSTRUCT(tuple); + Assert(!classForm->relispartition); + (void) SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relpartbound, + &isnull); + Assert(isnull); + } +#endif + + /* Fill in relpartbound value */ + memset(new_val, 0, sizeof(new_val)); + memset(new_null, false, sizeof(new_null)); + memset(new_repl, false, sizeof(new_repl)); + new_val[Anum_pg_class_relpartbound - 1] = CStringGetTextDatum(nodeToString(bound)); + new_null[Anum_pg_class_relpartbound - 1] = false; + new_repl[Anum_pg_class_relpartbound - 1] = true; + newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel), + new_val, new_null, new_repl); + /* Also set the flag */ + ((Form_pg_class) GETSTRUCT(newtuple))->relispartition = true; + CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + heap_close(classRel, RowExclusiveLock); + + CacheInvalidateRelcache(parent); +} diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 5d895de9af..9104855ce2 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3,7 +3,7 @@ * index.c * code to create and destroy POSTGRES index relations * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -354,6 +354,7 @@ ConstructTupleDescriptor(Relation heapRelation, to->attcacheoff = -1; to->attnotnull = false; to->atthasdef = false; + to->attidentity = '\0'; to->attislocal = true; to->attinhcount = 0; to->attcollation = collationObjectId[i]; @@ -438,11 +439,28 @@ ConstructTupleDescriptor(Relation heapRelation, keyType = opclassTup->opckeytype; else keyType = amroutine->amkeytype; + + /* + * If keytype is specified as ANYELEMENT, and opcintype is ANYARRAY, + * then the attribute type must be an array (else it'd not have + * matched this opclass); use its element type. + */ + if (keyType == ANYELEMENTOID && opclassTup->opcintype == ANYARRAYOID) + { + keyType = get_base_element_type(to->atttypid); + if (!OidIsValid(keyType)) + elog(ERROR, "could not get element type of array type %u", + to->atttypid); + } + ReleaseSysCache(tuple); + /* + * If a key type different from the heap value is specified, update + * the type-related fields in the index tupdesc. 
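+ * (For instance, under the ANYARRAY/ANYELEMENT rule above, an opclass of
+ * that shape over an int4[] column yields keyType int4, and the key
+ * column's length, alignment, and so on are then copied from the pg_type
+ * row fetched below.)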
+ */ if (OidIsValid(keyType) && keyType != to->atttypid) { - /* index value and heap value have different types */ tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(keyType)); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for type %u", keyType); @@ -633,10 +651,7 @@ UpdateIndexRelation(Oid indexoid, /* * insert the tuple into the pg_index catalog */ - simple_heap_insert(pg_index, tuple); - - /* update the indexes on pg_index */ - CatalogUpdateIndexes(pg_index, tuple); + CatalogTupleInsert(pg_index, tuple); /* * close the relation and free the tuple @@ -1027,7 +1042,7 @@ index_create(Relation heapRelation, (Node *) indexInfo->ii_Expressions, heapRelationId, DEPENDENCY_NORMAL, - DEPENDENCY_AUTO); + DEPENDENCY_AUTO, false); } /* Store dependencies on anything mentioned in predicate */ @@ -1037,7 +1052,7 @@ index_create(Relation heapRelation, (Node *) indexInfo->ii_Predicate, heapRelationId, DEPENDENCY_NORMAL, - DEPENDENCY_AUTO); + DEPENDENCY_AUTO, false); } } else @@ -1308,8 +1323,7 @@ index_constraint_create(Relation heapRelation, if (dirty) { - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); InvokeObjectPostAlterHookArg(IndexRelationId, indexRelationId, 0, InvalidOid, is_internal); @@ -1561,7 +1575,7 @@ index_drop(Oid indexId, bool concurrent) hasexprs = !heap_attisnull(tuple, Anum_pg_index_indexprs); - simple_heap_delete(indexRelation, &tuple->t_self); + CatalogTupleDelete(indexRelation, &tuple->t_self); ReleaseSysCache(tuple); heap_close(indexRelation, RowExclusiveLock); @@ -1646,7 +1660,7 @@ BuildIndexInfo(Relation index) /* fetch index predicate if any */ ii->ii_Predicate = RelationGetIndexPredicate(index); - ii->ii_PredicateState = NIL; + ii->ii_PredicateState = NULL; /* fetch exclusion constraint info if any */ if (indexStruct->indisexclusion) @@ -1675,6 +1689,10 @@ BuildIndexInfo(Relation index) ii->ii_Concurrent = false; ii->ii_BrokenHotChain = false; + /* set up for possible use by index AM */ + ii->ii_AmCache = NULL; + ii->ii_Context = CurrentMemoryContext; + return ii; } @@ -1758,9 +1776,8 @@ FormIndexDatum(IndexInfo *indexInfo, indexInfo->ii_ExpressionsState == NIL) { /* First time through, set up expression evaluation state */ - indexInfo->ii_ExpressionsState = (List *) - ExecPrepareExpr((Expr *) indexInfo->ii_Expressions, - estate); + indexInfo->ii_ExpressionsState = + ExecPrepareExprList(indexInfo->ii_Expressions, estate); /* Check caller has set up context correctly */ Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot); } @@ -1789,8 +1806,7 @@ FormIndexDatum(IndexInfo *indexInfo, elog(ERROR, "wrong number of index expressions"); iDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(indexpr_item), GetPerTupleExprContext(estate), - &isNull, - NULL); + &isNull); indexpr_item = lnext(indexpr_item); } values[i] = iDatum; @@ -1843,8 +1859,8 @@ index_update_stats(Relation rel, * 1. In bootstrap mode, we have no choice --- UPDATE wouldn't work. * * 2. We could be reindexing pg_class itself, in which case we can't move - * its pg_class row because CatalogUpdateIndexes might not know about all - * the indexes yet (see reindex_relation). + * its pg_class row because CatalogTupleInsert/CatalogTupleUpdate might + * not know about all the indexes yet (see reindex_relation). * * 3. 
Because we execute CREATE INDEX with just share lock on the parent * rel (to allow concurrent index creations), an ordinary update could @@ -2102,8 +2118,7 @@ index_build(Relation heapRelation, Assert(!indexForm->indcheckxmin); indexForm->indcheckxmin = true; - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); heap_freetuple(indexTuple); heap_close(pg_index, RowExclusiveLock); @@ -2208,7 +2223,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; double reltuples; - List *predicate; + ExprState *predicate; TupleTableSlot *slot; EState *estate; ExprContext *econtext; @@ -2247,9 +2262,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, econtext->ecxt_scantuple = slot; /* Set up execution state for predicate, if any. */ - predicate = (List *) - ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, - estate); + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); /* * Prepare for scan of the base relation. In a normal index build, we use @@ -2270,7 +2283,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { snapshot = SnapshotAny; /* okay to ignore lazy VACUUMs here */ - OldestXmin = GetOldestXmin(heapRelation, true); + OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); } scan = heap_beginscan_strat(heapRelation, /* relation */ @@ -2552,9 +2565,9 @@ IndexBuildHeapRangeScan(Relation heapRelation, * In a partial index, discard tuples that don't satisfy the * predicate. */ - if (predicate != NIL) + if (predicate != NULL) { - if (!ExecQual(predicate, econtext, false)) + if (!ExecQual(predicate, econtext)) continue; } @@ -2619,7 +2632,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, /* These may have been pointing to the now-gone estate */ indexInfo->ii_ExpressionsState = NIL; - indexInfo->ii_PredicateState = NIL; + indexInfo->ii_PredicateState = NULL; return reltuples; } @@ -2646,7 +2659,7 @@ IndexCheckExclusion(Relation heapRelation, HeapTuple heapTuple; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - List *predicate; + ExprState *predicate; TupleTableSlot *slot; EState *estate; ExprContext *econtext; @@ -2672,9 +2685,7 @@ IndexCheckExclusion(Relation heapRelation, econtext->ecxt_scantuple = slot; /* Set up execution state for predicate, if any. */ - predicate = (List *) - ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, - estate); + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); /* * Scan all live tuples in the base relation. @@ -2699,9 +2710,9 @@ IndexCheckExclusion(Relation heapRelation, /* * In a partial index, ignore tuples that don't satisfy the predicate. */ - if (predicate != NIL) + if (predicate != NULL) { - if (!ExecQual(predicate, econtext, false)) + if (!ExecQual(predicate, econtext)) continue; } @@ -2732,7 +2743,7 @@ IndexCheckExclusion(Relation heapRelation, /* These may have been pointing to the now-gone estate */ indexInfo->ii_ExpressionsState = NIL; - indexInfo->ii_PredicateState = NIL; + indexInfo->ii_PredicateState = NULL; } @@ -2962,7 +2973,7 @@ validate_index_heapscan(Relation heapRelation, HeapTuple heapTuple; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - List *predicate; + ExprState *predicate; TupleTableSlot *slot; EState *estate; ExprContext *econtext; @@ -2992,9 +3003,7 @@ validate_index_heapscan(Relation heapRelation, econtext->ecxt_scantuple = slot; /* Set up execution state for predicate, if any. 
*/ - predicate = (List *) - ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, - estate); + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); /* * Prepare for scan of the base relation. We need just those tuples @@ -3121,9 +3130,9 @@ validate_index_heapscan(Relation heapRelation, * In a partial index, discard tuples that don't satisfy the * predicate. */ - if (predicate != NIL) + if (predicate != NULL) { - if (!ExecQual(predicate, econtext, false)) + if (!ExecQual(predicate, econtext)) continue; } @@ -3162,7 +3171,8 @@ validate_index_heapscan(Relation heapRelation, &rootTuple, heapRelation, indexInfo->ii_Unique ? - UNIQUE_CHECK_YES : UNIQUE_CHECK_NO); + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + indexInfo); state->tups_inserted += 1; } @@ -3176,7 +3186,7 @@ validate_index_heapscan(Relation heapRelation, /* These may have been pointing to the now-gone estate */ indexInfo->ii_ExpressionsState = NIL; - indexInfo->ii_PredicateState = NIL; + indexInfo->ii_PredicateState = NULL; } @@ -3447,8 +3457,7 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence, indexForm->indisvalid = true; indexForm->indisready = true; indexForm->indislive = true; - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); /* * Invalidate the relcache for the table, so that after we commit @@ -3542,9 +3551,9 @@ reindex_relation(Oid relid, int flags, int options) * that the updates do not try to insert index entries into indexes we * have not processed yet. (When we are trying to recover from corrupted * indexes, that could easily cause a crash.) We can accomplish this - * because CatalogUpdateIndexes will use the relcache's index list to know - * which indexes to update. We just force the index list to be only the - * stuff we've processed. + * because CatalogTupleInsert/CatalogTupleUpdate will use the relcache's + * index list to know which indexes to update. We just force the index + * list to be only the stuff we've processed. * * It is okay to not insert entries into the indexes we have not processed * yet because all of this is transaction-safe. If we fail partway diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index b9fe10237b..abc344ad69 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -4,7 +4,7 @@ * This file contains routines to support indexes defined on system * catalogs. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -68,7 +68,7 @@ CatalogCloseIndexes(CatalogIndexState indstate) * * This is effectively a cut-down version of ExecInsertIndexTuples. */ -void +static void CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) { int i; @@ -139,26 +139,120 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) &(heapTuple->t_self), /* tid of heap tuple */ heapRelation, relationDescs[i]->rd_index->indisunique ? - UNIQUE_CHECK_YES : UNIQUE_CHECK_NO); + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + indexInfo); } ExecDropSingleTupleTableSlot(slot); } /* - * CatalogUpdateIndexes - do all the indexing work for a new catalog tuple + * CatalogTupleInsert - do heap and indexing work for a new catalog tuple + * + * Insert the tuple data in "tup" into the specified catalog relation. 
+ * The Oid of the inserted tuple is returned. + * + * This is a convenience routine for the common case of inserting a single + * tuple in a system catalog; it inserts a new heap tuple, keeping indexes + * current. Avoid using it for multiple tuples, since opening the indexes + * and building the index info structures is moderately expensive. + * (Use CatalogTupleInsertWithInfo in such cases.) + */ +Oid +CatalogTupleInsert(Relation heapRel, HeapTuple tup) +{ + CatalogIndexState indstate; + Oid oid; + + indstate = CatalogOpenIndexes(heapRel); + + oid = simple_heap_insert(heapRel, tup); + + CatalogIndexInsert(indstate, tup); + CatalogCloseIndexes(indstate); + + return oid; +} + +/* + * CatalogTupleInsertWithInfo - as above, but with caller-supplied index info + * + * This should be used when it's important to amortize CatalogOpenIndexes/ + * CatalogCloseIndexes work across multiple insertions. At some point we + * might cache the CatalogIndexState data somewhere (perhaps in the relcache) + * so that callers needn't trouble over this ... but we don't do so today. + */ +Oid +CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup, + CatalogIndexState indstate) +{ + Oid oid; + + oid = simple_heap_insert(heapRel, tup); + + CatalogIndexInsert(indstate, tup); + + return oid; +} + +/* + * CatalogTupleUpdate - do heap and indexing work for updating a catalog tuple + * + * Update the tuple identified by "otid", replacing it with the data in "tup". * - * This is a convenience routine for the common case where we only need - * to insert or update a single tuple in a system catalog. Avoid using it for - * multiple tuples, since opening the indexes and building the index info - * structures is moderately expensive. + * This is a convenience routine for the common case of updating a single + * tuple in a system catalog; it updates one heap tuple, keeping indexes + * current. Avoid using it for multiple tuples, since opening the indexes + * and building the index info structures is moderately expensive. + * (Use CatalogTupleUpdateWithInfo in such cases.) */ void -CatalogUpdateIndexes(Relation heapRel, HeapTuple heapTuple) +CatalogTupleUpdate(Relation heapRel, ItemPointer otid, HeapTuple tup) { CatalogIndexState indstate; indstate = CatalogOpenIndexes(heapRel); - CatalogIndexInsert(indstate, heapTuple); + + simple_heap_update(heapRel, otid, tup); + + CatalogIndexInsert(indstate, tup); CatalogCloseIndexes(indstate); } + +/* + * CatalogTupleUpdateWithInfo - as above, but with caller-supplied index info + * + * This should be used when it's important to amortize CatalogOpenIndexes/ + * CatalogCloseIndexes work across multiple updates. At some point we + * might cache the CatalogIndexState data somewhere (perhaps in the relcache) + * so that callers needn't trouble over this ... but we don't do so today. + */ +void +CatalogTupleUpdateWithInfo(Relation heapRel, ItemPointer otid, HeapTuple tup, + CatalogIndexState indstate) +{ + simple_heap_update(heapRel, otid, tup); + + CatalogIndexInsert(indstate, tup); +} + +/* + * CatalogTupleDelete - do heap and indexing work for deleting a catalog tuple + * + * Delete the tuple identified by "tid" in the specified catalog. + * + * With Postgres heaps, there is no index work to do at deletion time; + * cleanup will be done later by VACUUM. However, callers of this function + * shouldn't have to know that; we'd like a uniform abstraction for all + * catalog tuple changes. Hence, provide this currently-trivial wrapper. 
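+ *
+ * An illustrative call pattern (a sketch; "fooRel" and "FOOOID" are
+ * placeholder names, not objects from this patch):
+ *		tup = SearchSysCache1(FOOOID, ObjectIdGetDatum(objid));
+ *		CatalogTupleDelete(fooRel, &tup->t_self);
+ *		ReleaseSysCache(tup);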
+ * + * The abstraction is a bit leaky in that we don't provide an optimized + * CatalogTupleDeleteWithInfo version, because there is currently nothing to + * optimize. If we ever need that, rather than touching a lot of call sites, + * it might be better to do something about caching CatalogIndexState. + */ +void +CatalogTupleDelete(Relation heapRel, ItemPointer tid) +{ + simple_heap_delete(heapRel, tid); +} diff --git a/src/backend/catalog/information_schema.sql b/src/backend/catalog/information_schema.sql index 18be08fead..cbcd6cfbc1 100644 --- a/src/backend/catalog/information_schema.sql +++ b/src/backend/catalog/information_schema.sql @@ -2,7 +2,7 @@ * SQL Information Schema * as defined in ISO/IEC 9075-11:2011 * - * Copyright (c) 2003-2016, PostgreSQL Global Development Group + * Copyright (c) 2003-2017, PostgreSQL Global Development Group * * src/backend/catalog/information_schema.sql * @@ -42,14 +42,14 @@ SET search_path TO information_schema; /* Expand any 1-D array into a set with integers 1..N */ CREATE FUNCTION _pg_expandarray(IN anyarray, OUT x anyelement, OUT n int) RETURNS SETOF RECORD - LANGUAGE sql STRICT IMMUTABLE + LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE AS 'select $1[s], s - pg_catalog.array_lower($1,1) + 1 from pg_catalog.generate_series(pg_catalog.array_lower($1,1), pg_catalog.array_upper($1,1), 1) as g(s)'; CREATE FUNCTION _pg_keysequal(smallint[], smallint[]) RETURNS boolean - LANGUAGE sql IMMUTABLE -- intentionally not STRICT, to allow inlining + LANGUAGE sql IMMUTABLE PARALLEL SAFE -- intentionally not STRICT, to allow inlining AS 'select $1 operator(pg_catalog.<@) $2 and $2 operator(pg_catalog.<@) $1'; /* Given an index's OID and an underlying-table column number, return the @@ -66,6 +66,7 @@ $$; CREATE FUNCTION _pg_truetypid(pg_attribute, pg_type) RETURNS oid LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT CASE WHEN $2.typtype = 'd' THEN $2.typbasetype ELSE $1.atttypid END$$; @@ -73,6 +74,7 @@ $$SELECT CASE WHEN $2.typtype = 'd' THEN $2.typbasetype ELSE $1.atttypid END$$; CREATE FUNCTION _pg_truetypmod(pg_attribute, pg_type) RETURNS int4 LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT CASE WHEN $2.typtype = 'd' THEN $2.typtypmod ELSE $1.atttypmod END$$; @@ -82,6 +84,7 @@ $$SELECT CASE WHEN $2.typtype = 'd' THEN $2.typtypmod ELSE $1.atttypmod END$$; CREATE FUNCTION _pg_char_max_length(typid oid, typmod int4) RETURNS integer LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -97,6 +100,7 @@ $$SELECT CREATE FUNCTION _pg_char_octet_length(typid oid, typmod int4) RETURNS integer LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -112,6 +116,7 @@ $$SELECT CREATE FUNCTION _pg_numeric_precision(typid oid, typmod int4) RETURNS integer LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -132,6 +137,7 @@ $$SELECT CREATE FUNCTION _pg_numeric_precision_radix(typid oid, typmod int4) RETURNS integer LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -143,6 +149,7 @@ $$SELECT CREATE FUNCTION _pg_numeric_scale(typid oid, typmod int4) RETURNS integer LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -158,6 +165,7 @@ $$SELECT CREATE FUNCTION _pg_datetime_precision(typid oid, typmod int4) RETURNS integer LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -173,6 +181,7 @@ $$SELECT CREATE FUNCTION _pg_interval_type(typid oid, mod int4) RETURNS 
text LANGUAGE sql IMMUTABLE + PARALLEL SAFE RETURNS NULL ON NULL INPUT AS $$SELECT @@ -365,7 +374,7 @@ CREATE VIEW attributes AS ON a.attcollation = co.oid AND (nco.nspname, co.collname) <> ('pg_catalog', 'default') WHERE a.attnum > 0 AND NOT a.attisdropped - AND c.relkind in ('c') + AND c.relkind IN ('c') AND (pg_has_role(c.relowner, 'USAGE') OR has_type_privilege(c.reltype, 'USAGE')); @@ -453,7 +462,7 @@ CREATE VIEW check_constraints AS AND a.attnum > 0 AND NOT a.attisdropped AND a.attnotnull - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') AND pg_has_role(r.relowner, 'USAGE'); GRANT SELECT ON check_constraints TO PUBLIC; @@ -525,7 +534,7 @@ CREATE VIEW column_domain_usage AS AND a.attrelid = c.oid AND a.atttypid = t.oid AND t.typtype = 'd' - AND c.relkind IN ('r', 'v', 'f') + AND c.relkind IN ('r', 'v', 'f', 'p') AND a.attnum > 0 AND NOT a.attisdropped AND pg_has_role(t.typowner, 'USAGE'); @@ -564,7 +573,7 @@ CREATE VIEW column_privileges AS pr_c.relowner FROM (SELECT oid, relname, relnamespace, relowner, (aclexplode(coalesce(relacl, acldefault('r', relowner)))).* FROM pg_class - WHERE relkind IN ('r', 'v', 'f') + WHERE relkind IN ('r', 'v', 'f', 'p') ) pr_c (oid, relname, relnamespace, relowner, grantor, grantee, prtype, grantable), pg_attribute a WHERE a.attrelid = pr_c.oid @@ -586,7 +595,7 @@ CREATE VIEW column_privileges AS ) pr_a (attrelid, attname, grantor, grantee, prtype, grantable), pg_class c WHERE pr_a.attrelid = c.oid - AND relkind IN ('r', 'v', 'f') + AND relkind IN ('r', 'v', 'f', 'p') ) x, pg_namespace nc, pg_authid u_grantor, @@ -629,7 +638,8 @@ CREATE VIEW column_udt_usage AS WHERE a.attrelid = c.oid AND a.atttypid = t.oid AND nc.oid = c.relnamespace - AND a.attnum > 0 AND NOT a.attisdropped AND c.relkind in ('r', 'v', 'f') + AND a.attnum > 0 AND NOT a.attisdropped + AND c.relkind in ('r', 'v', 'f', 'p') AND pg_has_role(coalesce(bt.typowner, t.typowner), 'USAGE'); GRANT SELECT ON column_udt_usage TO PUBLIC; @@ -727,18 +737,18 @@ CREATE VIEW columns AS CAST(a.attnum AS sql_identifier) AS dtd_identifier, CAST('NO' AS yes_or_no) AS is_self_referencing, - CAST('NO' AS yes_or_no) AS is_identity, - CAST(null AS character_data) AS identity_generation, - CAST(null AS character_data) AS identity_start, - CAST(null AS character_data) AS identity_increment, - CAST(null AS character_data) AS identity_maximum, - CAST(null AS character_data) AS identity_minimum, - CAST(null AS yes_or_no) AS identity_cycle, + CAST(CASE WHEN a.attidentity IN ('a', 'd') THEN 'YES' ELSE 'NO' END AS yes_or_no) AS is_identity, + CAST(CASE a.attidentity WHEN 'a' THEN 'ALWAYS' WHEN 'd' THEN 'BY DEFAULT' END AS character_data) AS identity_generation, + CAST(seq.seqstart AS character_data) AS identity_start, + CAST(seq.seqincrement AS character_data) AS identity_increment, + CAST(seq.seqmax AS character_data) AS identity_maximum, + CAST(seq.seqmin AS character_data) AS identity_minimum, + CAST(CASE WHEN seq.seqcycle THEN 'YES' ELSE 'NO' END AS yes_or_no) AS identity_cycle, CAST('NEVER' AS character_data) AS is_generated, CAST(null AS character_data) AS generation_expression, - CAST(CASE WHEN c.relkind = 'r' OR + CAST(CASE WHEN c.relkind IN ('r', 'p') OR (c.relkind IN ('v', 'f') AND pg_column_is_updatable(c.oid, a.attnum, false)) THEN 'YES' ELSE 'NO' END AS yes_or_no) AS is_updatable @@ -750,10 +760,13 @@ CREATE VIEW columns AS ON (t.typtype = 'd' AND t.typbasetype = bt.oid) LEFT JOIN (pg_collation co JOIN pg_namespace nco ON (co.collnamespace = nco.oid)) ON a.attcollation = co.oid AND (nco.nspname, 
co.collname) <> ('pg_catalog', 'default') + LEFT JOIN (pg_depend dep JOIN pg_sequence seq ON (dep.classid = 'pg_class'::regclass AND dep.objid = seq.seqrelid AND dep.deptype = 'i')) + ON (dep.refclassid = 'pg_class'::regclass AND dep.refobjid = c.oid AND dep.refobjsubid = a.attnum) WHERE (NOT pg_is_other_temp_schema(nc.oid)) - AND a.attnum > 0 AND NOT a.attisdropped AND c.relkind in ('r', 'v', 'f') + AND a.attnum > 0 AND NOT a.attisdropped + AND c.relkind IN ('r', 'v', 'f', 'p') AND (pg_has_role(c.relowner, 'USAGE') OR has_column_privilege(c.oid, a.attnum, @@ -789,7 +802,7 @@ CREATE VIEW constraint_column_usage AS AND d.objid = c.oid AND c.connamespace = nc.oid AND c.contype = 'c' - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') AND NOT a.attisdropped UNION ALL @@ -801,11 +814,11 @@ CREATE VIEW constraint_column_usage AS WHERE nr.oid = r.relnamespace AND r.oid = a.attrelid AND nc.oid = c.connamespace - AND (CASE WHEN c.contype = 'f' THEN r.oid = c.confrelid AND a.attnum = ANY (c.confkey) - ELSE r.oid = c.conrelid AND a.attnum = ANY (c.conkey) END) + AND r.oid = CASE c.contype WHEN 'f' THEN c.confrelid ELSE c.conrelid END + AND a.attnum = ANY (CASE c.contype WHEN 'f' THEN c.confkey ELSE c.conkey END) AND NOT a.attisdropped AND c.contype IN ('p', 'u', 'f') - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') ) AS x (tblschema, tblname, tblowner, colname, cstrschema, cstrname) @@ -841,7 +854,7 @@ CREATE VIEW constraint_table_usage AS WHERE c.connamespace = nc.oid AND r.relnamespace = nr.oid AND ( (c.contype = 'f' AND c.confrelid = r.oid) OR (c.contype IN ('p', 'u') AND c.conrelid = r.oid) ) - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') AND pg_has_role(r.relowner, 'USAGE'); GRANT SELECT ON constraint_table_usage TO PUBLIC; @@ -1058,7 +1071,7 @@ CREATE VIEW key_column_usage AS AND r.oid = c.conrelid AND nc.oid = c.connamespace AND c.contype IN ('p', 'u', 'f') - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') AND (NOT pg_is_other_temp_schema(nr.oid)) ) AS ss WHERE ss.roid = a.attrelid AND a.attnum = (ss.x).x @@ -1531,19 +1544,21 @@ CREATE VIEW sequences AS SELECT CAST(current_database() AS sql_identifier) AS sequence_catalog, CAST(nc.nspname AS sql_identifier) AS sequence_schema, CAST(c.relname AS sql_identifier) AS sequence_name, - CAST('bigint' AS character_data) AS data_type, - CAST(64 AS cardinal_number) AS numeric_precision, + CAST(format_type(s.seqtypid, null) AS character_data) AS data_type, + CAST(_pg_numeric_precision(s.seqtypid, -1) AS cardinal_number) AS numeric_precision, CAST(2 AS cardinal_number) AS numeric_precision_radix, CAST(0 AS cardinal_number) AS numeric_scale, - CAST(p.start_value AS character_data) AS start_value, - CAST(p.minimum_value AS character_data) AS minimum_value, - CAST(p.maximum_value AS character_data) AS maximum_value, - CAST(p.increment AS character_data) AS increment, - CAST(CASE WHEN p.cycle_option THEN 'YES' ELSE 'NO' END AS yes_or_no) AS cycle_option - FROM pg_namespace nc, pg_class c, LATERAL pg_sequence_parameters(c.oid) p + CAST(s.seqstart AS character_data) AS start_value, + CAST(s.seqmin AS character_data) AS minimum_value, + CAST(s.seqmax AS character_data) AS maximum_value, + CAST(s.seqincrement AS character_data) AS increment, + CAST(CASE WHEN s.seqcycle THEN 'YES' ELSE 'NO' END AS yes_or_no) AS cycle_option + FROM pg_namespace nc, pg_class c, pg_sequence s WHERE c.relnamespace = nc.oid AND c.relkind = 'S' + AND NOT EXISTS (SELECT 1 FROM pg_depend WHERE classid = 'pg_class'::regclass AND objid = c.oid AND deptype = 'i') AND (NOT 
pg_is_other_temp_schema(nc.oid)) + AND c.oid = s.seqrelid AND (pg_has_role(c.relowner, 'USAGE') OR has_sequence_privilege(c.oid, 'SELECT, UPDATE, USAGE') ); @@ -1773,7 +1788,7 @@ CREATE VIEW table_constraints AS WHERE nc.oid = c.connamespace AND nr.oid = r.relnamespace AND c.conrelid = r.oid AND c.contype NOT IN ('t', 'x') -- ignore nonstandard constraints - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') AND (NOT pg_is_other_temp_schema(nr.oid)) AND (pg_has_role(r.relowner, 'USAGE') -- SELECT privilege omitted, per SQL standard @@ -1803,7 +1818,7 @@ CREATE VIEW table_constraints AS AND a.attnotnull AND a.attnum > 0 AND NOT a.attisdropped - AND r.relkind = 'r' + AND r.relkind IN ('r', 'p') AND (NOT pg_is_other_temp_schema(nr.oid)) AND (pg_has_role(r.relowner, 'USAGE') -- SELECT privilege omitted, per SQL standard @@ -1853,7 +1868,7 @@ CREATE VIEW table_privileges AS ) AS grantee (oid, rolname) WHERE c.relnamespace = nc.oid - AND c.relkind IN ('r', 'v') + AND c.relkind IN ('r', 'v', 'p') AND c.grantee = grantee.oid AND c.grantor = u_grantor.oid AND c.prtype IN ('INSERT', 'SELECT', 'UPDATE', 'DELETE', 'TRUNCATE', 'REFERENCES', 'TRIGGER') @@ -1897,7 +1912,7 @@ CREATE VIEW tables AS CAST( CASE WHEN nc.oid = pg_my_temp_schema() THEN 'LOCAL TEMPORARY' - WHEN c.relkind = 'r' THEN 'BASE TABLE' + WHEN c.relkind IN ('r', 'p') THEN 'BASE TABLE' WHEN c.relkind = 'v' THEN 'VIEW' WHEN c.relkind = 'f' THEN 'FOREIGN TABLE' ELSE null END @@ -1910,7 +1925,7 @@ CREATE VIEW tables AS CAST(nt.nspname AS sql_identifier) AS user_defined_type_schema, CAST(t.typname AS sql_identifier) AS user_defined_type_name, - CAST(CASE WHEN c.relkind = 'r' OR + CAST(CASE WHEN c.relkind IN ('r', 'p') OR (c.relkind IN ('v', 'f') AND -- 1 << CMD_INSERT pg_relation_is_updatable(c.oid, false) & 8 = 8) @@ -1922,7 +1937,7 @@ CREATE VIEW tables AS FROM pg_namespace nc JOIN pg_class c ON (nc.oid = c.relnamespace) LEFT JOIN (pg_type t JOIN pg_namespace nt ON (t.typnamespace = nt.oid)) ON (c.reloftype = t.oid) - WHERE c.relkind IN ('r', 'v', 'f') + WHERE c.relkind IN ('r', 'v', 'f', 'p') AND (NOT pg_is_other_temp_schema(nc.oid)) AND (pg_has_role(c.relowner, 'USAGE') OR has_table_privilege(c.oid, 'SELECT, INSERT, UPDATE, DELETE, TRUNCATE, REFERENCES, TRIGGER') @@ -2068,7 +2083,7 @@ CREATE VIEW triggers AS -- XXX strange hacks follow CAST( CASE WHEN pg_has_role(c.relowner, 'USAGE') - THEN (SELECT m[1] FROM regexp_matches(pg_get_triggerdef(t.oid), E'.{35,} WHEN \\((.+)\\) EXECUTE PROCEDURE') AS rm(m) LIMIT 1) + THEN (regexp_match(pg_get_triggerdef(t.oid), E'.{35,} WHEN \\((.+)\\) EXECUTE PROCEDURE'))[1] ELSE null END AS character_data) AS action_condition, CAST( @@ -2441,7 +2456,7 @@ CREATE VIEW view_column_usage AS AND dt.refclassid = 'pg_catalog.pg_class'::regclass AND dt.refobjid = t.oid AND t.relnamespace = nt.oid - AND t.relkind IN ('r', 'v', 'f') + AND t.relkind IN ('r', 'v', 'f', 'p') AND t.oid = a.attrelid AND dt.refobjsubid = a.attnum AND pg_has_role(t.relowner, 'USAGE'); @@ -2519,7 +2534,7 @@ CREATE VIEW view_table_usage AS AND dt.refclassid = 'pg_catalog.pg_class'::regclass AND dt.refobjid = t.oid AND t.relnamespace = nt.oid - AND t.relkind IN ('r', 'v', 'f') + AND t.relkind IN ('r', 'v', 'f', 'p') AND pg_has_role(t.relowner, 'USAGE'); GRANT SELECT ON view_table_usage TO PUBLIC; @@ -2672,7 +2687,7 @@ CREATE VIEW element_types AS a.attnum, a.atttypid, a.attcollation FROM pg_class c, pg_attribute a WHERE c.oid = a.attrelid - AND c.relkind IN ('r', 'v', 'f', 'c') + AND c.relkind IN ('r', 'v', 'f', 'c', 'p') AND attnum > 0 AND NOT 
attisdropped UNION ALL diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 5caaef144f..d7f6075b13 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -10,7 +10,7 @@ * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -39,6 +39,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" #include "catalog/pg_ts_parser.h" @@ -64,6 +65,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" +#include "utils/varlena.h" /* @@ -209,22 +211,6 @@ static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, static void FindTemporaryNamespace(void); #endif -/* These don't really need to appear in any header file */ -Datum pg_table_is_visible(PG_FUNCTION_ARGS); -Datum pg_type_is_visible(PG_FUNCTION_ARGS); -Datum pg_function_is_visible(PG_FUNCTION_ARGS); -Datum pg_operator_is_visible(PG_FUNCTION_ARGS); -Datum pg_opclass_is_visible(PG_FUNCTION_ARGS); -Datum pg_opfamily_is_visible(PG_FUNCTION_ARGS); -Datum pg_collation_is_visible(PG_FUNCTION_ARGS); -Datum pg_conversion_is_visible(PG_FUNCTION_ARGS); -Datum pg_ts_parser_is_visible(PG_FUNCTION_ARGS); -Datum pg_ts_dict_is_visible(PG_FUNCTION_ARGS); -Datum pg_ts_template_is_visible(PG_FUNCTION_ARGS); -Datum pg_ts_config_is_visible(PG_FUNCTION_ARGS); -Datum pg_my_temp_schema(PG_FUNCTION_ARGS); -Datum pg_is_other_temp_schema(PG_FUNCTION_ARGS); - /* * RangeVarGetRelid @@ -2144,6 +2130,128 @@ ConversionIsVisible(Oid conid) } /* + * get_statistics_object_oid - find a statistics object by possibly qualified name + * + * If not found, returns InvalidOid if missing_ok, else throws error + */ +Oid +get_statistics_object_oid(List *names, bool missing_ok) +{ + char *schemaname; + char *stats_name; + Oid namespaceId; + Oid stats_oid = InvalidOid; + ListCell *l; + + /* deconstruct the name list */ + DeconstructQualifiedName(names, &schemaname, &stats_name); + + if (schemaname) + { + /* use exact schema given */ + namespaceId = LookupExplicitNamespace(schemaname, missing_ok); + if (missing_ok && !OidIsValid(namespaceId)) + stats_oid = InvalidOid; + else + stats_oid = GetSysCacheOid2(STATEXTNAMENSP, + PointerGetDatum(stats_name), + ObjectIdGetDatum(namespaceId)); + } + else + { + /* search for it in search path */ + recomputeNamespacePath(); + + foreach(l, activeSearchPath) + { + namespaceId = lfirst_oid(l); + + if (namespaceId == myTempNamespace) + continue; /* do not look in temp namespace */ + stats_oid = GetSysCacheOid2(STATEXTNAMENSP, + PointerGetDatum(stats_name), + ObjectIdGetDatum(namespaceId)); + if (OidIsValid(stats_oid)) + break; + } + } + + if (!OidIsValid(stats_oid) && !missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("statistics object \"%s\" does not exist", + NameListToString(names)))); + + return stats_oid; +} + +/* + * StatisticsObjIsVisible + * Determine whether a statistics object (identified by OID) is visible in + * the current search path. Visible means "would be found by searching + * for the unqualified statistics object name". 
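+ * + * (Annotation, not part of the original patch: a hedged caller-side sketch. + * Callers that print object names typically schema-qualify only when the + * object is not visible, e.g. + * char *nsp = StatisticsObjIsVisible(stxoid) ? NULL : get_namespace_name(nspoid); + * where "stxoid" is illustrative and "nspoid" is the object's stxnamespace + * as fetched from the syscache; get_namespace_name() is the existing + * lsyscache helper.)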
+ */ +bool +StatisticsObjIsVisible(Oid relid) +{ + HeapTuple stxtup; + Form_pg_statistic_ext stxform; + Oid stxnamespace; + bool visible; + + stxtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(stxtup)) + elog(ERROR, "cache lookup failed for statistics object %u", relid); + stxform = (Form_pg_statistic_ext) GETSTRUCT(stxtup); + + recomputeNamespacePath(); + + /* + * Quick check: if it ain't in the path at all, it ain't visible. Items in + * the system namespace are surely in the path and so we needn't even do + * list_member_oid() for them. + */ + stxnamespace = stxform->stxnamespace; + if (stxnamespace != PG_CATALOG_NAMESPACE && + !list_member_oid(activeSearchPath, stxnamespace)) + visible = false; + else + { + /* + * If it is in the path, it might still not be visible; it could be + * hidden by another statistics object of the same name earlier in the + * path. So we must do a slow check for conflicting objects. + */ + char *stxname = NameStr(stxform->stxname); + ListCell *l; + + visible = false; + foreach(l, activeSearchPath) + { + Oid namespaceId = lfirst_oid(l); + + if (namespaceId == stxnamespace) + { + /* Found it first in path */ + visible = true; + break; + } + if (SearchSysCacheExists2(STATEXTNAMENSP, + PointerGetDatum(stxname), + ObjectIdGetDatum(namespaceId))) + { + /* Found something else first in path */ + break; + } + } + } + + ReleaseSysCache(stxtup); + + return visible; +} + +/* * get_ts_parser_oid - find a TS parser by possibly qualified name * * If not found, returns InvalidOid if missing_ok, else throws error @@ -4013,14 +4121,19 @@ RemoveTempRelations(Oid tempNamespaceId) /* * We want to get rid of everything in the target namespace, but not the * namespace itself (deleting it only to recreate it later would be a - * waste of cycles). We do this by finding everything that has a - * dependency on the namespace. + * waste of cycles). Hence, specify SKIP_ORIGINAL. It's also an INTERNAL + * deletion, and we want to not drop any extensions that might happen to + * own temp objects. 
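+ * + * (Annotation, not part of the original patch: in the call below, + * PERFORM_DELETION_INTERNAL marks this as a system-initiated deletion, + * PERFORM_DELETION_QUIETLY suppresses the cascade notices, SKIP_ORIGINAL + * drops only the namespace's dependents while keeping the namespace row + * itself, and SKIP_EXTENSIONS avoids recursing into extensions that happen + * to own temp objects.)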
*/ object.classId = NamespaceRelationId; object.objectId = tempNamespaceId; object.objectSubId = 0; - deleteWhatDependsOn(&object, false); + performDeletion(&object, DROP_CASCADE, + PERFORM_DELETION_INTERNAL | + PERFORM_DELETION_QUIETLY | + PERFORM_DELETION_SKIP_ORIGINAL | + PERFORM_DELETION_SKIP_EXTENSIONS); } /* @@ -4380,6 +4493,17 @@ pg_conversion_is_visible(PG_FUNCTION_ARGS) } Datum +pg_statistics_obj_is_visible(PG_FUNCTION_ARGS) +{ + Oid oid = PG_GETARG_OID(0); + + if (!SearchSysCacheExists1(STATEXTOID, ObjectIdGetDatum(oid))) + PG_RETURN_NULL(); + + PG_RETURN_BOOL(StatisticsObjIsVisible(oid)); +} + +Datum pg_ts_parser_is_visible(PG_FUNCTION_ARGS) { Oid oid = PG_GETARG_OID(0); diff --git a/src/backend/catalog/objectaccess.c b/src/backend/catalog/objectaccess.c index 23103d0cb6..9d5eb7b9da 100644 --- a/src/backend/catalog/objectaccess.c +++ b/src/backend/catalog/objectaccess.c @@ -3,7 +3,7 @@ * objectaccess.c * functions for object_access_hook on various events * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * ------------------------------------------------------------------------- diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 8068b82eab..6a365dceec 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -3,7 +3,7 @@ * objectaddress.c * functions for working with ObjectAddresses * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -45,7 +45,11 @@ #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" #include "catalog/pg_policy.h" +#include "catalog/pg_publication.h" +#include "catalog/pg_publication_rel.h" #include "catalog/pg_rewrite.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_trigger.h" @@ -78,6 +82,7 @@ #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/regproc.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -450,6 +455,42 @@ static const ObjectPropertyType ObjectProperty[] = Anum_pg_type_typacl, ACL_KIND_TYPE, true + }, + { + PublicationRelationId, + PublicationObjectIndexId, + PUBLICATIONOID, + PUBLICATIONNAME, + Anum_pg_publication_pubname, + InvalidAttrNumber, + Anum_pg_publication_pubowner, + InvalidAttrNumber, + ACL_KIND_PUBLICATION, + true + }, + { + SubscriptionRelationId, + SubscriptionObjectIndexId, + SUBSCRIPTIONOID, + SUBSCRIPTIONNAME, + Anum_pg_subscription_subname, + InvalidAttrNumber, + Anum_pg_subscription_subowner, + InvalidAttrNumber, + ACL_KIND_SUBSCRIPTION, + true + }, + { + StatisticExtRelationId, + StatisticExtOidIndexId, + STATEXTOID, + STATEXTNAMENSP, + Anum_pg_statistic_ext_stxname, + Anum_pg_statistic_ext_stxnamespace, + Anum_pg_statistic_ext_stxowner, + InvalidAttrNumber, /* no ACL (same as relation) */ + ACL_KIND_STATISTICS, + true } }; @@ -653,9 +694,25 @@ static const struct object_type_map { "policy", OBJECT_POLICY }, + /* OCLASS_PUBLICATION */ + { + "publication", OBJECT_PUBLICATION + }, + /* OCLASS_PUBLICATION_REL */ + { + "publication relation", OBJECT_PUBLICATION_REL + }, + /* OCLASS_SUBSCRIPTION */ + { + "subscription", 
OBJECT_SUBSCRIPTION + }, /* OCLASS_TRANSFORM */ { "transform", OBJECT_TRANSFORM + }, + /* OBJECT_STATISTIC_EXT */ + { + "statistics object", OBJECT_STATISTIC_EXT } }; @@ -667,28 +724,31 @@ const ObjectAddress InvalidObjectAddress = }; static ObjectAddress get_object_address_unqualified(ObjectType objtype, - List *qualname, bool missing_ok); + Value *strval, bool missing_ok); static ObjectAddress get_relation_by_qualified_name(ObjectType objtype, - List *objname, Relation *relp, + List *object, Relation *relp, LOCKMODE lockmode, bool missing_ok); static ObjectAddress get_object_address_relobject(ObjectType objtype, - List *objname, Relation *relp, bool missing_ok); + List *object, Relation *relp, bool missing_ok); static ObjectAddress get_object_address_attribute(ObjectType objtype, - List *objname, Relation *relp, + List *object, Relation *relp, LOCKMODE lockmode, bool missing_ok); static ObjectAddress get_object_address_attrdef(ObjectType objtype, - List *objname, Relation *relp, LOCKMODE lockmode, + List *object, Relation *relp, LOCKMODE lockmode, bool missing_ok); static ObjectAddress get_object_address_type(ObjectType objtype, - ListCell *typecell, bool missing_ok); -static ObjectAddress get_object_address_opcf(ObjectType objtype, List *objname, + TypeName *typename, bool missing_ok); +static ObjectAddress get_object_address_opcf(ObjectType objtype, List *object, bool missing_ok); static ObjectAddress get_object_address_opf_member(ObjectType objtype, - List *objname, List *objargs, bool missing_ok); - -static ObjectAddress get_object_address_usermapping(List *objname, - List *objargs, bool missing_ok); -static ObjectAddress get_object_address_defacl(List *objname, List *objargs, + List *object, bool missing_ok); + +static ObjectAddress get_object_address_usermapping(List *object, + bool missing_ok); +static ObjectAddress get_object_address_publication_rel(List *object, + Relation *relp, + bool missing_ok); +static ObjectAddress get_object_address_defacl(List *object, bool missing_ok); static const ObjectPropertyType *get_object_property_data(Oid class_id); @@ -698,8 +758,8 @@ static void getRelationTypeDescription(StringInfo buffer, Oid relid, int32 objectSubId); static void getProcedureTypeDescription(StringInfo buffer, Oid procid); static void getConstraintTypeDescription(StringInfo buffer, Oid constroid); -static void getOpFamilyIdentity(StringInfo buffer, Oid opfid, List **objname); -static void getRelationIdentity(StringInfo buffer, Oid relid, List **objname); +static void getOpFamilyIdentity(StringInfo buffer, Oid opfid, List **object); +static void getRelationIdentity(StringInfo buffer, Oid relid, List **object); /* * Translate an object name and arguments (as passed by the parser) to an @@ -727,13 +787,13 @@ static void getRelationIdentity(StringInfo buffer, Oid relid, List **objname); * * Note: If the object is not found, we don't give any indication of the * reason. (It might have been a missing schema if the name was qualified, or - * an inexistant type name in case of a cast, function or operator; etc). + * a nonexistent type name in case of a cast, function or operator; etc). * Currently there is only one caller that might be interested in such info, so * we don't spend much effort here. If more callers start to care, it might be * better to add some support for that in this function. 
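* * (Annotation, not part of the original patch: a hedged caller-side sketch * against the new signature, with illustrative variable names: * Relation rel; * ObjectAddress addr = get_object_address(OBJECT_TABLE, (Node *) names, * &rel, AccessShareLock, false); * where "names" is the parser's List of String nodes for a possibly * qualified table name.)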
*/ ObjectAddress -get_object_address(ObjectType objtype, List *objname, List *objargs, +get_object_address(ObjectType objtype, Node *object, Relation *relp, LOCKMODE lockmode, bool missing_ok) { ObjectAddress address; @@ -763,19 +823,19 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, case OBJECT_MATVIEW: case OBJECT_FOREIGN_TABLE: address = - get_relation_by_qualified_name(objtype, objname, + get_relation_by_qualified_name(objtype, castNode(List, object), &relation, lockmode, missing_ok); break; case OBJECT_COLUMN: address = - get_object_address_attribute(objtype, objname, + get_object_address_attribute(objtype, castNode(List, object), &relation, lockmode, missing_ok); break; case OBJECT_DEFAULT: address = - get_object_address_attrdef(objtype, objname, + get_object_address_attrdef(objtype, castNode(List, object), &relation, lockmode, missing_ok); break; @@ -783,17 +843,20 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, case OBJECT_TRIGGER: case OBJECT_TABCONSTRAINT: case OBJECT_POLICY: - address = get_object_address_relobject(objtype, objname, + address = get_object_address_relobject(objtype, castNode(List, object), &relation, missing_ok); break; case OBJECT_DOMCONSTRAINT: { + List *objlist; ObjectAddress domaddr; char *constrname; + objlist = castNode(List, object); domaddr = get_object_address_type(OBJECT_DOMAIN, - list_head(objname), missing_ok); - constrname = strVal(linitial(objargs)); + linitial_node(TypeName, objlist), + missing_ok); + constrname = strVal(lsecond(objlist)); address.classId = ConstraintRelationId; address.objectId = get_domain_constraint_oid(domaddr.objectId, @@ -812,58 +875,51 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, case OBJECT_FOREIGN_SERVER: case OBJECT_EVENT_TRIGGER: case OBJECT_ACCESS_METHOD: + case OBJECT_PUBLICATION: + case OBJECT_SUBSCRIPTION: address = get_object_address_unqualified(objtype, - objname, missing_ok); + (Value *) object, missing_ok); break; case OBJECT_TYPE: case OBJECT_DOMAIN: - address = get_object_address_type(objtype, list_head(objname), missing_ok); + address = get_object_address_type(objtype, castNode(TypeName, object), missing_ok); break; case OBJECT_AGGREGATE: address.classId = ProcedureRelationId; - address.objectId = - LookupAggNameTypeNames(objname, objargs, missing_ok); + address.objectId = LookupAggWithArgs(castNode(ObjectWithArgs, object), missing_ok); address.objectSubId = 0; break; case OBJECT_FUNCTION: address.classId = ProcedureRelationId; - address.objectId = - LookupFuncNameTypeNames(objname, objargs, missing_ok); + address.objectId = LookupFuncWithArgs(castNode(ObjectWithArgs, object), missing_ok); address.objectSubId = 0; break; case OBJECT_OPERATOR: - Assert(list_length(objargs) == 2); address.classId = OperatorRelationId; - address.objectId = - LookupOperNameTypeNames(NULL, objname, - (TypeName *) linitial(objargs), - (TypeName *) lsecond(objargs), - missing_ok, -1); + address.objectId = LookupOperWithArgs(castNode(ObjectWithArgs, object), missing_ok); address.objectSubId = 0; break; case OBJECT_COLLATION: address.classId = CollationRelationId; - address.objectId = get_collation_oid(objname, missing_ok); + address.objectId = get_collation_oid(castNode(List, object), missing_ok); address.objectSubId = 0; break; case OBJECT_CONVERSION: address.classId = ConversionRelationId; - address.objectId = get_conversion_oid(objname, missing_ok); + address.objectId = get_conversion_oid(castNode(List, object), missing_ok); address.objectSubId = 0; break; 
case OBJECT_OPCLASS: case OBJECT_OPFAMILY: - address = get_object_address_opcf(objtype, objname, missing_ok); + address = get_object_address_opcf(objtype, castNode(List, object), missing_ok); break; case OBJECT_AMOP: case OBJECT_AMPROC: - address = get_object_address_opf_member(objtype, objname, - objargs, missing_ok); + address = get_object_address_opf_member(objtype, castNode(List, object), missing_ok); break; case OBJECT_LARGEOBJECT: - Assert(list_length(objname) == 1); address.classId = LargeObjectRelationId; - address.objectId = oidparse(linitial(objname)); + address.objectId = oidparse(object); address.objectSubId = 0; if (!LargeObjectExists(address.objectId)) { @@ -876,8 +932,8 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, break; case OBJECT_CAST: { - TypeName *sourcetype = (TypeName *) linitial(objname); - TypeName *targettype = (TypeName *) linitial(objargs); + TypeName *sourcetype = linitial_node(TypeName, castNode(List, object)); + TypeName *targettype = lsecond_node(TypeName, castNode(List, object)); Oid sourcetypeid; Oid targettypeid; @@ -891,8 +947,8 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, break; case OBJECT_TRANSFORM: { - TypeName *typename = (TypeName *) linitial(objname); - char *langname = strVal(linitial(objargs)); + TypeName *typename = linitial_node(TypeName, castNode(List, object)); + char *langname = strVal(lsecond(castNode(List, object))); Oid type_id = LookupTypeNameOid(NULL, typename, missing_ok); Oid lang_id = get_language_oid(langname, missing_ok); @@ -904,32 +960,43 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, break; case OBJECT_TSPARSER: address.classId = TSParserRelationId; - address.objectId = get_ts_parser_oid(objname, missing_ok); + address.objectId = get_ts_parser_oid(castNode(List, object), missing_ok); address.objectSubId = 0; break; case OBJECT_TSDICTIONARY: address.classId = TSDictionaryRelationId; - address.objectId = get_ts_dict_oid(objname, missing_ok); + address.objectId = get_ts_dict_oid(castNode(List, object), missing_ok); address.objectSubId = 0; break; case OBJECT_TSTEMPLATE: address.classId = TSTemplateRelationId; - address.objectId = get_ts_template_oid(objname, missing_ok); + address.objectId = get_ts_template_oid(castNode(List, object), missing_ok); address.objectSubId = 0; break; case OBJECT_TSCONFIGURATION: address.classId = TSConfigRelationId; - address.objectId = get_ts_config_oid(objname, missing_ok); + address.objectId = get_ts_config_oid(castNode(List, object), missing_ok); address.objectSubId = 0; break; case OBJECT_USER_MAPPING: - address = get_object_address_usermapping(objname, objargs, + address = get_object_address_usermapping(castNode(List, object), missing_ok); break; + case OBJECT_PUBLICATION_REL: + address = get_object_address_publication_rel(castNode(List, object), + &relation, + missing_ok); + break; case OBJECT_DEFACL: - address = get_object_address_defacl(objname, objargs, + address = get_object_address_defacl(castNode(List, object), missing_ok); break; + case OBJECT_STATISTIC_EXT: + address.classId = StatisticExtRelationId; + address.objectId = get_statistics_object_oid(castNode(List, object), + missing_ok); + address.objectSubId = 0; + break; default: elog(ERROR, "unrecognized objtype: %d", (int) objtype); /* placate compiler, in case it thinks elog might return */ @@ -1018,25 +1085,25 @@ get_object_address(ObjectType objtype, List *objname, List *objargs, /* * Return an ObjectAddress based on a RangeVar and an object name. 
The * name of the relation identified by the RangeVar is prepended to the - * (possibly empty) list passed in as objname. This is useful to find + * (possibly empty) list passed in as object. This is useful to find * the ObjectAddress of objects that depend on a relation. All other * considerations are exactly as for get_object_address above. */ ObjectAddress -get_object_address_rv(ObjectType objtype, RangeVar *rel, List *objname, - List *objargs, Relation *relp, LOCKMODE lockmode, +get_object_address_rv(ObjectType objtype, RangeVar *rel, List *object, + Relation *relp, LOCKMODE lockmode, bool missing_ok) { if (rel) { - objname = lcons(makeString(rel->relname), objname); + object = lcons(makeString(rel->relname), object); if (rel->schemaname) - objname = lcons(makeString(rel->schemaname), objname); + object = lcons(makeString(rel->schemaname), object); if (rel->catalogname) - objname = lcons(makeString(rel->catalogname), objname); + object = lcons(makeString(rel->catalogname), object); } - return get_object_address(objtype, objname, objargs, + return get_object_address(objtype, (Node *) object, relp, lockmode, missing_ok); } @@ -1046,62 +1113,12 @@ get_object_address_rv(ObjectType objtype, RangeVar *rel, List *objname, */ static ObjectAddress get_object_address_unqualified(ObjectType objtype, - List *qualname, bool missing_ok) + Value *strval, bool missing_ok) { const char *name; ObjectAddress address; - /* - * The types of names handled by this function are not permitted to be - * schema-qualified or catalog-qualified. - */ - if (list_length(qualname) != 1) - { - const char *msg; - - switch (objtype) - { - case OBJECT_ACCESS_METHOD: - msg = gettext_noop("access method name cannot be qualified"); - break; - case OBJECT_DATABASE: - msg = gettext_noop("database name cannot be qualified"); - break; - case OBJECT_EXTENSION: - msg = gettext_noop("extension name cannot be qualified"); - break; - case OBJECT_TABLESPACE: - msg = gettext_noop("tablespace name cannot be qualified"); - break; - case OBJECT_ROLE: - msg = gettext_noop("role name cannot be qualified"); - break; - case OBJECT_SCHEMA: - msg = gettext_noop("schema name cannot be qualified"); - break; - case OBJECT_LANGUAGE: - msg = gettext_noop("language name cannot be qualified"); - break; - case OBJECT_FDW: - msg = gettext_noop("foreign-data wrapper name cannot be qualified"); - break; - case OBJECT_FOREIGN_SERVER: - msg = gettext_noop("server name cannot be qualified"); - break; - case OBJECT_EVENT_TRIGGER: - msg = gettext_noop("event trigger name cannot be qualified"); - break; - default: - elog(ERROR, "unrecognized objtype: %d", (int) objtype); - msg = NULL; /* placate compiler */ - } - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s", _(msg)))); - } - - /* Format is valid, extract the actual name. */ - name = strVal(linitial(qualname)); + name = strVal(strval); /* Translate name to OID. 
*/ switch (objtype) @@ -1156,6 +1173,16 @@ get_object_address_unqualified(ObjectType objtype, address.objectId = get_event_trigger_oid(name, missing_ok); address.objectSubId = 0; break; + case OBJECT_PUBLICATION: + address.classId = PublicationRelationId; + address.objectId = get_publication_oid(name, missing_ok); + address.objectSubId = 0; + break; + case OBJECT_SUBSCRIPTION: + address.classId = SubscriptionRelationId; + address.objectId = get_subscription_oid(name, missing_ok); + address.objectSubId = 0; + break; default: elog(ERROR, "unrecognized objtype: %d", (int) objtype); /* placate compiler, which doesn't know elog won't return */ @@ -1171,7 +1198,7 @@ get_object_address_unqualified(ObjectType objtype, * Locate a relation by qualified name. */ static ObjectAddress -get_relation_by_qualified_name(ObjectType objtype, List *objname, +get_relation_by_qualified_name(ObjectType objtype, List *object, Relation *relp, LOCKMODE lockmode, bool missing_ok) { @@ -1182,7 +1209,7 @@ get_relation_by_qualified_name(ObjectType objtype, List *objname, address.objectId = InvalidOid; address.objectSubId = 0; - relation = relation_openrv_extended(makeRangeVarFromNameList(objname), + relation = relation_openrv_extended(makeRangeVarFromNameList(object), lockmode, missing_ok); if (!relation) return address; @@ -1204,7 +1231,8 @@ get_relation_by_qualified_name(ObjectType objtype, List *objname, RelationGetRelationName(relation)))); break; case OBJECT_TABLE: - if (relation->rd_rel->relkind != RELKIND_RELATION) + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table", @@ -1251,102 +1279,74 @@ get_relation_by_qualified_name(ObjectType objtype, List *objname, * mode for the object itself, not the relation to which it is attached. */ static ObjectAddress -get_object_address_relobject(ObjectType objtype, List *objname, +get_object_address_relobject(ObjectType objtype, List *object, Relation *relp, bool missing_ok) { ObjectAddress address; Relation relation = NULL; int nnames; const char *depname; + List *relname; + Oid reloid; /* Extract name of dependent object. */ - depname = strVal(llast(objname)); + depname = strVal(llast(object)); /* Separate relation name from dependent object name. */ - nnames = list_length(objname); + nnames = list_length(object); if (nnames < 2) - { - Oid reloid; - - /* - * For compatibility with very old releases, we sometimes allow users - * to attempt to specify a rule without mentioning the relation name. - * If there's only rule by that name in the entire database, this will - * work. But objects other than rules don't get this special - * treatment. - */ - if (objtype != OBJECT_RULE) - elog(ERROR, "must specify relation and object name"); - address.classId = RewriteRelationId; - address.objectId = - get_rewrite_oid_without_relid(depname, &reloid, missing_ok); - address.objectSubId = 0; - - /* - * Caller is expecting to get back the relation, even though we didn't - * end up using it to find the rule. - */ - if (OidIsValid(address.objectId)) - relation = heap_open(reloid, AccessShareLock); - } - else - { - List *relname; - Oid reloid; + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("must specify relation and object name"))); - /* Extract relation name and open relation. 
*/ - relname = list_truncate(list_copy(objname), nnames - 1); - relation = heap_openrv_extended(makeRangeVarFromNameList(relname), - AccessShareLock, - missing_ok); + /* Extract relation name and open relation. */ + relname = list_truncate(list_copy(object), nnames - 1); + relation = heap_openrv_extended(makeRangeVarFromNameList(relname), + AccessShareLock, + missing_ok); - reloid = relation ? RelationGetRelid(relation) : InvalidOid; + reloid = relation ? RelationGetRelid(relation) : InvalidOid; - switch (objtype) - { - case OBJECT_RULE: - address.classId = RewriteRelationId; - address.objectId = relation ? - get_rewrite_oid(reloid, depname, missing_ok) : InvalidOid; - address.objectSubId = 0; - break; - case OBJECT_TRIGGER: - address.classId = TriggerRelationId; - address.objectId = relation ? - get_trigger_oid(reloid, depname, missing_ok) : InvalidOid; - address.objectSubId = 0; - break; - case OBJECT_TABCONSTRAINT: - address.classId = ConstraintRelationId; - address.objectId = relation ? - get_relation_constraint_oid(reloid, depname, missing_ok) : - InvalidOid; - address.objectSubId = 0; - break; - case OBJECT_POLICY: - address.classId = PolicyRelationId; - address.objectId = relation ? - get_relation_policy_oid(reloid, depname, missing_ok) : - InvalidOid; - address.objectSubId = 0; - break; - default: - elog(ERROR, "unrecognized objtype: %d", (int) objtype); - /* placate compiler, which doesn't know elog won't return */ - address.classId = InvalidOid; - address.objectId = InvalidOid; - address.objectSubId = 0; - } + switch (objtype) + { + case OBJECT_RULE: + address.classId = RewriteRelationId; + address.objectId = relation ? + get_rewrite_oid(reloid, depname, missing_ok) : InvalidOid; + address.objectSubId = 0; + break; + case OBJECT_TRIGGER: + address.classId = TriggerRelationId; + address.objectId = relation ? + get_trigger_oid(reloid, depname, missing_ok) : InvalidOid; + address.objectSubId = 0; + break; + case OBJECT_TABCONSTRAINT: + address.classId = ConstraintRelationId; + address.objectId = relation ? + get_relation_constraint_oid(reloid, depname, missing_ok) : + InvalidOid; + address.objectSubId = 0; + break; + case OBJECT_POLICY: + address.classId = PolicyRelationId; + address.objectId = relation ? + get_relation_policy_oid(reloid, depname, missing_ok) : + InvalidOid; + address.objectSubId = 0; + break; + default: + elog(ERROR, "unrecognized objtype: %d", (int) objtype); + } - /* Avoid relcache leak when object not found. */ - if (!OidIsValid(address.objectId)) - { - if (relation != NULL) - heap_close(relation, AccessShareLock); + /* Avoid relcache leak when object not found. */ + if (!OidIsValid(address.objectId)) + { + if (relation != NULL) + heap_close(relation, AccessShareLock); - relation = NULL; /* department of accident prevention */ - return address; - } + relation = NULL; /* department of accident prevention */ + return address; } /* Done. */ @@ -1358,7 +1358,7 @@ get_object_address_relobject(ObjectType objtype, List *objname, * Find the ObjectAddress for an attribute. */ static ObjectAddress -get_object_address_attribute(ObjectType objtype, List *objname, +get_object_address_attribute(ObjectType objtype, List *object, Relation *relp, LOCKMODE lockmode, bool missing_ok) { @@ -1370,12 +1370,12 @@ get_object_address_attribute(ObjectType objtype, List *objname, AttrNumber attnum; /* Extract relation name and open relation. 
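(Annotation, not part of the original patch: the last list element is the attribute name and the preceding elements form the qualified relation name; illustratively, column "c" of table "public"."t" arrives as ("public", "t", "c").)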
*/ - if (list_length(objname) < 2) + if (list_length(object) < 2) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("column name must be qualified"))); - attname = strVal(lfirst(list_tail(objname))); - relname = list_truncate(list_copy(objname), list_length(objname) - 1); + attname = strVal(lfirst(list_tail(object))); + relname = list_truncate(list_copy(object), list_length(object) - 1); /* XXX no missing_ok support here */ relation = relation_openrv(makeRangeVarFromNameList(relname), lockmode); reloid = RelationGetRelid(relation); @@ -1409,7 +1409,7 @@ get_object_address_attribute(ObjectType objtype, List *objname, * Find the ObjectAddress for an attribute's default value. */ static ObjectAddress -get_object_address_attrdef(ObjectType objtype, List *objname, +get_object_address_attrdef(ObjectType objtype, List *object, Relation *relp, LOCKMODE lockmode, bool missing_ok) { @@ -1423,12 +1423,12 @@ get_object_address_attrdef(ObjectType objtype, List *objname, Oid defoid; /* Extract relation name and open relation. */ - if (list_length(objname) < 2) + if (list_length(object) < 2) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("column name must be qualified"))); - attname = strVal(llast(objname)); - relname = list_truncate(list_copy(objname), list_length(objname) - 1); + attname = strVal(llast(object)); + relname = list_truncate(list_copy(object), list_length(object) - 1); /* XXX no missing_ok support here */ relation = relation_openrv(makeRangeVarFromNameList(relname), lockmode); reloid = RelationGetRelid(relation); @@ -1491,14 +1491,11 @@ get_object_address_attrdef(ObjectType objtype, List *objname, * Find the ObjectAddress for a type or domain */ static ObjectAddress -get_object_address_type(ObjectType objtype, ListCell *typecell, bool missing_ok) +get_object_address_type(ObjectType objtype, TypeName *typename, bool missing_ok) { ObjectAddress address; - TypeName *typename; Type tup; - typename = (TypeName *) lfirst(typecell); - address.classId = TypeRelationId; address.objectId = InvalidOid; address.objectSubId = 0; @@ -1533,25 +1530,25 @@ get_object_address_type(ObjectType objtype, ListCell *typecell, bool missing_ok) * Find the ObjectAddress for an opclass or opfamily. 
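* * (Annotation, not part of the original patch: the name list carries the * access method first; illustratively, the btree opclass "int4_ops" arrives * here as list_make2(makeString("btree"), makeString("int4_ops")), and the * code strips that first element before resolving the remainder.)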
*/ static ObjectAddress -get_object_address_opcf(ObjectType objtype, List *objname, bool missing_ok) +get_object_address_opcf(ObjectType objtype, List *object, bool missing_ok) { Oid amoid; ObjectAddress address; /* XXX no missing_ok support here */ - amoid = get_index_am_oid(strVal(linitial(objname)), false); - objname = list_copy_tail(objname, 1); + amoid = get_index_am_oid(strVal(linitial(object)), false); + object = list_copy_tail(object, 1); switch (objtype) { case OBJECT_OPCLASS: address.classId = OperatorClassRelationId; - address.objectId = get_opclass_oid(amoid, objname, missing_ok); + address.objectId = get_opclass_oid(amoid, object, missing_ok); address.objectSubId = 0; break; case OBJECT_OPFAMILY: address.classId = OperatorFamilyRelationId; - address.objectId = get_opfamily_oid(amoid, objname, missing_ok); + address.objectId = get_opfamily_oid(amoid, object, missing_ok); address.objectSubId = 0; break; default: @@ -1572,36 +1569,36 @@ get_object_address_opcf(ObjectType objtype, List *objname, bool missing_ok) */ static ObjectAddress get_object_address_opf_member(ObjectType objtype, - List *objname, List *objargs, bool missing_ok) + List *object, bool missing_ok) { ObjectAddress famaddr; ObjectAddress address; ListCell *cell; List *copy; - char *typenames[2]; + TypeName *typenames[2]; Oid typeoids[2]; int membernum; int i; /* - * The last element of the objname list contains the strategy or procedure + * The last element of the object list contains the strategy or procedure * number. We need to strip that out before getting the opclass/family * address. The rest can be used directly by get_object_address_opcf(). */ - membernum = atoi(strVal(llast(objname))); - copy = list_truncate(list_copy(objname), list_length(objname) - 1); + membernum = atoi(strVal(llast(linitial(object)))); + copy = list_truncate(list_copy(linitial(object)), list_length(linitial(object)) - 1); /* no missing_ok support here */ famaddr = get_object_address_opcf(OBJECT_OPFAMILY, copy, false); /* find out left/right type names and OIDs */ i = 0; - foreach(cell, objargs) + foreach(cell, lsecond(object)) { ObjectAddress typaddr; - typenames[i] = strVal(lfirst(cell)); - typaddr = get_object_address_type(OBJECT_TYPE, cell, missing_ok); + typenames[i] = lfirst_node(TypeName, cell); + typaddr = get_object_address_type(OBJECT_TYPE, typenames[i], missing_ok); typeoids[i] = typaddr.objectId; if (++i >= 2) break; @@ -1627,7 +1624,9 @@ get_object_address_opf_member(ObjectType objtype, ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("operator %d (%s, %s) of %s does not exist", - membernum, typenames[0], typenames[1], + membernum, + TypeNameToString(typenames[0]), + TypeNameToString(typenames[1]), getObjectDescription(&famaddr)))); } else @@ -1656,7 +1655,9 @@ get_object_address_opf_member(ObjectType objtype, ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("function %d (%s, %s) of %s does not exist", - membernum, typenames[0], typenames[1], + membernum, + TypeNameToString(typenames[0]), + TypeNameToString(typenames[1]), getObjectDescription(&famaddr)))); } else @@ -1677,7 +1678,7 @@ get_object_address_opf_member(ObjectType objtype, * Find the ObjectAddress for a user mapping. 
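* * (Annotation, not part of the original patch: the object list is * (user name, server name); illustratively, USER MAPPING FOR alice SERVER s1 * is passed in as list_make2(makeString("alice"), makeString("s1")), with * the string "public" standing for PUBLIC.)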
*/ static ObjectAddress -get_object_address_usermapping(List *objname, List *objargs, bool missing_ok) +get_object_address_usermapping(List *object, bool missing_ok) { ObjectAddress address; Oid userid; @@ -1689,8 +1690,8 @@ get_object_address_usermapping(List *objname, List *objargs, bool missing_ok) ObjectAddressSet(address, UserMappingRelationId, InvalidOid); /* fetch string names from input lists, for error messages */ - username = strVal(linitial(objname)); - servername = strVal(linitial(objargs)); + username = strVal(linitial(object)); + servername = strVal(lsecond(object)); /* look up pg_authid OID of mapped user; InvalidOid if PUBLIC */ if (strcmp(username, "public") == 0) @@ -1743,10 +1744,64 @@ get_object_address_usermapping(List *objname, List *objargs, bool missing_ok) } /* + * Find the ObjectAddress for a publication relation. The first element of + * the object parameter is the relation name, the second is the + * publication name. + */ +static ObjectAddress +get_object_address_publication_rel(List *object, + Relation *relp, bool missing_ok) +{ + ObjectAddress address; + Relation relation; + List *relname; + char *pubname; + Publication *pub; + + ObjectAddressSet(address, PublicationRelRelationId, InvalidOid); + + relname = linitial(object); + relation = relation_openrv_extended(makeRangeVarFromNameList(relname), + AccessShareLock, missing_ok); + if (!relation) + return address; + + /* fetch publication name from input list */ + pubname = strVal(lsecond(object)); + + /* Now look up the pg_publication tuple */ + pub = GetPublicationByName(pubname, missing_ok); + if (!pub) + { + relation_close(relation, AccessShareLock); + return address; + } + + /* Find the publication relation mapping in syscache. */ + address.objectId = + GetSysCacheOid2(PUBLICATIONRELMAP, + ObjectIdGetDatum(RelationGetRelid(relation)), + ObjectIdGetDatum(pub->oid)); + if (!OidIsValid(address.objectId)) + { + if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("publication relation \"%s\" in publication \"%s\" does not exist", + RelationGetRelationName(relation), pubname))); + relation_close(relation, AccessShareLock); + return address; + } + + *relp = relation; + return address; +} + +/* * Find the ObjectAddress for a default ACL. */ static ObjectAddress -get_object_address_defacl(List *objname, List *objargs, bool missing_ok) +get_object_address_defacl(List *object, bool missing_ok) { HeapTuple tp; Oid userid; @@ -1763,9 +1818,9 @@ get_object_address_defacl(List *objname, List *objargs, bool missing_ok) * First figure out the textual attributes so that they can be used for * error reporting. */ - username = strVal(linitial(objname)); - if (list_length(objname) >= 2) - schema = (char *) strVal(lsecond(objname)); + username = strVal(lsecond(object)); + if (list_length(object) >= 3) + schema = (char *) strVal(lthird(object)); else schema = NULL; @@ -1773,7 +1828,7 @@ get_object_address_defacl(List *objname, List *objargs, bool missing_ok) * Decode defaclobjtype. Only first char is considered; the rest of the * string, if any, is blissfully ignored. 
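* (Illustratively, both "r" and "relation" decode to DEFACLOBJ_RELATION * below, since only the leading character matters.)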
*/ - objtype = ((char *) strVal(linitial(objargs)))[0]; + objtype = ((char *) strVal(linitial(object)))[0]; switch (objtype) { case DEFACLOBJ_RELATION: @@ -1788,11 +1843,14 @@ get_object_address_defacl(List *objname, List *objargs, bool missing_ok) case DEFACLOBJ_TYPE: objtype_str = "types"; break; + case DEFACLOBJ_NAMESPACE: + objtype_str = "schemas"; + break; default: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized default ACL object type %c", objtype), - errhint("Valid object types are \"r\", \"S\", \"f\", and \"T\"."))); + errhint("Valid object types are \"r\", \"S\", \"f\", \"T\" and \"s\"."))); } /* @@ -1888,8 +1946,10 @@ pg_get_object_address(PG_FUNCTION_ARGS) ArrayType *argsarr = PG_GETARG_ARRAYTYPE_P(2); int itype; ObjectType type; - List *name; - List *args; + List *name = NIL; + TypeName *typename = NULL; + List *args = NIL; + Node *objnode = NULL; ObjectAddress addr; TupleDesc tupdesc; Datum values[3]; @@ -1927,7 +1987,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("name or argument lists may not contain nulls"))); - name = list_make1(typeStringToTypeName(TextDatumGetCString(elems[0]))); + typename = typeStringToTypeName(TextDatumGetCString(elems[0])); } else if (type == OBJECT_LARGEOBJECT) { @@ -1945,7 +2005,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("large object OID may not be null"))); - name = list_make1(makeFloat(TextDatumGetCString(elems[0]))); + objnode = (Node *) makeFloat(TextDatumGetCString(elems[0])); } else { @@ -1993,7 +2053,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) } /* - * get_object_name is pretty sensitive to the length its input lists; + * get_object_address is pretty sensitive to the length its input lists; * check that they're what it wants. */ switch (type) @@ -2001,6 +2061,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) case OBJECT_DOMCONSTRAINT: case OBJECT_CAST: case OBJECT_USER_MAPPING: + case OBJECT_PUBLICATION_REL: case OBJECT_DEFACL: case OBJECT_TRANSFORM: if (list_length(args) != 1) @@ -2032,7 +2093,97 @@ pg_get_object_address(PG_FUNCTION_ARGS) break; } - addr = get_object_address(type, name, args, + /* + * Now build the Node type that get_object_address() expects for the given + * type. 
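+ * + * (Annotation, not part of the original patch: for example, a cast becomes + * list_make2(typename, linitial(args)), i.e. (source TypeName, target type + * string), while a function or aggregate is wrapped in an ObjectWithArgs + * node carrying the qualified name plus its argument type list.)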
+ */ + switch (type) + { + case OBJECT_TABLE: + case OBJECT_SEQUENCE: + case OBJECT_VIEW: + case OBJECT_MATVIEW: + case OBJECT_INDEX: + case OBJECT_FOREIGN_TABLE: + case OBJECT_COLUMN: + case OBJECT_ATTRIBUTE: + case OBJECT_COLLATION: + case OBJECT_CONVERSION: + case OBJECT_STATISTIC_EXT: + case OBJECT_TSPARSER: + case OBJECT_TSDICTIONARY: + case OBJECT_TSTEMPLATE: + case OBJECT_TSCONFIGURATION: + case OBJECT_DEFAULT: + case OBJECT_POLICY: + case OBJECT_RULE: + case OBJECT_TRIGGER: + case OBJECT_TABCONSTRAINT: + case OBJECT_OPCLASS: + case OBJECT_OPFAMILY: + objnode = (Node *) name; + break; + case OBJECT_ACCESS_METHOD: + case OBJECT_DATABASE: + case OBJECT_EVENT_TRIGGER: + case OBJECT_EXTENSION: + case OBJECT_FDW: + case OBJECT_FOREIGN_SERVER: + case OBJECT_LANGUAGE: + case OBJECT_PUBLICATION: + case OBJECT_ROLE: + case OBJECT_SCHEMA: + case OBJECT_SUBSCRIPTION: + case OBJECT_TABLESPACE: + if (list_length(name) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("name list length must be exactly %d", 1))); + objnode = linitial(name); + break; + case OBJECT_TYPE: + case OBJECT_DOMAIN: + objnode = (Node *) typename; + break; + case OBJECT_CAST: + case OBJECT_DOMCONSTRAINT: + case OBJECT_TRANSFORM: + objnode = (Node *) list_make2(typename, linitial(args)); + break; + case OBJECT_PUBLICATION_REL: + objnode = (Node *) list_make2(name, linitial(args)); + break; + case OBJECT_USER_MAPPING: + objnode = (Node *) list_make2(linitial(name), linitial(args)); + break; + case OBJECT_DEFACL: + objnode = (Node *) lcons(linitial(args), name); + break; + case OBJECT_AMOP: + case OBJECT_AMPROC: + objnode = (Node *) list_make2(name, args); + break; + case OBJECT_FUNCTION: + case OBJECT_AGGREGATE: + case OBJECT_OPERATOR: + { + ObjectWithArgs *owa = makeNode(ObjectWithArgs); + + owa->objname = name; + owa->objargs = args; + objnode = (Node *) owa; + break; + } + case OBJECT_LARGEOBJECT: + /* already handled above */ + break; + /* no default, to let compiler warn about missing case */ + } + + if (objnode == NULL) + elog(ERROR, "unrecognized object type: %d", type); + + addr = get_object_address(type, objnode, &relation, AccessShareLock, false); /* We don't need the relcache entry, thank you very much */ @@ -2065,7 +2216,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) */ void check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, - List *objname, List *objargs, Relation relation) + Node *object, Relation relation) { switch (objtype) { @@ -2087,7 +2238,7 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, case OBJECT_DATABASE: if (!pg_database_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_DATABASE, - NameListToString(objname)); + strVal((Value *) object)); break; case OBJECT_TYPE: case OBJECT_DOMAIN: @@ -2100,62 +2251,62 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, case OBJECT_FUNCTION: if (!pg_proc_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_PROC, - NameListToString(objname)); + NameListToString((castNode(ObjectWithArgs, object))->objname)); break; case OBJECT_OPERATOR: if (!pg_oper_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_OPER, - NameListToString(objname)); + NameListToString((castNode(ObjectWithArgs, object))->objname)); break; case OBJECT_SCHEMA: if (!pg_namespace_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_NAMESPACE, - NameListToString(objname)); + 
strVal((Value *) object)); break; case OBJECT_COLLATION: if (!pg_collation_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_COLLATION, - NameListToString(objname)); + NameListToString(castNode(List, object))); break; case OBJECT_CONVERSION: if (!pg_conversion_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CONVERSION, - NameListToString(objname)); + NameListToString(castNode(List, object))); break; case OBJECT_EXTENSION: if (!pg_extension_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_EXTENSION, - NameListToString(objname)); + strVal((Value *) object)); break; case OBJECT_FDW: if (!pg_foreign_data_wrapper_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_FDW, - NameListToString(objname)); + strVal((Value *) object)); break; case OBJECT_FOREIGN_SERVER: if (!pg_foreign_server_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_FOREIGN_SERVER, - NameListToString(objname)); + strVal((Value *) object)); break; case OBJECT_EVENT_TRIGGER: if (!pg_event_trigger_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_EVENT_TRIGGER, - NameListToString(objname)); + strVal((Value *) object)); break; case OBJECT_LANGUAGE: if (!pg_language_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_LANGUAGE, - NameListToString(objname)); + strVal((Value *) object)); break; case OBJECT_OPCLASS: if (!pg_opclass_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_OPCLASS, - NameListToString(objname)); + NameListToString(castNode(List, object))); break; case OBJECT_OPFAMILY: if (!pg_opfamily_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_OPFAMILY, - NameListToString(objname)); + NameListToString(castNode(List, object))); break; case OBJECT_LARGEOBJECT: if (!lo_compat_privileges && @@ -2168,8 +2319,8 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, case OBJECT_CAST: { /* We can only check permissions on the source/target types */ - TypeName *sourcetype = (TypeName *) linitial(objname); - TypeName *targettype = (TypeName *) linitial(objargs); + TypeName *sourcetype = linitial_node(TypeName, castNode(List, object)); + TypeName *targettype = lsecond_node(TypeName, castNode(List, object)); Oid sourcetypeid = typenameTypeId(NULL, sourcetype); Oid targettypeid = typenameTypeId(NULL, targettype); @@ -2182,9 +2333,19 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, format_type_be(targettypeid)))); } break; + case OBJECT_PUBLICATION: + if (!pg_publication_ownercheck(address.objectId, roleid)) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_PUBLICATION, + strVal((Value *) object)); + break; + case OBJECT_SUBSCRIPTION: + if (!pg_subscription_ownercheck(address.objectId, roleid)) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_SUBSCRIPTION, + strVal((Value *) object)); + break; case OBJECT_TRANSFORM: { - TypeName *typename = (TypeName *) linitial(objname); + TypeName *typename = linitial_node(TypeName, castNode(List, object)); Oid typeid = typenameTypeId(NULL, typename); if (!pg_type_ownercheck(typeid, roleid)) @@ -2194,17 +2355,17 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, case OBJECT_TABLESPACE: if (!pg_tablespace_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TABLESPACE, - NameListToString(objname)); + 
strVal((Value *) object)); break; case OBJECT_TSDICTIONARY: if (!pg_ts_dict_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TSDICTIONARY, - NameListToString(objname)); + NameListToString(castNode(List, object))); break; case OBJECT_TSCONFIGURATION: if (!pg_ts_config_ownercheck(address.objectId, roleid)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TSCONFIGURATION, - NameListToString(objname)); + NameListToString(castNode(List, object))); break; case OBJECT_ROLE: @@ -2236,6 +2397,10 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser"))); break; + case OBJECT_STATISTIC_EXT: + if (!pg_statistics_object_ownercheck(address.objectId, roleid)) + aclcheck_error_type(ACLCHECK_NOT_OWNER, address.objectId); + break; default: elog(ERROR, "unrecognized object type: %d", (int) objtype); @@ -2290,23 +2455,18 @@ get_object_namespace(const ObjectAddress *address) int read_objtype_from_string(const char *objtype) { - ObjectType type; int i; for (i = 0; i < lengthof(ObjectTypeMap); i++) { if (strcmp(ObjectTypeMap[i].tm_name, objtype) == 0) - { - type = ObjectTypeMap[i].tm_type; - break; - } + return ObjectTypeMap[i].tm_type; } - if (i >= lengthof(ObjectTypeMap)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized object type \"%s\"", objtype))); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized object type \"%s\"", objtype))); - return type; + return -1; /* keep compiler quiet */ } /* @@ -2709,6 +2869,21 @@ getObjectDescription(const ObjectAddress *object) getOpFamilyDescription(&buffer, object->objectId); break; + case OCLASS_AM: + { + HeapTuple tup; + + tup = SearchSysCache1(AMOID, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for access method %u", + object->objectId); + appendStringInfo(&buffer, _("access method %s"), + NameStr(((Form_pg_am) GETSTRUCT(tup))->amname)); + ReleaseSysCache(tup); + break; + } + case OCLASS_AMOP: { Relation amopDesc; @@ -2844,27 +3019,6 @@ getObjectDescription(const ObjectAddress *object) break; } - case OCLASS_TRANSFORM: - { - HeapTuple trfTup; - Form_pg_transform trfForm; - - trfTup = SearchSysCache1(TRFOID, - ObjectIdGetDatum(object->objectId)); - if (!HeapTupleIsValid(trfTup)) - elog(ERROR, "could not find tuple for transform %u", - object->objectId); - - trfForm = (Form_pg_transform) GETSTRUCT(trfTup); - - appendStringInfo(&buffer, _("transform for %s language %s"), - format_type_be(trfForm->trftype), - get_language_name(trfForm->trflang, false)); - - ReleaseSysCache(trfTup); - break; - } - case OCLASS_TRIGGER: { Relation trigDesc; @@ -2912,6 +3066,26 @@ getObjectDescription(const ObjectAddress *object) break; } + case OCLASS_STATISTIC_EXT: + { + HeapTuple stxTup; + Form_pg_statistic_ext stxForm; + + stxTup = SearchSysCache1(STATEXTOID, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(stxTup)) + elog(ERROR, "could not find tuple for statistics object %u", + object->objectId); + + stxForm = (Form_pg_statistic_ext) GETSTRUCT(stxTup); + + appendStringInfo(&buffer, _("statistics object %s"), + NameStr(stxForm->stxname)); + + ReleaseSysCache(stxTup); + break; + } + case OCLASS_TSPARSER: { HeapTuple tup; @@ -3098,6 +3272,11 @@ getObjectDescription(const ObjectAddress *object) _("default privileges on new types belonging to role %s"), GetUserNameFromId(defacl->defaclrole, false)); break; + case 
DEFACLOBJ_NAMESPACE: + appendStringInfo(&buffer, + _("default privileges on new schemas belonging to role %s"), + GetUserNameFromId(defacl->defaclrole, false)); + break; default: /* shouldn't get here */ appendStringInfo(&buffer, @@ -3180,27 +3359,80 @@ getObjectDescription(const ObjectAddress *object) break; } - case OCLASS_AM: + case OCLASS_PUBLICATION: + { + appendStringInfo(&buffer, _("publication %s"), + get_publication_name(object->objectId)); + break; + } + + case OCLASS_PUBLICATION_REL: { HeapTuple tup; + char *pubname; + Form_pg_publication_rel prform; - tup = SearchSysCache1(AMOID, + tup = SearchSysCache1(PUBLICATIONREL, ObjectIdGetDatum(object->objectId)); if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for access method %u", + elog(ERROR, "cache lookup failed for publication table %u", object->objectId); - appendStringInfo(&buffer, _("access method %s"), - NameStr(((Form_pg_am) GETSTRUCT(tup))->amname)); + + prform = (Form_pg_publication_rel) GETSTRUCT(tup); + pubname = get_publication_name(prform->prpubid); + + appendStringInfo(&buffer, _("publication table %s in publication %s"), + get_rel_name(prform->prrelid), pubname); ReleaseSysCache(tup); break; } - default: - appendStringInfo(&buffer, "unrecognized object %u %u %d", - object->classId, - object->objectId, - object->objectSubId); - break; + case OCLASS_SUBSCRIPTION: + { + appendStringInfo(&buffer, _("subscription %s"), + get_subscription_name(object->objectId)); + break; + } + + case OCLASS_TRANSFORM: + { + HeapTuple trfTup; + Form_pg_transform trfForm; + + trfTup = SearchSysCache1(TRFOID, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(trfTup)) + elog(ERROR, "could not find tuple for transform %u", + object->objectId); + + trfForm = (Form_pg_transform) GETSTRUCT(trfTup); + + appendStringInfo(&buffer, _("transform for %s language %s"), + format_type_be(trfForm->trftype), + get_language_name(trfForm->trflang, false)); + + ReleaseSysCache(trfTup); + break; + } + + case OCLASS_PGXC_NODE: + { + appendStringInfo(&buffer, _("node %s"), + get_pgxc_nodename(object->objectId)); + break; + } + + case OCLASS_PGXC_GROUP: + { + appendStringInfo(&buffer, _("node group %s"), + get_pgxc_groupname(object->objectId)); + break; + } + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new OCLASS hasn't been handled above. 
+ */ } return buffer.data; @@ -3249,6 +3481,7 @@ getRelationDescription(StringInfo buffer, Oid relid) switch (relForm->relkind) { case RELKIND_RELATION: + case RELKIND_PARTITIONED_TABLE: appendStringInfo(buffer, _("table %s"), relname); break; @@ -3336,7 +3569,7 @@ pg_describe_object(PG_FUNCTION_ARGS) { Oid classid = PG_GETARG_OID(0); Oid objid = PG_GETARG_OID(1); - int32 subobjid = PG_GETARG_INT32(2); + int32 objsubid = PG_GETARG_INT32(2); char *description; ObjectAddress address; @@ -3346,7 +3579,7 @@ pg_describe_object(PG_FUNCTION_ARGS) address.classId = classid; address.objectId = objid; - address.objectSubId = subobjid; + address.objectSubId = objsubid; description = getObjectDescription(&address); PG_RETURN_TEXT_P(cstring_to_text(description)); @@ -3360,7 +3593,7 @@ pg_identify_object(PG_FUNCTION_ARGS) { Oid classid = PG_GETARG_OID(0); Oid objid = PG_GETARG_OID(1); - int32 subobjid = PG_GETARG_INT32(2); + int32 objsubid = PG_GETARG_INT32(2); Oid schema_oid = InvalidOid; const char *objname = NULL; ObjectAddress address; @@ -3371,7 +3604,7 @@ pg_identify_object(PG_FUNCTION_ARGS) address.classId = classid; address.objectId = objid; - address.objectSubId = subobjid; + address.objectSubId = objsubid; /* * Construct a tuple descriptor for the result row. This must match this @@ -3476,7 +3709,7 @@ pg_identify_object_as_address(PG_FUNCTION_ARGS) { Oid classid = PG_GETARG_OID(0); Oid objid = PG_GETARG_OID(1); - int32 subobjid = PG_GETARG_INT32(2); + int32 objsubid = PG_GETARG_INT32(2); ObjectAddress address; char *identity; List *names; @@ -3488,7 +3721,7 @@ pg_identify_object_as_address(PG_FUNCTION_ARGS) address.classId = classid; address.objectId = objid; - address.objectSubId = subobjid; + address.objectSubId = objsubid; /* * Construct a tuple descriptor for the result row. This must match this @@ -3596,6 +3829,10 @@ getObjectTypeDescription(const ObjectAddress *object) appendStringInfoString(&buffer, "operator family"); break; + case OCLASS_AM: + appendStringInfoString(&buffer, "access method"); + break; + case OCLASS_AMOP: appendStringInfoString(&buffer, "operator of access method"); break; @@ -3616,6 +3853,10 @@ getObjectTypeDescription(const ObjectAddress *object) appendStringInfoString(&buffer, "schema"); break; + case OCLASS_STATISTIC_EXT: + appendStringInfoString(&buffer, "statistics object"); + break; + case OCLASS_TSPARSER: appendStringInfoString(&buffer, "text search parser"); break; @@ -3672,17 +3913,38 @@ getObjectTypeDescription(const ObjectAddress *object) appendStringInfoString(&buffer, "policy"); break; + case OCLASS_PUBLICATION: + appendStringInfoString(&buffer, "publication"); + break; + + case OCLASS_PUBLICATION_REL: + appendStringInfoString(&buffer, "publication relation"); + break; + + case OCLASS_SUBSCRIPTION: + appendStringInfoString(&buffer, "subscription"); + break; + case OCLASS_TRANSFORM: appendStringInfoString(&buffer, "transform"); break; - case OCLASS_AM: - appendStringInfoString(&buffer, "access method"); + case OCLASS_PGXC_CLASS: + appendStringInfoString(&buffer, "pgxc_class"); break; - default: - appendStringInfo(&buffer, "unrecognized %u", object->classId); + case OCLASS_PGXC_NODE: + appendStringInfoString(&buffer, "node"); + break; + + case OCLASS_PGXC_GROUP: + appendStringInfoString(&buffer, "node group"); break; + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new OCLASS hasn't been handled above. 
+ */ } return buffer.data; @@ -3706,6 +3968,7 @@ getRelationTypeDescription(StringInfo buffer, Oid relid, int32 objectSubId) switch (relForm->relkind) { case RELKIND_RELATION: + case RELKIND_PARTITIONED_TABLE: appendStringInfoString(buffer, "table"); break; case RELKIND_INDEX: @@ -4097,6 +4360,20 @@ getObjectIdentityParts(const ObjectAddress *object, getOpFamilyIdentity(&buffer, object->objectId, objname); break; + case OCLASS_AM: + { + char *amname; + + amname = get_am_name(object->objectId); + if (!amname) + elog(ERROR, "cache lookup failed for access method %u", + object->objectId); + appendStringInfoString(&buffer, quote_identifier(amname)); + if (objname) + *objname = list_make1(amname); + } + break; + case OCLASS_AMOP: { Relation amopDesc; @@ -4257,32 +4534,6 @@ getObjectIdentityParts(const ObjectAddress *object, break; } - case OCLASS_POLICY: - { - Relation polDesc; - HeapTuple tup; - Form_pg_policy policy; - - polDesc = heap_open(PolicyRelationId, AccessShareLock); - - tup = get_catalog_object_by_oid(polDesc, object->objectId); - - if (!HeapTupleIsValid(tup)) - elog(ERROR, "could not find tuple for policy %u", - object->objectId); - - policy = (Form_pg_policy) GETSTRUCT(tup); - - appendStringInfo(&buffer, "%s on ", - quote_identifier(NameStr(policy->polname))); - getRelationIdentity(&buffer, policy->polrelid, objname); - if (objname) - *objname = lappend(*objname, pstrdup(NameStr(policy->polname))); - - heap_close(polDesc, AccessShareLock); - break; - } - case OCLASS_SCHEMA: { char *nspname; @@ -4298,6 +4549,29 @@ getObjectIdentityParts(const ObjectAddress *object, break; } + case OCLASS_STATISTIC_EXT: + { + HeapTuple tup; + Form_pg_statistic_ext formStatistic; + char *schema; + + tup = SearchSysCache1(STATEXTOID, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for statistics object %u", + object->objectId); + formStatistic = (Form_pg_statistic_ext) GETSTRUCT(tup); + schema = get_namespace_name_or_temp(formStatistic->stxnamespace); + appendStringInfoString(&buffer, + quote_qualified_identifier(schema, + NameStr(formStatistic->stxname))); + if (objname) + *objname = list_make2(schema, + pstrdup(NameStr(formStatistic->stxname))); + ReleaseSysCache(tup); + } + break; + case OCLASS_TSPARSER: { HeapTuple tup; @@ -4552,6 +4826,10 @@ getObjectIdentityParts(const ObjectAddress *object, appendStringInfoString(&buffer, " on types"); break; + case DEFACLOBJ_NAMESPACE: + appendStringInfoString(&buffer, + " on schemas"); + break; } if (objname) @@ -4602,6 +4880,84 @@ getObjectIdentityParts(const ObjectAddress *object, break; } + case OCLASS_POLICY: + { + Relation polDesc; + HeapTuple tup; + Form_pg_policy policy; + + polDesc = heap_open(PolicyRelationId, AccessShareLock); + + tup = get_catalog_object_by_oid(polDesc, object->objectId); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "could not find tuple for policy %u", + object->objectId); + + policy = (Form_pg_policy) GETSTRUCT(tup); + + appendStringInfo(&buffer, "%s on ", + quote_identifier(NameStr(policy->polname))); + getRelationIdentity(&buffer, policy->polrelid, objname); + if (objname) + *objname = lappend(*objname, pstrdup(NameStr(policy->polname))); + + heap_close(polDesc, AccessShareLock); + break; + } + + case OCLASS_PUBLICATION: + { + char *pubname; + + pubname = get_publication_name(object->objectId); + appendStringInfoString(&buffer, + quote_identifier(pubname)); + if (objname) + *objname = list_make1(pubname); + break; + } + + case OCLASS_PUBLICATION_REL: + { + HeapTuple tup; 
+ char *pubname; + Form_pg_publication_rel prform; + + tup = SearchSysCache1(PUBLICATIONREL, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for publication table %u", + object->objectId); + + prform = (Form_pg_publication_rel) GETSTRUCT(tup); + pubname = get_publication_name(prform->prpubid); + + appendStringInfo(&buffer, _("%s in publication %s"), + get_rel_name(prform->prrelid), pubname); + + if (objname) + { + getRelationIdentity(&buffer, prform->prrelid, objname); + *objargs = list_make1(pubname); + } + + ReleaseSysCache(tup); + break; + } + + case OCLASS_SUBSCRIPTION: + { + char *subname; + + subname = get_subscription_name(object->objectId); + appendStringInfoString(&buffer, + quote_identifier(subname)); + if (objname) + *objname = list_make1(subname); + break; + } + case OCLASS_TRANSFORM: { Relation transformDesc; @@ -4635,27 +4991,44 @@ getObjectIdentityParts(const ObjectAddress *object, heap_close(transformDesc, AccessShareLock); } break; + + case OCLASS_PGXC_CLASS: + /* + * XXX PG10MERGE: ISTM that we don't record dependencies on + * pgxc_class, pgxc_node and pgxc_group. So it's not clear if we + * really need corresponding OCLASS_* either. We should check this + * in more detail. + */ + break; - case OCLASS_AM: + case OCLASS_PGXC_NODE: { - char *amname; + char *nodename; - amname = get_am_name(object->objectId); - if (!amname) - elog(ERROR, "cache lookup failed for access method %u", - object->objectId); - appendStringInfoString(&buffer, quote_identifier(amname)); + nodename = get_pgxc_nodename(object->objectId); if (objname) - *objname = list_make1(amname); + *objname = list_make1(nodename); + appendStringInfoString(&buffer, + quote_identifier(nodename)); + break; } - break; - default: - appendStringInfo(&buffer, "unrecognized object %u %u %d", - object->classId, - object->objectId, - object->objectSubId); - break; + case OCLASS_PGXC_GROUP: + { + char *groupname; + + groupname = get_pgxc_groupname(object->objectId); + if (objname) + *objname = list_make1(groupname); + appendStringInfoString(&buffer, + quote_identifier(groupname)); + break; + } + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new OCLASS hasn't been handled above. + */ } /* @@ -4671,7 +5044,7 @@ getObjectIdentityParts(const ObjectAddress *object, } static void -getOpFamilyIdentity(StringInfo buffer, Oid opfid, List **objname) +getOpFamilyIdentity(StringInfo buffer, Oid opfid, List **object) { HeapTuple opfTup; Form_pg_opfamily opfForm; @@ -4696,10 +5069,10 @@ getOpFamilyIdentity(StringInfo buffer, Oid opfid, List **objname) NameStr(opfForm->opfname)), NameStr(amForm->amname)); - if (objname) - *objname = list_make3(pstrdup(NameStr(amForm->amname)), - pstrdup(schema), - pstrdup(NameStr(opfForm->opfname))); + if (object) + *object = list_make3(pstrdup(NameStr(amForm->amname)), + pstrdup(schema), + pstrdup(NameStr(opfForm->opfname))); ReleaseSysCache(amTup); ReleaseSysCache(opfTup); @@ -4710,7 +5083,7 @@ getOpFamilyIdentity(StringInfo buffer, Oid opfid, List **objname) * StringInfo. 
*/ static void -getRelationIdentity(StringInfo buffer, Oid relid, List **objname) +getRelationIdentity(StringInfo buffer, Oid relid, List **object) { HeapTuple relTup; Form_pg_class relForm; @@ -4726,8 +5099,8 @@ getRelationIdentity(StringInfo buffer, Oid relid, List **objname) appendStringInfoString(buffer, quote_qualified_identifier(schema, NameStr(relForm->relname))); - if (objname) - *objname = list_make2(schema, pstrdup(NameStr(relForm->relname))); + if (object) + *object = list_make2(schema, pstrdup(NameStr(relForm->relname))); ReleaseSysCache(relTup); } @@ -4747,9 +5120,7 @@ strlist_to_textarray(List *list) memcxt = AllocSetContextCreate(CurrentMemoryContext, "strlist to array", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(memcxt); datums = palloc(sizeof(text *) * list_length(list)); diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c new file mode 100644 index 0000000000..37fa1458be --- /dev/null +++ b/src/backend/catalog/partition.c @@ -0,0 +1,2314 @@ +/*------------------------------------------------------------------------- + * + * partition.c + * Partitioning related data structures and functions. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/catalog/partition.c + * + *------------------------------------------------------------------------- +*/ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/sysattr.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/objectaddress.h" +#include "catalog/partition.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_inherits_fn.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_type.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/parsenodes.h" +#include "optimizer/clauses.h" +#include "optimizer/planmain.h" +#include "optimizer/var.h" +#include "rewrite/rewriteManip.h" +#include "storage/lmgr.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/memutils.h" +#include "utils/fmgroids.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/ruleutils.h" +#include "utils/syscache.h" + +/* + * Information about bounds of a partitioned relation + * + * A list partition datum that is known to be NULL is never put into the + * datums array. Instead, it is tracked using the null_index field. + * + * In the case of range partitioning, ndatums will typically be far less than + * 2 * nparts, because a partition's upper bound and the next partition's lower + * bound are the same in most common cases, and we only store one of them. + * + * In the case of list partitioning, the indexes array stores one entry for + * every datum, which is the index of the partition that accepts a given datum. + * In case of range partitioning, it stores one entry per distinct range + * datum, which is the index of the partition for which a given datum + * is an upper bound. 
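+ *
+ * As a hypothetical illustration (partition names invented here): given
+ * range partitions p1 FOR VALUES FROM (1) TO (10) and p2 FOR VALUES
+ * FROM (10) TO (20), ndatums is 3, datums is {1, 10, 20} and indexes is
+ * {-1, 0, 1, -1}; the shared bound 10 is stored only once, as the upper
+ * bound of p1. Likewise, for list partitions p0 FOR VALUES IN (1, 3)
+ * and p1 FOR VALUES IN (2, NULL), ndatums is 3, datums is {1, 2, 3},
+ * indexes is {0, 1, 0} and null_index is 1.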
+ */ + +/* Ternary value to represent what's contained in a range bound datum */ +typedef enum RangeDatumContent +{ + RANGE_DATUM_FINITE = 0, /* actual datum stored elsewhere */ + RANGE_DATUM_NEG_INF, /* negative infinity */ + RANGE_DATUM_POS_INF /* positive infinity */ +} RangeDatumContent; + +typedef struct PartitionBoundInfoData +{ + char strategy; /* list or range bounds? */ + int ndatums; /* Length of the following datums array */ + Datum **datums; /* Array of datum-tuples with key->partnatts + * datums each */ + RangeDatumContent **content; /* what's contained in each range bound datum? + * (see the above enum); NULL for list + * partitioned tables */ + int *indexes; /* Partition indexes; one entry per member of + * the datums array (plus one if range + * partitioned table) */ + int null_index; /* Index of the null-accepting partition; -1 + * if there isn't one */ +} PartitionBoundInfoData; + +#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) + +/* + * When qsort'ing partition bounds after reading from the catalog, each bound + * is represented with one of the following structs. + */ + +/* One value coming from some (index'th) list partition */ +typedef struct PartitionListValue +{ + int index; + Datum value; +} PartitionListValue; + +/* One bound of a range partition */ +typedef struct PartitionRangeBound +{ + int index; + Datum *datums; /* range bound datums */ + RangeDatumContent *content; /* what's contained in each datum? */ + bool lower; /* this is the lower (vs upper) bound */ +} PartitionRangeBound; + +static int32 qsort_partition_list_value_cmp(const void *a, const void *b, + void *arg); +static int32 qsort_partition_rbound_cmp(const void *a, const void *b, + void *arg); + +static Oid get_partition_operator(PartitionKey key, int col, + StrategyNumber strategy, bool *need_relabel); +static Expr *make_partition_op_expr(PartitionKey key, int keynum, + uint16 strategy, Expr *arg1, Expr *arg2); +static void get_range_key_properties(PartitionKey key, int keynum, + PartitionRangeDatum *ldatum, + PartitionRangeDatum *udatum, + ListCell **partexprs_item, + Expr **keyCol, + Const **lower_val, Const **upper_val); +static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec); +static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec); +static List *generate_partition_qual(Relation rel); + +static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, + List *datums, bool lower); +static int32 partition_rbound_cmp(PartitionKey key, + Datum *datums1, RangeDatumContent *content1, bool lower1, + PartitionRangeBound *b2); +static int32 partition_rbound_datum_cmp(PartitionKey key, + Datum *rb_datums, RangeDatumContent *rb_content, + Datum *tuple_datums); + +static int32 partition_bound_cmp(PartitionKey key, + PartitionBoundInfo boundinfo, + int offset, void *probe, bool probe_is_bound); +static int partition_bound_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + void *probe, bool probe_is_bound, bool *is_equal); + +/* + * RelationBuildPartitionDesc + * Form rel's partition descriptor + * + * Not flushed from the cache by RelationClearRelation() unless changed because + * of addition or removal of a partition.
+ */ +void +RelationBuildPartitionDesc(Relation rel) +{ + List *inhoids, + *partoids; + Oid *oids = NULL; + List *boundspecs = NIL; + ListCell *cell; + int i, + nparts; + PartitionKey key = RelationGetPartitionKey(rel); + PartitionDesc result; + MemoryContext oldcxt; + + int ndatums = 0; + + /* List partitioning specific */ + PartitionListValue **all_values = NULL; + int null_index = -1; + + /* Range partitioning specific */ + PartitionRangeBound **rbounds = NULL; + + /* + * The following could happen in situations where rel has a pg_class entry + * but not the pg_partitioned_table entry yet. + */ + if (key == NULL) + return; + + /* Get partition oids from pg_inherits */ + inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + + /* Collect bound spec nodes in a list */ + i = 0; + partoids = NIL; + foreach(cell, inhoids) + { + Oid inhrelid = lfirst_oid(cell); + HeapTuple tuple; + Datum datum; + bool isnull; + Node *boundspec; + + tuple = SearchSysCache1(RELOID, inhrelid); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", inhrelid); + + /* + * It is possible that the pg_class tuple of a partition has not been + * updated yet to set its relpartbound field. The only case where + * this happens is when we open the parent relation to check using its + * partition descriptor that a new partition's bound does not overlap + * some existing partition. + */ + if (!((Form_pg_class) GETSTRUCT(tuple))->relispartition) + { + ReleaseSysCache(tuple); + continue; + } + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + Assert(!isnull); + boundspec = (Node *) stringToNode(TextDatumGetCString(datum)); + boundspecs = lappend(boundspecs, boundspec); + partoids = lappend_oid(partoids, inhrelid); + ReleaseSysCache(tuple); + } + + nparts = list_length(partoids); + + if (nparts > 0) + { + oids = (Oid *) palloc(nparts * sizeof(Oid)); + i = 0; + foreach(cell, partoids) + oids[i++] = lfirst_oid(cell); + + /* Convert from node to the internal representation */ + if (key->strategy == PARTITION_STRATEGY_LIST) + { + List *non_null_values = NIL; + + /* + * Create a unified list of non-null values across all partitions. + */ + i = 0; + null_index = -1; + foreach(cell, boundspecs) + { + PartitionBoundSpec *spec = castNode(PartitionBoundSpec, + lfirst(cell)); + ListCell *c; + + if (spec->strategy != PARTITION_STRATEGY_LIST) + elog(ERROR, "invalid strategy in partition bound spec"); + + foreach(c, spec->listdatums) + { + Const *val = castNode(Const, lfirst(c)); + PartitionListValue *list_value = NULL; + + if (!val->constisnull) + { + list_value = (PartitionListValue *) + palloc0(sizeof(PartitionListValue)); + list_value->index = i; + list_value->value = val->constvalue; + } + else + { + /* + * Never put a null into the values array, flag + * instead for the code further down below where we + * construct the actual relcache struct. + */ + if (null_index != -1) + elog(ERROR, "found null more than once"); + null_index = i; + } + + if (list_value) + non_null_values = lappend(non_null_values, + list_value); + } + + i++; + } + + ndatums = list_length(non_null_values); + + /* + * Collect all list values in one array. Alongside the value, we + * also save the index of partition the value comes from. 
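+ *
+ * As a hypothetical illustration: with partitions p0 FOR VALUES IN
+ * (3, 1) and p1 FOR VALUES IN (2), the array sorted below comes out as
+ * {1, 2, 3}, with source partition indexes {0, 1, 0}.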
+ */ + all_values = (PartitionListValue **) palloc(ndatums * + sizeof(PartitionListValue *)); + i = 0; + foreach(cell, non_null_values) + { + PartitionListValue *src = lfirst(cell); + + all_values[i] = (PartitionListValue *) + palloc(sizeof(PartitionListValue)); + all_values[i]->value = src->value; + all_values[i]->index = src->index; + i++; + } + + qsort_arg(all_values, ndatums, sizeof(PartitionListValue *), + qsort_partition_list_value_cmp, (void *) key); + } + else if (key->strategy == PARTITION_STRATEGY_RANGE) + { + int j, + k; + PartitionRangeBound **all_bounds, + *prev; + bool *distinct_indexes; + + all_bounds = (PartitionRangeBound **) palloc0(2 * nparts * + sizeof(PartitionRangeBound *)); + distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool)); + + /* + * Create a unified list of range bounds across all the + * partitions. + */ + i = j = 0; + foreach(cell, boundspecs) + { + PartitionBoundSpec *spec = castNode(PartitionBoundSpec, + lfirst(cell)); + PartitionRangeBound *lower, + *upper; + + if (spec->strategy != PARTITION_STRATEGY_RANGE) + elog(ERROR, "invalid strategy in partition bound spec"); + + lower = make_one_range_bound(key, i, spec->lowerdatums, + true); + upper = make_one_range_bound(key, i, spec->upperdatums, + false); + all_bounds[j] = lower; + all_bounds[j + 1] = upper; + j += 2; + i++; + } + Assert(j == 2 * nparts); + + /* Sort all the bounds in ascending order */ + qsort_arg(all_bounds, 2 * nparts, + sizeof(PartitionRangeBound *), + qsort_partition_rbound_cmp, + (void *) key); + + /* + * Count the number of distinct bounds to allocate an array of + * that size. + */ + ndatums = 0; + prev = NULL; + for (i = 0; i < 2 * nparts; i++) + { + PartitionRangeBound *cur = all_bounds[i]; + bool is_distinct = false; + int j; + + /* Is the current bound distinct from the previous one? */ + for (j = 0; j < key->partnatts; j++) + { + Datum cmpval; + + if (prev == NULL) + { + is_distinct = true; + break; + } + + /* + * If either of them has an infinite element, we can't equate + * them. Even when both are infinite, they'd have + * opposite signs, because only one of cur and prev is a + * lower bound. + */ + if (cur->content[j] != RANGE_DATUM_FINITE || + prev->content[j] != RANGE_DATUM_FINITE) + { + is_distinct = true; + break; + } + cmpval = FunctionCall2Coll(&key->partsupfunc[j], + key->partcollation[j], + cur->datums[j], + prev->datums[j]); + if (DatumGetInt32(cmpval) != 0) + { + is_distinct = true; + break; + } + } + + /* + * Count the current bound if it is distinct from the previous + * one. Also, store if the index i contains a distinct bound + * that we'd like to put in the relcache array. + */ + if (is_distinct) + { + distinct_indexes[i] = true; + ndatums++; + } + else + distinct_indexes[i] = false; + + prev = cur; + } + + /* + * Finally save them in an array from where they will be copied + * into the relcache.
+ */ + rbounds = (PartitionRangeBound **) palloc(ndatums * + sizeof(PartitionRangeBound *)); + k = 0; + for (i = 0; i < 2 * nparts; i++) + { + if (distinct_indexes[i]) + rbounds[k++] = all_bounds[i]; + } + Assert(k == ndatums); + } + else + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + /* Now build the actual relcache partition descriptor */ + rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext, + RelationGetRelationName(rel), + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt); + + result = (PartitionDescData *) palloc0(sizeof(PartitionDescData)); + result->nparts = nparts; + if (nparts > 0) + { + PartitionBoundInfo boundinfo; + int *mapping; + int next_index = 0; + + result->oids = (Oid *) palloc0(nparts * sizeof(Oid)); + + boundinfo = (PartitionBoundInfoData *) + palloc0(sizeof(PartitionBoundInfoData)); + boundinfo->strategy = key->strategy; + boundinfo->ndatums = ndatums; + boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); + + /* Initialize mapping array with invalid values */ + mapping = (int *) palloc(sizeof(int) * nparts); + for (i = 0; i < nparts; i++) + mapping[i] = -1; + + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + { + boundinfo->indexes = (int *) palloc(ndatums * sizeof(int)); + + /* + * Copy values. Indexes of individual values are mapped + * to canonical values so that they match for any two list + * partitioned tables with the same number of partitions + * and the same lists per partition. One way to + * canonicalize is to assign the index in all_values[] of + * the smallest value of each partition, as the index of + * all of the partition's values. + */ + for (i = 0; i < ndatums; i++) + { + boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum)); + boundinfo->datums[i][0] = datumCopy(all_values[i]->value, + key->parttypbyval[0], + key->parttyplen[0]); + + /* If the old index has no mapping, assign one */ + if (mapping[all_values[i]->index] == -1) + mapping[all_values[i]->index] = next_index++; + + boundinfo->indexes[i] = mapping[all_values[i]->index]; + } + + /* + * If the null-accepting partition has no mapped index + * yet, assign one. This could happen if such a partition + * accepts only null and hence is not covered by the above + * loop, which only handles non-null values. + */ + if (null_index != -1) + { + Assert(null_index >= 0); + if (mapping[null_index] == -1) + mapping[null_index] = next_index++; + boundinfo->null_index = mapping[null_index]; + } + else + boundinfo->null_index = -1; + + /* All partitions must now have a valid mapping */ + Assert(next_index == nparts); + break; + } + + case PARTITION_STRATEGY_RANGE: + { + boundinfo->content = (RangeDatumContent **) palloc(ndatums * + sizeof(RangeDatumContent *)); + boundinfo->indexes = (int *) palloc((ndatums + 1) * + sizeof(int)); + + for (i = 0; i < ndatums; i++) + { + int j; + + boundinfo->datums[i] = (Datum *) palloc(key->partnatts * + sizeof(Datum)); + boundinfo->content[i] = (RangeDatumContent *) + palloc(key->partnatts * + sizeof(RangeDatumContent)); + for (j = 0; j < key->partnatts; j++) + { + if (rbounds[i]->content[j] == RANGE_DATUM_FINITE) + boundinfo->datums[i][j] = + datumCopy(rbounds[i]->datums[j], + key->parttypbyval[j], + key->parttyplen[j]); + /* Remember, we are storing the tri-state value. */ + boundinfo->content[i][j] = rbounds[i]->content[j]; + } + + /* + * There is no mapping for invalid indexes.
+ * + * Any lower bounds in the rbounds array have invalid + * indexes assigned, because the values between the + * previous bound (if there is one) and this (lower) + * bound are not part of the range of any existing + * partition. + */ + if (rbounds[i]->lower) + boundinfo->indexes[i] = -1; + else + { + int orig_index = rbounds[i]->index; + + /* If the old index has no mapping, assign one */ + if (mapping[orig_index] == -1) + mapping[orig_index] = next_index++; + + boundinfo->indexes[i] = mapping[orig_index]; + } + } + boundinfo->indexes[i] = -1; + break; + } + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + result->boundinfo = boundinfo; + + /* + * Now assign OIDs from the original array into mapped indexes of the + * result array. Order of OIDs in the former is defined by the + * catalog scan that retrieved them, whereas that in the latter is + * defined by the canonicalized representation of the list values or + * the range bounds. + */ + for (i = 0; i < nparts; i++) + result->oids[mapping[i]] = oids[i]; + pfree(mapping); + } + + MemoryContextSwitchTo(oldcxt); + rel->rd_partdesc = result; +} + +/* + * Are two partition bound collections logically equal? + * + * Used in the keep logic of relcache.c (ie, in RelationClearRelation()). + * This is also useful when b1 and b2 are bound collections of two separate + * relations, because PartitionBoundInfo is a canonical representation of + * partition bounds. + */ +bool +partition_bounds_equal(PartitionKey key, + PartitionBoundInfo b1, PartitionBoundInfo b2) +{ + int i; + + if (b1->strategy != b2->strategy) + return false; + + if (b1->ndatums != b2->ndatums) + return false; + + if (b1->null_index != b2->null_index) + return false; + + for (i = 0; i < b1->ndatums; i++) + { + int j; + + for (j = 0; j < key->partnatts; j++) + { + /* For range partitions, the bounds might not be finite. */ + if (b1->content != NULL) + { + /* + * A finite bound always differs from an infinite bound, and + * different kinds of infinities differ from each other. + */ + if (b1->content[i][j] != b2->content[i][j]) + return false; + + /* Non-finite bounds are equal without further examination. */ + if (b1->content[i][j] != RANGE_DATUM_FINITE) + continue; + } + + /* + * Compare the actual values. Note that it would be both incorrect + * and unsafe to invoke the comparison operator derived from the + * partitioning specification here. It would be incorrect because + * we want the relcache entry to be updated for ANY change to the + * partition bounds, not just those that the partitioning operator + * thinks are significant. It would be unsafe because we might + * reach this code in the context of an aborted transaction, and + * an arbitrary partitioning operator might not be safe in that + * context. datumIsEqual() should be simple enough to be safe. + */ + if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j], + key->parttypbyval[j], + key->parttyplen[j])) + return false; + } + + if (b1->indexes[i] != b2->indexes[i]) + return false; + } + + /* There are ndatums+1 indexes in case of range partitions */ + if (key->strategy == PARTITION_STRATEGY_RANGE && + b1->indexes[i] != b2->indexes[i]) + return false; + + return true; +} + +/* + * check_new_partition_bound + * + * Checks if the new partition's bound overlaps any of the existing partitions + * of parent. Also performs additional checks as necessary per strategy.
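+ *
+ * For example, if a (hypothetical) partition already exists FOR VALUES
+ * FROM (1) TO (10), adding one FOR VALUES FROM (5) TO (15) raises the
+ * "would overlap" error below, while a spec of FROM (10) TO (10) is
+ * rejected up front with "cannot create range partition with empty
+ * range".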
+ */ +void +check_new_partition_bound(char *relname, Relation parent, + PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + PartitionDesc partdesc = RelationGetPartitionDesc(parent); + ParseState *pstate = make_parsestate(NULL); + int with = -1; + bool overlap = false; + + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + { + Assert(spec->strategy == PARTITION_STRATEGY_LIST); + + if (partdesc->nparts > 0) + { + PartitionBoundInfo boundinfo = partdesc->boundinfo; + ListCell *cell; + + Assert(boundinfo && + boundinfo->strategy == PARTITION_STRATEGY_LIST && + (boundinfo->ndatums > 0 || + partition_bound_accepts_nulls(boundinfo))); + + foreach(cell, spec->listdatums) + { + Const *val = castNode(Const, lfirst(cell)); + + if (!val->constisnull) + { + int offset; + bool equal; + + offset = partition_bound_bsearch(key, boundinfo, + &val->constvalue, + true, &equal); + if (offset >= 0 && equal) + { + overlap = true; + with = boundinfo->indexes[offset]; + break; + } + } + else if (partition_bound_accepts_nulls(boundinfo)) + { + overlap = true; + with = boundinfo->null_index; + break; + } + } + } + + break; + } + + case PARTITION_STRATEGY_RANGE: + { + PartitionRangeBound *lower, + *upper; + + Assert(spec->strategy == PARTITION_STRATEGY_RANGE); + lower = make_one_range_bound(key, -1, spec->lowerdatums, true); + upper = make_one_range_bound(key, -1, spec->upperdatums, false); + + /* + * First check if the resulting range would be empty with the + * specified lower and upper bounds + */ + if (partition_rbound_cmp(key, lower->datums, lower->content, true, + upper) >= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("cannot create range partition with empty range"), + parser_errposition(pstate, spec->location))); + + if (partdesc->nparts > 0) + { + PartitionBoundInfo boundinfo = partdesc->boundinfo; + int off1, + off2; + bool equal = false; + + Assert(boundinfo && boundinfo->ndatums > 0 && + boundinfo->strategy == PARTITION_STRATEGY_RANGE); + + /* + * Firstly, find the greatest range bound that is less + * than or equal to the new lower bound. + */ + off1 = partition_bound_bsearch(key, boundinfo, lower, true, + &equal); + + /* + * off1 == -1 means that all existing bounds are greater + * than the new lower bound. In that case and the case + * where no partition is defined between the bounds at + * off1 and off1 + 1, we have a "gap" in the range that + * could be occupied by the new partition. We confirm if + * so by checking whether the new upper bound is confined + * within the gap. + */ + if (!equal && boundinfo->indexes[off1 + 1] < 0) + { + off2 = partition_bound_bsearch(key, boundinfo, upper, + true, &equal); + + /* + * If the new upper bound is found to be equal to + * the bound at off2, the latter must be the upper + * bound of some partition with which the new + * partition clearly overlaps. + * + * Also, if the bound at off2 is not the same as the + * one returned for the new lower bound (IOW, off1 != + * off2), then the new partition overlaps at least one + * partition. + */ + if (equal || off1 != off2) + { + overlap = true; + + /* + * The bound at off2 could be the lower bound of + * the partition with which the new partition + * overlaps. In that case, use the upper bound + * (that is, the bound at off2 + 1) to get the + * index of that partition.
+ */ + if (boundinfo->indexes[off2] < 0) + with = boundinfo->indexes[off2 + 1]; + else + with = boundinfo->indexes[off2]; + } + } + else + { + /* + * Either equal has been set to true, or there is no + * "gap" between the bound at off1 and that at off1 + * + 1; either way, the new partition will overlap + * some partition. In the former case, the new lower + * bound is found to be equal to the bound at off1, + * which could only ever be true if the latter is the + * lower bound of some partition. It's clear in such + * a case that the new partition overlaps that + * partition, whose index we get using its upper bound + * (that is, using the bound at off1 + 1). + */ + overlap = true; + with = boundinfo->indexes[off1 + 1]; + } + } + + break; + } + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + if (overlap) + { + Assert(with >= 0); + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("partition \"%s\" would overlap partition \"%s\"", + relname, get_rel_name(partdesc->oids[with])), + parser_errposition(pstate, spec->location))); + } +} + +/* + * get_partition_parent + * + * Returns inheritance parent of a partition by scanning pg_inherits + * + * Note: Because this function assumes that the relation whose OID is passed + * as an argument will have precisely one parent, it should only be called + * when it is known that the relation is a partition. + */ +Oid +get_partition_parent(Oid relid) +{ + Form_pg_inherits form; + Relation catalogRelation; + SysScanDesc scan; + ScanKeyData key[2]; + HeapTuple tuple; + Oid result; + + catalogRelation = heap_open(InheritsRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_inherits_inhrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + ScanKeyInit(&key[1], + Anum_pg_inherits_inhseqno, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(1)); + + scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true, + NULL, 2, key); + + tuple = systable_getnext(scan); + Assert(HeapTupleIsValid(tuple)); + + form = (Form_pg_inherits) GETSTRUCT(tuple); + result = form->inhparent; + + systable_endscan(scan); + heap_close(catalogRelation, AccessShareLock); + + return result; +} + +/* + * get_qual_from_partbound + * Given a parser node for a partition bound, return the list of executable + * expressions forming the partition constraint + */ +List * +get_qual_from_partbound(Relation rel, Relation parent, + PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + List *my_qual = NIL; + + Assert(key != NULL); + + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + Assert(spec->strategy == PARTITION_STRATEGY_LIST); + my_qual = get_qual_for_list(key, spec); + break; + + case PARTITION_STRATEGY_RANGE: + Assert(spec->strategy == PARTITION_STRATEGY_RANGE); + my_qual = get_qual_for_range(key, spec); + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + return my_qual; +} + +/* + * map_partition_varattnos - maps the varattno of any Vars in expr from the + * parent attno to the partition attno. + * + * We must allow for cases where physical attnos of a partition can be + * different from the parent's. + * + * Note: this will work on any node tree, so really the argument and result + * should be declared "Node *". But a substantial majority of the callers + * are working on Lists, so it's less messy to do the casts internally.
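+ *
+ * Hypothetical example: if column b is attno 2 in the parent but attno 3
+ * in the partition (say, because the partition carries an extra dropped
+ * column), a Var with varattno 2 in a parent-based expression is
+ * rewritten here to varattno 3; the mapping is built by matching column
+ * names across the two tuple descriptors.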
+ */ +List * +map_partition_varattnos(List *expr, int target_varno, + Relation partrel, Relation parent) +{ + AttrNumber *part_attnos; + bool found_whole_row; + + if (expr == NIL) + return NIL; + + part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), + RelationGetDescr(parent), + gettext_noop("could not convert row type")); + expr = (List *) map_variable_attnos((Node *) expr, + target_varno, 0, + part_attnos, + RelationGetDescr(parent)->natts, + &found_whole_row); + /* There can never be a whole-row reference here */ + if (found_whole_row) + elog(ERROR, "unexpected whole-row reference found in partition key"); + + return expr; +} + +/* + * RelationGetPartitionQual + * + * Returns a list of partition quals + */ +List * +RelationGetPartitionQual(Relation rel) +{ + /* Quick exit */ + if (!rel->rd_rel->relispartition) + return NIL; + + return generate_partition_qual(rel); +} + +/* + * get_partition_qual_relid + * + * Returns an expression tree describing the passed-in relation's partition + * constraint. + */ +Expr * +get_partition_qual_relid(Oid relid) +{ + Relation rel = heap_open(relid, AccessShareLock); + Expr *result = NULL; + List *and_args; + + /* Do the work only if this relation is a partition. */ + if (rel->rd_rel->relispartition) + { + and_args = generate_partition_qual(rel); + if (list_length(and_args) > 1) + result = makeBoolExpr(AND_EXPR, and_args, -1); + else + result = linitial(and_args); + } + + /* Keep the lock. */ + heap_close(rel, NoLock); + + return result; +} + +/* + * Append OIDs of rel's partitions to the list 'partoids' and for each OID, + * append pointer rel to the list 'parents'. + */ +#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \ + do\ + {\ + int i;\ + for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\ + {\ + (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\ + (parents) = lappend((parents), (rel));\ + }\ + } while(0) + +/* + * RelationGetPartitionDispatchInfo + * Returns information necessary to route tuples down a partition tree + * + * All the partitions will be locked with lockmode, unless it is NoLock. + * A list of the OIDs of all the leaf partitions of rel is returned in + * *leaf_part_oids. + */ +PartitionDispatch * +RelationGetPartitionDispatchInfo(Relation rel, int lockmode, + int *num_parted, List **leaf_part_oids) +{ + PartitionDispatchData **pd; + List *all_parts = NIL, + *all_parents = NIL, + *parted_rels, + *parted_rel_parents; + ListCell *lc1, + *lc2; + int i, + k, + offset; + + /* + * Lock partitions and make a list of the partitioned ones to prepare + * their PartitionDispatch objects below. + * + * Cannot use find_all_inheritors() here, because then the order of OIDs + * in parted_rels list would be unknown, which does not help, because we + * assign indexes within individual PartitionDispatch in an order that is + * predetermined (determined by the order of OIDs in individual partition + * descriptors). + */ + *num_parted = 1; + parted_rels = list_make1(rel); + /* Root partitioned table has no parent, so NULL for parent */ + parted_rel_parents = list_make1(NULL); + APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); + forboth(lc1, all_parts, lc2, all_parents) + { + Relation partrel = heap_open(lfirst_oid(lc1), lockmode); + Relation parent = lfirst(lc2); + PartitionDesc partdesc = RelationGetPartitionDesc(partrel); + + /* + * If this partition is a partitioned table, add its children to the + * end of the list, so that they are processed as well. 
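+ *
+ * For a hypothetical tree root -> (p1, p2), where p2 is itself
+ * partitioned into (p21, p22), the walk thus visits p1, p2, p21 and
+ * p22 in that order, leaving parted_rels as {root, p2} and, further
+ * below, *leaf_part_oids as {p1, p21, p22}.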
+ */ + if (partdesc) + { + (*num_parted)++; + parted_rels = lappend(parted_rels, partrel); + parted_rel_parents = lappend(parted_rel_parents, parent); + APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); + } + else + heap_close(partrel, NoLock); + + /* + * We keep the partitioned ones open until we're done using the + * information being collected here (for example, see + * ExecEndModifyTable). + */ + } + + /* + * We want to create two arrays - one for leaf partitions and another for + * partitioned tables (including the root table and internal partitions). + * While we only create the latter here, leaf partition array of suitable + * objects (such as, ResultRelInfo) is created by the caller using the + * list of OIDs we return. Indexes into these arrays get assigned in a + * breadth-first manner, whereby partitions of any given level are placed + * consecutively in the respective arrays. + */ + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + *leaf_part_oids = NIL; + i = k = offset = 0; + forboth(lc1, parted_rels, lc2, parted_rel_parents) + { + Relation partrel = lfirst(lc1); + Relation parent = lfirst(lc2); + PartitionKey partkey = RelationGetPartitionKey(partrel); + TupleDesc tupdesc = RelationGetDescr(partrel); + PartitionDesc partdesc = RelationGetPartitionDesc(partrel); + int j, + m; + + pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + pd[i]->reldesc = partrel; + pd[i]->key = partkey; + pd[i]->keystate = NIL; + pd[i]->partdesc = partdesc; + if (parent != NULL) + { + /* + * For every partitioned table other than root, we must store a + * tuple table slot initialized with its tuple descriptor and a + * tuple conversion map to convert a tuple from its parent's + * rowtype to its own. That is to make sure that we are looking at + * the correct row using the correct tuple descriptor when + * computing its partition key for tuple routing. + */ + pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd[i]->tupslot = NULL; + pd[i]->tupmap = NULL; + } + pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + + /* + * Indexes corresponding to the internal partitions are multiplied by + * -1 to distinguish them from those of leaf partitions. Encountering + * an index >= 0 means we found a leaf partition, which is immediately + * returned as the partition we are looking for. A negative index + * means we found a partitioned table, whose PartitionDispatch object + * is located at the above index multiplied back by -1. Using the + * PartitionDispatch object, search is continued further down the + * partition tree. + */ + m = 0; + for (j = 0; j < partdesc->nparts; j++) + { + Oid partrelid = partdesc->oids[j]; + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) + { + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd[i]->indexes[j] = k++; + } + else + { + /* + * offset denotes the number of partitioned tables of upper + * levels including those of the current level. Any partition + * of this table must belong to the next level and hence will + * be placed after the last partitioned table of this level. + */ + pd[i]->indexes[j] = -(1 + offset + m); + m++; + } + } + i++; + + /* + * This counts the number of partitioned tables at upper levels + * including those of the current level. 
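+ *
+ * In the hypothetical tree above, root's indexes array comes out as
+ * {0, -1}: p1 is leaf number 0, while a tuple routed to p2 must be
+ * dispatched onward through pd[1].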
+ */ + offset += m; + } + + return pd; +} + +/* Module-local functions */ + +/* + * get_partition_operator + * + * Return the oid of the operator of the given strategy for a given partition + * key column. + */ +static Oid +get_partition_operator(PartitionKey key, int col, StrategyNumber strategy, + bool *need_relabel) +{ + Oid operoid; + + /* + * First check if there exists an operator of the given strategy, with + * this column's type as both its lefttype and righttype, in the + * partitioning operator family specified for the column. + */ + operoid = get_opfamily_member(key->partopfamily[col], + key->parttypid[col], + key->parttypid[col], + strategy); + + /* + * If one doesn't exist, we must resort to using an operator in the same + * operator family but with the operator class's declared input type. It + * is OK to do so, because the column's type is known to be + * binary-coercible with the operator class input type (otherwise, the + * operator class in question would not have been accepted as the + * partitioning operator class). We must however inform the caller to + * wrap the non-Const expression with a RelabelType node to denote the + * implicit coercion. It ensures that the resulting expression + * structurally matches similarly processed expressions within the + * optimizer. + */ + if (!OidIsValid(operoid)) + { + operoid = get_opfamily_member(key->partopfamily[col], + key->partopcintype[col], + key->partopcintype[col], + strategy); + *need_relabel = true; + } + else + *need_relabel = false; + + if (!OidIsValid(operoid)) + elog(ERROR, "could not find operator for partitioning"); + + return operoid; +} + +/* + * make_partition_op_expr + * Returns an Expr for the given partition key column with arg1 and + * arg2 as its leftop and rightop, respectively + */ +static Expr * +make_partition_op_expr(PartitionKey key, int keynum, + uint16 strategy, Expr *arg1, Expr *arg2) +{ + Oid operoid; + bool need_relabel = false; + Expr *result = NULL; + + /* Get the correct btree operator for this partitioning column */ + operoid = get_partition_operator(key, keynum, strategy, &need_relabel); + + /* + * The chosen operator may be such that the non-Const operand needs to be + * coerced, so apply the same; see the comment in + * get_partition_operator(). + */ + if (!IsA(arg1, Const) && + (need_relabel || + key->partcollation[keynum] != key->parttypcoll[keynum])) + arg1 = (Expr *) makeRelabelType(arg1, + key->partopcintype[keynum], + -1, + key->partcollation[keynum], + COERCE_EXPLICIT_CAST); + + /* Generate the actual expression */ + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + { + ScalarArrayOpExpr *saopexpr; + + /* Build leftop = ANY (rightop) */ + saopexpr = makeNode(ScalarArrayOpExpr); + saopexpr->opno = operoid; + saopexpr->opfuncid = get_opcode(operoid); + saopexpr->useOr = true; + saopexpr->inputcollid = key->partcollation[keynum]; + saopexpr->args = list_make2(arg1, arg2); + saopexpr->location = -1; + + result = (Expr *) saopexpr; + break; + } + + case PARTITION_STRATEGY_RANGE: + result = make_opclause(operoid, + BOOLOID, + false, + arg1, arg2, + InvalidOid, + key->partcollation[keynum]); + break; + + default: + elog(ERROR, "invalid partitioning strategy"); + break; + } + + return result; +} + +/* + * get_qual_for_list + * + * Returns an implicit-AND list of expressions to use as a list partition's + * constraint, given the partition key and bound structures.
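+ *
+ * For a hypothetical single partition column a, FOR VALUES IN (1, 2)
+ * produces the list {a IS NOT NULL, a = ANY (ARRAY[1, 2])}, whereas
+ * FOR VALUES IN (1, NULL) produces {(a IS NULL OR a = ANY (ARRAY[1]))}.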
+ */ +static List * +get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) +{ + List *result; + Expr *keyCol; + ArrayExpr *arr; + Expr *opexpr; + NullTest *nulltest; + ListCell *cell; + List *arrelems = NIL; + bool list_has_null = false; + + /* Construct Var or expression representing the partition column */ + if (key->partattrs[0] != 0) + keyCol = (Expr *) makeVar(1, + key->partattrs[0], + key->parttypid[0], + key->parttypmod[0], + key->parttypcoll[0], + 0); + else + keyCol = (Expr *) copyObject(linitial(key->partexprs)); + + /* Create list of Consts for the allowed values, excluding any nulls */ + foreach(cell, spec->listdatums) + { + Const *val = castNode(Const, lfirst(cell)); + + if (val->constisnull) + list_has_null = true; + else + arrelems = lappend(arrelems, copyObject(val)); + } + + /* Construct an ArrayExpr for the non-null partition values */ + arr = makeNode(ArrayExpr); + arr->array_typeid = !type_is_array(key->parttypid[0]) + ? get_array_type(key->parttypid[0]) + : key->parttypid[0]; + arr->array_collid = key->parttypcoll[0]; + arr->element_typeid = key->parttypid[0]; + arr->elements = arrelems; + arr->multidims = false; + arr->location = -1; + + /* Generate the main expression, i.e., keyCol = ANY (arr) */ + opexpr = make_partition_op_expr(key, 0, BTEqualStrategyNumber, + keyCol, (Expr *) arr); + + if (!list_has_null) + { + /* + * Gin up a "col IS NOT NULL" test that will be AND'd with the main + * expression. This might seem redundant, but the partition routing + * machinery needs it. + */ + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + + result = list_make2(nulltest, opexpr); + } + else + { + /* + * Gin up a "col IS NULL" test that will be OR'd with the main + * expression. + */ + Expr *or; + + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + + or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1); + result = list_make1(or); + } + + return result; +} + +/* + * get_range_key_properties + * Returns range partition key information for a given column + * + * This is a subroutine for get_qual_for_range, and its API is pretty + * specialized to that caller. + * + * Constructs an Expr for the key column (returned in *keyCol) and Consts + * for the lower and upper range limits (returned in *lower_val and + * *upper_val). For UNBOUNDED limits, NULL is returned instead of a Const. + * All of these structures are freshly palloc'd. + * + * *partexprs_item points to the cell containing the next expression in + * the key->partexprs list, or NULL. It may be advanced upon return. 
+ */ +static void +get_range_key_properties(PartitionKey key, int keynum, + PartitionRangeDatum *ldatum, + PartitionRangeDatum *udatum, + ListCell **partexprs_item, + Expr **keyCol, + Const **lower_val, Const **upper_val) +{ + /* Get partition key expression for this column */ + if (key->partattrs[keynum] != 0) + { + *keyCol = (Expr *) makeVar(1, + key->partattrs[keynum], + key->parttypid[keynum], + key->parttypmod[keynum], + key->parttypcoll[keynum], + 0); + } + else + { + if (*partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + *keyCol = copyObject(lfirst(*partexprs_item)); + *partexprs_item = lnext(*partexprs_item); + } + + /* Get appropriate Const nodes for the bounds */ + if (!ldatum->infinite) + *lower_val = castNode(Const, copyObject(ldatum->value)); + else + *lower_val = NULL; + + if (!udatum->infinite) + *upper_val = castNode(Const, copyObject(udatum->value)); + else + *upper_val = NULL; +} + +/* + * get_qual_for_range + * + * Returns an implicit-AND list of expressions to use as a range partition's + * constraint, given the partition key and bound structures. + * + * For a multi-column range partition key, say (a, b, c), with (al, bl, cl) + * as the lower bound tuple and (au, bu, cu) as the upper bound tuple, we + * generate an expression tree of the following form: + * + * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL) + * AND + * (a > al OR (a = al AND b > bl) OR (a = al AND b = bl AND c >= cl)) + * AND + * (a < au OR (a = au AND b < bu) OR (a = au AND b = bu AND c < cu)) + * + * It is often the case that a prefix of lower and upper bound tuples contains + * the same values, for example, (al = au), in which case, we will emit an + * expression tree of the following form: + * + * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL) + * AND + * (a = al) + * AND + * (b > bl OR (b = bl AND c >= cl)) + * AND + * (b < bu OR (b = bu AND c < cu)) + * + * If cu happens to be UNBOUNDED, we need not emit any expression for it, so + * the last line would be: + * + * (b < bu OR b = bu), which is simplified to (b <= bu) + * + * In most common cases with only one partition column, say a, the following + * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au + * + * If all values of both lower and upper bounds are UNBOUNDED, the partition + * does not really have a constraint, except the IS NOT NULL constraint for + * partition keys. + * + * If we end up with an empty result list, we return a single-member list + * containing a constant TRUE, because callers expect a non-empty list. + */ +static List * +get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) +{ + List *result = NIL; + ListCell *cell1, + *cell2, + *partexprs_item, + *partexprs_item_saved; + int i, + j; + PartitionRangeDatum *ldatum, + *udatum; + Expr *keyCol; + Const *lower_val, + *upper_val; + NullTest *nulltest; + List *lower_or_arms, + *upper_or_arms; + int num_or_arms, + current_or_arm; + ListCell *lower_or_start_datum, + *upper_or_start_datum; + bool need_next_lower_arm, + need_next_upper_arm; + + lower_or_start_datum = list_head(spec->lowerdatums); + upper_or_start_datum = list_head(spec->upperdatums); + num_or_arms = key->partnatts; + + /* + * A range-partitioned table does not currently allow partition keys to be + * null, so emit an IS NOT NULL expression for each key column.
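+ *
+ * (With a hypothetical two-column key (a, b), this step alone therefore
+ * contributes a IS NOT NULL and b IS NOT NULL.)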
+ */ + partexprs_item = list_head(key->partexprs); + for (i = 0; i < key->partnatts; i++) + { + Expr *keyCol; + + if (key->partattrs[i] != 0) + { + keyCol = (Expr *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + keyCol = copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(partexprs_item); + } + + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + result = lappend(result, nulltest); + } + + /* + * Iterate over the key columns and check if the corresponding lower and + * upper datums are equal using the btree equality operator for the + * column's type. If equal, we emit a single keyCol = common_value + * expression. Starting from the first column for which the corresponding + * lower and upper bound datums are not equal, we generate OR expressions + * as shown in the function's header comment. + */ + i = 0; + partexprs_item = list_head(key->partexprs); + partexprs_item_saved = partexprs_item; /* placate compiler */ + forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums) + { + EState *estate; + MemoryContext oldcxt; + Expr *test_expr; + ExprState *test_exprstate; + Datum test_result; + bool isNull; + + ldatum = castNode(PartitionRangeDatum, lfirst(cell1)); + udatum = castNode(PartitionRangeDatum, lfirst(cell2)); + + /* + * Since get_range_key_properties() modifies partexprs_item, and we + * might need to start over from the previous expression in the later + * part of this function, save away the current value. + */ + partexprs_item_saved = partexprs_item; + + get_range_key_properties(key, i, ldatum, udatum, + &partexprs_item, + &keyCol, + &lower_val, &upper_val); + + /* + * If either or both of lower_val and upper_val is NULL, they are + * unequal, because being NULL means the column is unbounded in the + * respective direction. + */ + if (!lower_val || !upper_val) + break; + + /* Create the test expression */ + estate = CreateExecutorState(); + oldcxt = MemoryContextSwitchTo(estate->es_query_cxt); + test_expr = make_partition_op_expr(key, i, BTEqualStrategyNumber, + (Expr *) lower_val, + (Expr *) upper_val); + fix_opfuncids((Node *) test_expr); + test_exprstate = ExecInitExpr(test_expr, NULL); + test_result = ExecEvalExprSwitchContext(test_exprstate, + GetPerTupleExprContext(estate), + &isNull); + MemoryContextSwitchTo(oldcxt); + FreeExecutorState(estate); + + /* If not equal, go generate the OR expressions */ + if (!DatumGetBool(test_result)) + break; + + /* + * The bounds for the last key column can't be equal, because such a + * range partition would never be allowed to be defined (its range + * would be empty). + */ + if (i == key->partnatts - 1) + elog(ERROR, "invalid range bound specification"); + + /* Equal, so generate keyCol = lower_val expression */ + result = lappend(result, + make_partition_op_expr(key, i, BTEqualStrategyNumber, + keyCol, (Expr *) lower_val)); + + i++; + } + + /* First pair of lower_val and upper_val that are not equal. */ + lower_or_start_datum = cell1; + upper_or_start_datum = cell2; + + /* The OR will have as many arms as there are key columns left.
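+ * With three columns left, for example, the first arm compares only the
+ * first of them, the second arm the first two, and the third arm all
+ * three, which yields the (a > al OR (a = al AND b > bl) OR ...) shape
+ * shown in the header comment.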
*/ + num_or_arms = key->partnatts - i; + current_or_arm = 0; + lower_or_arms = upper_or_arms = NIL; + need_next_lower_arm = need_next_upper_arm = true; + while (current_or_arm < num_or_arms) + { + List *lower_or_arm_args = NIL, + *upper_or_arm_args = NIL; + + /* Restart scan of columns from the i'th one */ + j = i; + partexprs_item = partexprs_item_saved; + + for_both_cell(cell1, lower_or_start_datum, cell2, upper_or_start_datum) + { + PartitionRangeDatum *ldatum_next = NULL, + *udatum_next = NULL; + + ldatum = castNode(PartitionRangeDatum, lfirst(cell1)); + if (lnext(cell1)) + ldatum_next = castNode(PartitionRangeDatum, + lfirst(lnext(cell1))); + udatum = castNode(PartitionRangeDatum, lfirst(cell2)); + if (lnext(cell2)) + udatum_next = castNode(PartitionRangeDatum, + lfirst(lnext(cell2))); + get_range_key_properties(key, j, ldatum, udatum, + &partexprs_item, + &keyCol, + &lower_val, &upper_val); + + if (need_next_lower_arm && lower_val) + { + uint16 strategy; + + /* + * For the non-last columns of this arm, use the EQ operator. + * For the arm's last column, use GE if this is the last key + * column or the next lower-bound datum is unbounded; + * otherwise use GT. + */ + if (j - i < current_or_arm) + strategy = BTEqualStrategyNumber; + else if ((ldatum_next && ldatum_next->infinite) || + j == key->partnatts - 1) + strategy = BTGreaterEqualStrategyNumber; + else + strategy = BTGreaterStrategyNumber; + + lower_or_arm_args = lappend(lower_or_arm_args, + make_partition_op_expr(key, j, + strategy, + keyCol, + (Expr *) lower_val)); + } + + if (need_next_upper_arm && upper_val) + { + uint16 strategy; + + /* + * For the non-last columns of this arm, use the EQ operator. + * For the arm's last column, use LE if the next upper-bound + * datum is unbounded; otherwise use LT. + */ + if (j - i < current_or_arm) + strategy = BTEqualStrategyNumber; + else if (udatum_next && udatum_next->infinite) + strategy = BTLessEqualStrategyNumber; + else + strategy = BTLessStrategyNumber; + + upper_or_arm_args = lappend(upper_or_arm_args, + make_partition_op_expr(key, j, + strategy, + keyCol, + (Expr *) upper_val)); + } + + /* + * Have we generated all of this arm's arguments? The first arm + * considers the first of the remaining columns, the second arm + * considers the first two of the remaining columns, and so on. + */ + ++j; + if (j - i > current_or_arm) + { + /* + * We need not emit the next arm if the new column that will + * be considered is unbounded. + */ + need_next_lower_arm = ldatum_next && !ldatum_next->infinite; + need_next_upper_arm = udatum_next && !udatum_next->infinite; + break; + } + } + + if (lower_or_arm_args != NIL) + lower_or_arms = lappend(lower_or_arms, + list_length(lower_or_arm_args) > 1 + ? makeBoolExpr(AND_EXPR, lower_or_arm_args, -1) + : linitial(lower_or_arm_args)); + + if (upper_or_arm_args != NIL) + upper_or_arms = lappend(upper_or_arms, + list_length(upper_or_arm_args) > 1 + ? makeBoolExpr(AND_EXPR, upper_or_arm_args, -1) + : linitial(upper_or_arm_args)); + + /* If there is no work to do in the next iteration, break away. */ + if (!need_next_lower_arm && !need_next_upper_arm) + break; + + ++current_or_arm; + } + + /* + * Generate the OR expressions for each of the lower and upper bounds (if + * required), and append them to the implicitly ANDed list of + * expressions. + */ + if (lower_or_arms != NIL) + result = lappend(result, + list_length(lower_or_arms) > 1 + ? makeBoolExpr(OR_EXPR, lower_or_arms, -1) + : linitial(lower_or_arms)); + if (upper_or_arms != NIL) + result = lappend(result, + list_length(upper_or_arms) > 1 + ?
makeBoolExpr(OR_EXPR, upper_or_arms, -1) + : linitial(upper_or_arms)); + + /* As noted above, caller expects the list to be non-empty. */ + if (result == NIL) + result = list_make1(makeBoolConst(true, false)); + + return result; +} + +/* + * generate_partition_qual + * + * Generate partition predicate from rel's partition bound expression + * + * The result expression tree is stored in CacheMemoryContext to ensure it + * survives as long as the relcache entry. But we should be running in a less + * long-lived working context. To avoid leaking cache memory if this routine + * fails partway through, we build it in working memory and then copy the + * completed structure into cache memory. + */ +static List * +generate_partition_qual(Relation rel) +{ + HeapTuple tuple; + MemoryContext oldcxt; + Datum boundDatum; + bool isnull; + PartitionBoundSpec *bound; + List *my_qual = NIL, + *result = NIL; + Relation parent; + + /* Guard against stack overflow due to overly deep partition tree */ + check_stack_depth(); + + /* Quick copy */ + if (rel->rd_partcheck != NIL) + return copyObject(rel->rd_partcheck); + + /* Grab at least an AccessShareLock on the parent table */ + parent = heap_open(get_partition_parent(RelationGetRelid(rel)), + AccessShareLock); + + /* Get pg_class.relpartbound */ + tuple = SearchSysCache1(RELOID, RelationGetRelid(rel)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", + RelationGetRelid(rel)); + + boundDatum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + if (isnull) /* should not happen */ + elog(ERROR, "relation \"%s\" has relpartbound = null", + RelationGetRelationName(rel)); + bound = castNode(PartitionBoundSpec, + stringToNode(TextDatumGetCString(boundDatum))); + ReleaseSysCache(tuple); + + my_qual = get_qual_from_partbound(rel, parent, bound); + + /* Add the parent's quals to the list (if any) */ + if (parent->rd_rel->relispartition) + result = list_concat(generate_partition_qual(parent), my_qual); + else + result = my_qual; + + /* + * Change Vars to have partition's attnos instead of the parent's. We do + * this after we concatenate the parent's quals, because we want every Var + * in the result to bear this relation's attnos. It's safe to assume + * varno = 1 here. + */ + result = map_partition_varattnos(result, 1, rel, parent); + + /* Save a copy in the relcache */ + oldcxt = MemoryContextSwitchTo(CacheMemoryContext); + rel->rd_partcheck = copyObject(result); + MemoryContextSwitchTo(oldcxt); + + /* Keep the parent locked until commit */ + heap_close(parent, NoLock); + + return result; +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * The ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in.
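+ *
+ * A typical caller (see get_partition_for_tuple below) therefore does:
+ *
+ * ecxt = GetPerTupleExprContext(estate);
+ * ecxt->ecxt_scantuple = slot;
+ * FormPartitionKeyDatum(pd, slot, estate, values, isnull);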
+ * ---------------- + */ +void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * get_partition_for_tuple + * Finds a leaf partition for tuple contained in *slot + * + * The returned value is the sequence number of the leaf partition thus + * found, or -1 if no leaf partition is found for the tuple. In the latter + * case, *failed_at is set to the PartitionDispatch of the partitioned table + * whose partition was not found, and *failed_slot to the slot holding the + * tuple as it looked at that level. + */ +int +get_partition_for_tuple(PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate, + PartitionDispatchData **failed_at, + TupleTableSlot **failed_slot) +{ + PartitionDispatch parent; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + int cur_offset, + cur_index; + int i, + result; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + + /* start with the root partitioned table */ + parent = pd[0]; + while (true) + { + PartitionKey key = parent->key; + PartitionDesc partdesc = parent->partdesc; + TupleTableSlot *myslot = parent->tupslot; + TupleConversionMap *map = parent->tupmap; + + if (myslot != NULL && map != NULL) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + + ExecClearTuple(myslot); + tuple = do_convert_tuple(tuple, map); + ExecStoreTuple(tuple, myslot, InvalidBuffer, true); + slot = myslot; + } + + /* Quick exit */ + if (partdesc->nparts == 0) + { + *failed_at = parent; + *failed_slot = slot; + result = -1; + goto error_exit; + } + + /* + * Extract the partition key from the tuple. The expression evaluation + * machinery that FormPartitionKeyDatum() invokes expects + * ecxt_scantuple to point to the correct tuple slot. The slot might + * have changed from what was used for the parent table if the table + * at the current partitioning level has a different tuple descriptor + * from the parent's. So update ecxt_scantuple accordingly.
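+ * (The caller's original ecxt_scantuple is saved on entry and restored
+ * before returning; see error_exit below.)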
+ */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(parent, slot, estate, values, isnull); + + if (key->strategy == PARTITION_STRATEGY_RANGE) + { + /* + * Since we cannot route tuples with NULL partition keys through a + * range-partitioned table, simply return that no partition exists. + */ + for (i = 0; i < key->partnatts; i++) + { + if (isnull[i]) + { + *failed_at = parent; + *failed_slot = slot; + result = -1; + goto error_exit; + } + } + } + + /* + * A null partition key is only acceptable if a null-accepting list + * partition exists. + */ + cur_index = -1; + if (isnull[0] && partition_bound_accepts_nulls(partdesc->boundinfo)) + cur_index = partdesc->boundinfo->null_index; + else if (!isnull[0]) + { + /* Else bsearch in partdesc->boundinfo */ + bool equal = false; + + cur_offset = partition_bound_bsearch(key, partdesc->boundinfo, + values, false, &equal); + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + if (cur_offset >= 0 && equal) + cur_index = partdesc->boundinfo->indexes[cur_offset]; + else + cur_index = -1; + break; + + case PARTITION_STRATEGY_RANGE: + + /* + * The offset returned is such that the bound at that + * offset is less than or equal to the tuple. So the bound + * at offset+1 would be the upper bound. + */ + cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + } + + /* + * cur_index < 0 means we failed to find a partition of this parent. + * cur_index >= 0 means we either found the leaf partition, or the + * next parent to find a partition of. + */ + if (cur_index < 0) + { + result = -1; + *failed_at = parent; + *failed_slot = slot; + break; + } + else if (parent->indexes[cur_index] >= 0) + { + result = parent->indexes[cur_index]; + break; + } + else + parent = pd[-parent->indexes[cur_index]]; + } + +error_exit: + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return result; +} + +/* + * qsort_partition_list_value_cmp + * + * Compare two list partition bound datums + */ +static int32 +qsort_partition_list_value_cmp(const void *a, const void *b, void *arg) +{ + Datum val1 = (*(const PartitionListValue **) a)->value, + val2 = (*(const PartitionListValue **) b)->value; + PartitionKey key = (PartitionKey) arg; + + return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], + key->partcollation[0], + val1, val2)); +} + +/* + * make_one_range_bound + * + * Return a PartitionRangeBound given a list of PartitionRangeDatum elements + * and a flag telling whether the bound is lower or not. Made into a function + * because there are multiple sites that want to use this facility. + */ +static PartitionRangeBound * +make_one_range_bound(PartitionKey key, int index, List *datums, bool lower) +{ + PartitionRangeBound *bound; + ListCell *lc; + int i; + + bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound)); + bound->index = index; + bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum)); + bound->content = (RangeDatumContent *) palloc0(key->partnatts * + sizeof(RangeDatumContent)); + bound->lower = lower; + + i = 0; + foreach(lc, datums) + { + PartitionRangeDatum *datum = castNode(PartitionRangeDatum, lfirst(lc)); + + /* What's contained in this range datum? */ + bound->content[i] = !datum->infinite + ? RANGE_DATUM_FINITE + : (lower ?
RANGE_DATUM_NEG_INF + : RANGE_DATUM_POS_INF); + + if (bound->content[i] == RANGE_DATUM_FINITE) + { + Const *val = castNode(Const, datum->value); + + if (val->constisnull) + elog(ERROR, "invalid range bound datum"); + bound->datums[i] = val->constvalue; + } + + i++; + } + + return bound; +} + +/* Used when sorting range bounds across all range partitions */ +static int32 +qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) +{ + PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a); + PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b); + PartitionKey key = (PartitionKey) arg; + + return partition_rbound_cmp(key, b1->datums, b1->content, b1->lower, b2); +} + +/* + * partition_rbound_cmp + * + * Return for two range bounds whether the 1st one (specified in datums1, + * content1, and lower1) is <, =, or > the bound specified in *b2 + */ +static int32 +partition_rbound_cmp(PartitionKey key, + Datum *datums1, RangeDatumContent *content1, bool lower1, + PartitionRangeBound *b2) +{ + int32 cmpval = 0; /* placate compiler */ + int i; + Datum *datums2 = b2->datums; + RangeDatumContent *content2 = b2->content; + bool lower2 = b2->lower; + + for (i = 0; i < key->partnatts; i++) + { + /* + * First, handle cases involving infinity, which don't require + * invoking the comparison proc. + */ + if (content1[i] != RANGE_DATUM_FINITE && + content2[i] != RANGE_DATUM_FINITE) + + /* + * Both are infinity, so they are equal unless one is negative + * infinity and the other positive (or vice versa) + */ + return content1[i] == content2[i] ? 0 + : (content1[i] < content2[i] ? -1 : 1); + else if (content1[i] != RANGE_DATUM_FINITE) + return content1[i] == RANGE_DATUM_NEG_INF ? -1 : 1; + else if (content2[i] != RANGE_DATUM_FINITE) + return content2[i] == RANGE_DATUM_NEG_INF ? 1 : -1; + + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], + key->partcollation[i], + datums1[i], + datums2[i])); + if (cmpval != 0) + break; + } + + /* + * If the comparison is anything other than equal, we're done. If they + * compare equal though, we still have to consider whether the boundaries + * are inclusive or exclusive. The exclusive one is considered the smaller + * of the two. + */ + if (cmpval == 0 && lower1 != lower2) + cmpval = lower1 ? 1 : -1; + + return cmpval; +} + +/* + * partition_rbound_datum_cmp + * + * Return whether the range bound (specified in rb_datums and rb_content) + * is <, =, or > the partition key of the tuple (tuple_datums) + */ +static int32 +partition_rbound_datum_cmp(PartitionKey key, + Datum *rb_datums, RangeDatumContent *rb_content, + Datum *tuple_datums) +{ + int i; + int32 cmpval = -1; + + for (i = 0; i < key->partnatts; i++) + { + if (rb_content[i] != RANGE_DATUM_FINITE) + return rb_content[i] == RANGE_DATUM_NEG_INF ? -1 : 1; + + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], + key->partcollation[i], + rb_datums[i], + tuple_datums[i])); + if (cmpval != 0) + break; + } + + return cmpval; +} + +/* + * partition_bound_cmp + * + * Return whether the bound at offset in boundinfo is <, =, or > the argument + * specified in *probe.
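+ * The result follows the usual btree support function convention:
+ * negative, zero, or positive according to whether the bound sorts
+ * before, the same as, or after *probe.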
+ */ +static int32 +partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo, + int offset, void *probe, bool probe_is_bound) +{ + Datum *bound_datums = boundinfo->datums[offset]; + int32 cmpval = -1; + + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], + key->partcollation[0], + bound_datums[0], + *(Datum *) probe)); + break; + + case PARTITION_STRATEGY_RANGE: + { + RangeDatumContent *content = boundinfo->content[offset]; + + if (probe_is_bound) + { + /* + * We need to pass whether the existing bound is a lower + * bound, so that two equal-valued lower and upper bounds + * are not regarded as equal. + */ + bool lower = boundinfo->indexes[offset] < 0; + + cmpval = partition_rbound_cmp(key, + bound_datums, content, lower, + (PartitionRangeBound *) probe); + } + else + cmpval = partition_rbound_datum_cmp(key, + bound_datums, content, + (Datum *) probe); + break; + } + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + return cmpval; +} + +/* + * Binary search on a collection of partition bounds. Returns the greatest + * bound in the array boundinfo->datums that is less than or equal to *probe. + * If all bounds in the array are greater than *probe, -1 is returned. + * + * *probe could either be a partition bound or a Datum array representing + * the partition key of a tuple being routed; probe_is_bound tells which. + * We pass that down to the comparison function so that it can interpret the + * contents of *probe accordingly. + * + * *is_equal is set to whether the bound at the returned index is equal to + * *probe. + */ +static int +partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, + void *probe, bool probe_is_bound, bool *is_equal) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval; + + mid = (lo + hi + 1) / 2; + cmpval = partition_bound_cmp(key, boundinfo, mid, probe, + probe_is_bound); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + + if (*is_equal) + break; + } + else + hi = mid - 1; + } + + return lo; +} diff --git a/src/backend/catalog/pg_aggregate.c b/src/backend/catalog/pg_aggregate.c index 959d3845df..65c2e88e93 100644 --- a/src/backend/catalog/pg_aggregate.c +++ b/src/backend/catalog/pg_aggregate.c @@ -3,7 +3,7 @@ * pg_aggregate.c * routines to support manipulation of the pg_aggregate relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -433,7 +433,7 @@ AggregateCreate(const char *aggName, if (aggTransType == INTERNALOID && func_strict(combinefn)) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), - errmsg("combine function with \"%s\" transition type must not be declared STRICT", + errmsg("combine function with transition type %s must not be declared STRICT", format_type_be(aggTransType)))); } @@ -674,9 +674,7 @@ AggregateCreate(const char *aggName, tupDesc = aggdesc->rd_att; tup = heap_form_tuple(tupDesc, values, nulls); - simple_heap_insert(aggdesc, tup); - - CatalogUpdateIndexes(aggdesc, tup); + CatalogTupleInsert(aggdesc, tup); heap_close(aggdesc, RowExclusiveLock); diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index f37cf37c4a..30cd0cba19 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -3,7
+3,7 @@ * pg_collation.c * routines to support manipulation of the pg_collation relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -27,6 +27,7 @@ #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/pg_locale.h" #include "utils/rel.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -40,8 +41,11 @@ Oid CollationCreate(const char *collname, Oid collnamespace, Oid collowner, + char collprovider, int32 collencoding, - const char *collcollate, const char *collctype) + const char *collcollate, const char *collctype, + const char *collversion, + bool if_not_exists) { Relation rel; TupleDesc tupDesc; @@ -72,27 +76,65 @@ CollationCreate(const char *collname, Oid collnamespace, PointerGetDatum(collname), Int32GetDatum(collencoding), ObjectIdGetDatum(collnamespace))) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("collation \"%s\" for encoding \"%s\" already exists", - collname, pg_encoding_to_char(collencoding)))); + { + if (if_not_exists) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_OBJECT), + collencoding == -1 + ? errmsg("collation \"%s\" already exists, skipping", + collname) + : errmsg("collation \"%s\" for encoding \"%s\" already exists, skipping", + collname, pg_encoding_to_char(collencoding)))); + return InvalidOid; + } + else + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + collencoding == -1 + ? errmsg("collation \"%s\" already exists", + collname) + : errmsg("collation \"%s\" for encoding \"%s\" already exists", + collname, pg_encoding_to_char(collencoding)))); + } + + /* open pg_collation; see below about the lock level */ + rel = heap_open(CollationRelationId, ShareRowExclusiveLock); /* - * Also forbid matching an any-encoding entry. This test of course is not - * backed up by the unique index, but it's not a problem since we don't - * support adding any-encoding entries after initdb. + * Also forbid a specific-encoding collation shadowing an any-encoding + * collation, or an any-encoding collation being shadowed (see + * get_collation_name()). This test is not backed up by the unique index, + * so we take a ShareRowExclusiveLock earlier, to protect against + * concurrent changes fooling this check. 
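+ * For example, once an any-encoding collation "foo" exists in a schema,
+ * creating another "foo" there for the current database encoding must
+ * fail, and vice versa, since a lookup of the name could otherwise
+ * find either entry.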
*/ - if (SearchSysCacheExists3(COLLNAMEENCNSP, - PointerGetDatum(collname), - Int32GetDatum(-1), - ObjectIdGetDatum(collnamespace))) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("collation \"%s\" already exists", - collname))); + if ((collencoding == -1 && + SearchSysCacheExists3(COLLNAMEENCNSP, + PointerGetDatum(collname), + Int32GetDatum(GetDatabaseEncoding()), + ObjectIdGetDatum(collnamespace))) || + (collencoding != -1 && + SearchSysCacheExists3(COLLNAMEENCNSP, + PointerGetDatum(collname), + Int32GetDatum(-1), + ObjectIdGetDatum(collnamespace)))) + { + if (if_not_exists) + { + heap_close(rel, NoLock); + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("collation \"%s\" already exists, skipping", + collname))); + return InvalidOid; + } + else + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("collation \"%s\" already exists", + collname))); + } - /* open pg_collation */ - rel = heap_open(CollationRelationId, RowExclusiveLock); tupDesc = RelationGetDescr(rel); /* form a tuple */ @@ -102,21 +144,23 @@ CollationCreate(const char *collname, Oid collnamespace, values[Anum_pg_collation_collname - 1] = NameGetDatum(&name_name); values[Anum_pg_collation_collnamespace - 1] = ObjectIdGetDatum(collnamespace); values[Anum_pg_collation_collowner - 1] = ObjectIdGetDatum(collowner); + values[Anum_pg_collation_collprovider - 1] = CharGetDatum(collprovider); values[Anum_pg_collation_collencoding - 1] = Int32GetDatum(collencoding); namestrcpy(&name_collate, collcollate); values[Anum_pg_collation_collcollate - 1] = NameGetDatum(&name_collate); namestrcpy(&name_ctype, collctype); values[Anum_pg_collation_collctype - 1] = NameGetDatum(&name_ctype); + if (collversion) + values[Anum_pg_collation_collversion - 1] = CStringGetTextDatum(collversion); + else + nulls[Anum_pg_collation_collversion - 1] = true; tup = heap_form_tuple(tupDesc, values, nulls); /* insert a new tuple */ - oid = simple_heap_insert(rel, tup); + oid = CatalogTupleInsert(rel, tup); Assert(OidIsValid(oid)); - /* update the index if any */ - CatalogUpdateIndexes(rel, tup); - /* set up dependencies for the new collation */ myself.classId = CollationRelationId; myself.objectId = oid; @@ -139,7 +183,7 @@ CollationCreate(const char *collname, Oid collnamespace, InvokeObjectPostCreateHook(CollationRelationId, oid, 0); heap_freetuple(tup); - heap_close(rel, RowExclusiveLock); + heap_close(rel, NoLock); return oid; } @@ -171,7 +215,7 @@ RemoveCollationById(Oid collationOid) tuple = systable_getnext(scandesc); if (HeapTupleIsValid(tuple)) - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); else elog(ERROR, "could not find tuple for collation %u", collationOid); diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index 8fabe6899f..e5ae3d9292 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -3,7 +3,7 @@ * pg_constraint.c * routines to support manipulation of the pg_constraint relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -226,10 +226,7 @@ CreateConstraintEntry(const char *constraintName, tup = heap_form_tuple(RelationGetDescr(conDesc), values, nulls); - conOid = simple_heap_insert(conDesc, tup); - - /* update catalog indexes */ - CatalogUpdateIndexes(conDesc, tup); + conOid = CatalogTupleInsert(conDesc, 
tup); conobject.classId = ConstraintRelationId; conobject.objectId = conOid; @@ -368,7 +365,7 @@ CreateConstraintEntry(const char *constraintName, */ recordDependencyOnSingleRelExpr(&conobject, conExpr, relId, DEPENDENCY_NORMAL, - DEPENDENCY_NORMAL); + DEPENDENCY_NORMAL, false); } /* Post creation hook for new constraint */ @@ -584,9 +581,7 @@ RemoveConstraintById(Oid conId) RelationGetRelationName(rel)); classForm->relchecks--; - simple_heap_update(pgrel, &relTup->t_self, relTup); - - CatalogUpdateIndexes(pgrel, relTup); + CatalogTupleUpdate(pgrel, &relTup->t_self, relTup); heap_freetuple(relTup); @@ -609,7 +604,7 @@ RemoveConstraintById(Oid conId) elog(ERROR, "constraint %u is not of a known type", conId); /* Fry the constraint itself */ - simple_heap_delete(conDesc, &tup->t_self); + CatalogTupleDelete(conDesc, &tup->t_self); /* Clean up */ ReleaseSysCache(tup); @@ -666,10 +661,7 @@ RenameConstraintById(Oid conId, const char *newname) /* OK, do the rename --- tuple is a copy, so OK to scribble on it */ namestrcpy(&(con->conname), newname); - simple_heap_update(conDesc, &tuple->t_self, tuple); - - /* update the system catalog indexes */ - CatalogUpdateIndexes(conDesc, tuple); + CatalogTupleUpdate(conDesc, &tuple->t_self, tuple); InvokeObjectPostAlterHook(ConstraintRelationId, conId, 0); @@ -736,8 +728,7 @@ AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, conform->connamespace = newNspId; - simple_heap_update(conRel, &tup->t_self, tup); - CatalogUpdateIndexes(conRel, tup); + CatalogTupleUpdate(conRel, &tup->t_self, tup); /* * Note: currently, the constraint will not have its own @@ -852,8 +843,8 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok) if (OidIsValid(conOid)) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("domain \"%s\" has multiple constraints named \"%s\"", - format_type_be(typid), conname))); + errmsg("domain %s has multiple constraints named \"%s\"", + format_type_be(typid), conname))); conOid = HeapTupleGetOid(tuple); } } @@ -864,7 +855,7 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok) if (!OidIsValid(conOid) && !missing_ok) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("constraint \"%s\" for domain \"%s\" does not exist", + errmsg("constraint \"%s\" for domain %s does not exist", conname, format_type_be(typid)))); heap_close(pg_constraint, AccessShareLock); diff --git a/src/backend/catalog/pg_conversion.c b/src/backend/catalog/pg_conversion.c index e2feb1709c..5746dc349a 100644 --- a/src/backend/catalog/pg_conversion.c +++ b/src/backend/catalog/pg_conversion.c @@ -3,7 +3,7 @@ * pg_conversion.c * routines to support manipulation of the pg_conversion relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -105,10 +105,7 @@ ConversionCreate(const char *conname, Oid connamespace, tup = heap_form_tuple(tupDesc, values, nulls); /* insert a new tuple */ - simple_heap_insert(rel, tup); - - /* update the index if any */ - CatalogUpdateIndexes(rel, tup); + CatalogTupleInsert(rel, tup); myself.classId = ConversionRelationId; myself.objectId = HeapTupleGetOid(tup); @@ -168,7 +165,7 @@ RemoveConversionById(Oid conversionOid) /* search for the target tuple */ if (HeapTupleIsValid(tuple = heap_getnext(scan, ForwardScanDirection))) - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); else 
elog(ERROR, "could not find tuple for conversion %u", conversionOid); heap_endscan(scan); diff --git a/src/backend/catalog/pg_db_role_setting.c b/src/backend/catalog/pg_db_role_setting.c index 9414ede961..323471bc83 100644 --- a/src/backend/catalog/pg_db_role_setting.c +++ b/src/backend/catalog/pg_db_role_setting.c @@ -2,7 +2,7 @@ * pg_db_role_setting.c * Routines to support manipulation of the pg_db_role_setting relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -88,13 +88,10 @@ AlterSetting(Oid databaseid, Oid roleid, VariableSetStmt *setstmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(rel), repl_val, repl_null, repl_repl); - simple_heap_update(rel, &tuple->t_self, newtuple); - - /* Update indexes */ - CatalogUpdateIndexes(rel, newtuple); + CatalogTupleUpdate(rel, &tuple->t_self, newtuple); } else - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); } } else if (HeapTupleIsValid(tuple)) @@ -129,13 +126,10 @@ AlterSetting(Oid databaseid, Oid roleid, VariableSetStmt *setstmt) newtuple = heap_modify_tuple(tuple, RelationGetDescr(rel), repl_val, repl_null, repl_repl); - simple_heap_update(rel, &tuple->t_self, newtuple); - - /* Update indexes */ - CatalogUpdateIndexes(rel, newtuple); + CatalogTupleUpdate(rel, &tuple->t_self, newtuple); } else - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); } else if (valuestr) { @@ -155,10 +149,7 @@ AlterSetting(Oid databaseid, Oid roleid, VariableSetStmt *setstmt) values[Anum_pg_db_role_setting_setconfig - 1] = PointerGetDatum(a); newtuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); - simple_heap_insert(rel, newtuple); - - /* Update indexes */ - CatalogUpdateIndexes(rel, newtuple); + CatalogTupleInsert(rel, newtuple); } InvokeObjectPostAlterHookArg(DbRoleSettingRelationId, @@ -208,7 +199,7 @@ DropSetting(Oid databaseid, Oid roleid) scan = heap_beginscan_catalog(relsetting, numkeys, keys); while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) { - simple_heap_delete(relsetting, &tup->t_self); + CatalogTupleDelete(relsetting, &tup->t_self); } heap_endscan(scan); diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index 7a0713e6cc..d616df62c1 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -3,7 +3,7 @@ * pg_depend.c * routines to support manipulation of the pg_depend relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -107,13 +107,11 @@ recordMultipleDependencies(const ObjectAddress *depender, tup = heap_form_tuple(dependDesc->rd_att, values, nulls); - simple_heap_insert(dependDesc, tup); - - /* keep indexes current */ + /* fetch index info only when we know we need it */ if (indstate == NULL) indstate = CatalogOpenIndexes(dependDesc); - CatalogIndexInsert(indstate, tup); + CatalogTupleInsertWithInfo(dependDesc, tup, indstate); heap_freetuple(tup); } @@ -219,7 +217,7 @@ deleteDependencyRecordsFor(Oid classId, Oid objectId, ((Form_pg_depend) GETSTRUCT(tup))->deptype == DEPENDENCY_EXTENSION) continue; - simple_heap_delete(depRel, &tup->t_self); + CatalogTupleDelete(depRel, &tup->t_self); count++; } @@ -269,7 
+267,7 @@ deleteDependencyRecordsForClass(Oid classId, Oid objectId, if (depform->refclassid == refclassId && depform->deptype == deptype) { - simple_heap_delete(depRel, &tup->t_self); + CatalogTupleDelete(depRel, &tup->t_self); count++; } } @@ -353,7 +351,7 @@ changeDependencyFor(Oid classId, Oid objectId, depform->refobjid == oldRefObjectId) { if (newIsPinned) - simple_heap_delete(depRel, &tup->t_self); + CatalogTupleDelete(depRel, &tup->t_self); else { /* make a modifiable copy */ @@ -362,8 +360,7 @@ changeDependencyFor(Oid classId, Oid objectId, depform->refobjid = newRefObjectId; - simple_heap_update(depRel, &tup->t_self, tup); - CatalogUpdateIndexes(depRel, tup); + CatalogTupleUpdate(depRel, &tup->t_self, tup); heap_freetuple(tup); } @@ -491,7 +488,7 @@ getExtensionOfObject(Oid classId, Oid objectId) /* * Detect whether a sequence is marked as "owned" by a column * - * An ownership marker is an AUTO dependency from the sequence to the + * An ownership marker is an AUTO or INTERNAL dependency from the sequence to the * column. If we find one, store the identity of the owning column * into *tableId and *colId and return TRUE; else return FALSE. * @@ -500,7 +497,7 @@ getExtensionOfObject(Oid classId, Oid objectId) * not happen, though. */ bool -sequenceIsOwned(Oid seqId, Oid *tableId, int32 *colId) +sequenceIsOwned(Oid seqId, char deptype, Oid *tableId, int32 *colId) { bool ret = false; Relation depRel; @@ -527,7 +524,7 @@ sequenceIsOwned(Oid seqId, Oid *tableId, int32 *colId) Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup); if (depform->refclassid == RelationRelationId && - depform->deptype == DEPENDENCY_AUTO) + depform->deptype == deptype) { *tableId = depform->refobjid; *colId = depform->refobjsubid; @@ -544,27 +541,15 @@ sequenceIsOwned(Oid seqId, Oid *tableId, int32 *colId) } /* - * Remove any existing "owned" markers for the specified sequence. - * - * Note: we don't provide a special function to install an "owned" - * marker; just use recordDependencyOn(). - */ -void -markSequenceUnowned(Oid seqId) -{ - deleteDependencyRecordsForClass(RelationRelationId, seqId, - RelationRelationId, DEPENDENCY_AUTO); -} - -/* - * Collect a list of OIDs of all sequences owned by the specified relation. + * Collect a list of OIDs of all sequences owned by the specified relation, + * and column if specified. */ List * -getOwnedSequences(Oid relid) +getOwnedSequences(Oid relid, AttrNumber attnum) { List *result = NIL; Relation depRel; - ScanKeyData key[2]; + ScanKeyData key[3]; SysScanDesc scan; HeapTuple tup; @@ -578,23 +563,28 @@ getOwnedSequences(Oid relid) Anum_pg_depend_refobjid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relid)); + if (attnum) + ScanKeyInit(&key[2], + Anum_pg_depend_refobjsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(attnum)); scan = systable_beginscan(depRel, DependReferenceIndexId, true, - NULL, 2, key); + NULL, attnum ? 3 : 2, key); while (HeapTupleIsValid(tup = systable_getnext(scan))) { Form_pg_depend deprec = (Form_pg_depend) GETSTRUCT(tup); /* - * We assume any auto dependency of a sequence on a column must be - * what we are looking for. (We need the relkind test because indexes - * can also have auto dependencies on columns.) + * We assume any auto or internal dependency of a sequence on a column + * must be what we are looking for. (We need the relkind test because + * indexes can also have auto dependencies on columns.) 
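+ * (Typically the AUTO dependency comes from a serial column's sequence
+ * and the INTERNAL one from an identity column's sequence.)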
*/ if (deprec->classid == RelationRelationId && deprec->objsubid == 0 && deprec->refobjsubid != 0 && - deprec->deptype == DEPENDENCY_AUTO && + (deprec->deptype == DEPENDENCY_AUTO || deprec->deptype == DEPENDENCY_INTERNAL) && get_rel_relkind(deprec->objid) == RELKIND_SEQUENCE) { result = lappend_oid(result, deprec->objid); @@ -608,6 +598,21 @@ getOwnedSequences(Oid relid) return result; } +/* + * Get owned sequence, error if not exactly one. + */ +Oid +getOwnedSequence(Oid relid, AttrNumber attnum) +{ + List *seqlist = getOwnedSequences(relid, attnum); + + if (list_length(seqlist) > 1) + elog(ERROR, "more than one owned sequence found"); + else if (list_length(seqlist) < 1) + elog(ERROR, "no owned sequence found"); + + return linitial_oid(seqlist); +} /* * get_constraint_index diff --git a/src/backend/catalog/pg_enum.c b/src/backend/catalog/pg_enum.c index af89daa712..300f24d231 100644 --- a/src/backend/catalog/pg_enum.c +++ b/src/backend/catalog/pg_enum.c @@ -3,7 +3,7 @@ * pg_enum.c * routines to support manipulation of the pg_enum relation * - * Copyright (c) 2006-2016, PostgreSQL Global Development Group + * Copyright (c) 2006-2017, PostgreSQL Global Development Group * * * IDENTIFICATION @@ -24,6 +24,7 @@ #include "catalog/pg_type.h" #include "storage/lmgr.h" #include "miscadmin.h" +#include "nodes/value.h" #include "utils/builtins.h" #include "utils/catcache.h" #include "utils/fmgroids.h" @@ -35,7 +36,6 @@ Oid binary_upgrade_next_pg_enum_oid = InvalidOid; static void RenumberEnumType(Relation pg_enum, HeapTuple *existing, int nelems); -static int oid_cmp(const void *p1, const void *p2); static int sort_order_cmp(const void *p1, const void *p2); @@ -124,8 +124,7 @@ EnumValuesCreate(Oid enumTypeOid, List *vals) tup = heap_form_tuple(RelationGetDescr(pg_enum), values, nulls); HeapTupleSetOid(tup, oids[elemno]); - simple_heap_insert(pg_enum, tup); - CatalogUpdateIndexes(pg_enum, tup); + CatalogTupleInsert(pg_enum, tup); heap_freetuple(tup); elemno++; @@ -161,7 +160,7 @@ EnumValuesDelete(Oid enumTypeOid) while (HeapTupleIsValid(tup = systable_getnext(scan))) { - simple_heap_delete(pg_enum, &tup->t_self); + CatalogTupleDelete(pg_enum, &tup->t_self); } systable_endscan(scan); @@ -315,21 +314,21 @@ restart: newelemorder = nbr_en->enumsortorder + 1; else { - other_nbr_en = (Form_pg_enum) GETSTRUCT(existing[other_nbr_index]); - newelemorder = (nbr_en->enumsortorder + - other_nbr_en->enumsortorder) / 2; - /* - * On some machines, newelemorder may be in a register that's - * wider than float4. We need to force it to be rounded to float4 - * precision before making the following comparisons, or we'll get - * wrong results. (Such behavior violates the C standard, but - * fixing the compilers is out of our reach.) + * The midpoint value computed here has to be rounded to float4 + * precision, else our equality comparisons against the adjacent + * values are meaningless. The most portable way of forcing that + * to happen with non-C-standard-compliant compilers is to store + * it into a volatile variable. 
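+ * (For instance, on x87 hardware the midpoint might otherwise stay in a
+ * wider-than-float4 register, making the equality tests below come out
+ * differently than they would for the rounded float4 value.)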
*/ - newelemorder = DatumGetFloat4(Float4GetDatum(newelemorder)); + volatile float4 midpoint; + + other_nbr_en = (Form_pg_enum) GETSTRUCT(existing[other_nbr_index]); + midpoint = (nbr_en->enumsortorder + + other_nbr_en->enumsortorder) / 2; - if (newelemorder == nbr_en->enumsortorder || - newelemorder == other_nbr_en->enumsortorder) + if (midpoint == nbr_en->enumsortorder || + midpoint == other_nbr_en->enumsortorder) { RenumberEnumType(pg_enum, existing, nelems); /* Clean up and start over */ @@ -337,6 +336,8 @@ restart: ReleaseCatCacheList(list); goto restart; } + + newelemorder = midpoint; } } @@ -455,8 +456,91 @@ restart: values[Anum_pg_enum_enumlabel - 1] = NameGetDatum(&enumlabel); enum_tup = heap_form_tuple(RelationGetDescr(pg_enum), values, nulls); HeapTupleSetOid(enum_tup, newOid); - simple_heap_insert(pg_enum, enum_tup); - CatalogUpdateIndexes(pg_enum, enum_tup); + CatalogTupleInsert(pg_enum, enum_tup); + heap_freetuple(enum_tup); + + heap_close(pg_enum, RowExclusiveLock); +} + + +/* + * RenameEnumLabel + * Rename a label in an enum set. + */ +void +RenameEnumLabel(Oid enumTypeOid, + const char *oldVal, + const char *newVal) +{ + Relation pg_enum; + HeapTuple enum_tup; + Form_pg_enum en; + CatCList *list; + int nelems; + HeapTuple old_tup; + bool found_new; + int i; + + /* check length of new label is ok */ + if (strlen(newVal) > (NAMEDATALEN - 1)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid enum label \"%s\"", newVal), + errdetail("Labels must be %d characters or less.", + NAMEDATALEN - 1))); + + /* + * Acquire a lock on the enum type, which we won't release until commit. + * This ensures that two backends aren't concurrently modifying the same + * enum type. Since we are not changing the type's sort order, this is + * probably not really necessary, but there seems no reason not to take + * the lock to be sure. + */ + LockDatabaseObject(TypeRelationId, enumTypeOid, 0, ExclusiveLock); + + pg_enum = heap_open(EnumRelationId, RowExclusiveLock); + + /* Get the list of existing members of the enum */ + list = SearchSysCacheList1(ENUMTYPOIDNAME, + ObjectIdGetDatum(enumTypeOid)); + nelems = list->n_members; + + /* + * Locate the element to rename and check if the new label is already in + * use. (The unique index on pg_enum would catch that anyway, but we + * prefer a friendlier error message.) 
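+ * For example, renaming 'green' to 'blue' must fail with a "not an
+ * existing enum label" error if 'green' is absent, and with a duplicate
+ * label error if 'blue' is already present.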
+ */ + old_tup = NULL; + found_new = false; + for (i = 0; i < nelems; i++) + { + enum_tup = &(list->members[i]->tuple); + en = (Form_pg_enum) GETSTRUCT(enum_tup); + if (strcmp(NameStr(en->enumlabel), oldVal) == 0) + old_tup = enum_tup; + if (strcmp(NameStr(en->enumlabel), newVal) == 0) + found_new = true; + } + if (!old_tup) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" is not an existing enum label", + oldVal))); + if (found_new) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("enum label \"%s\" already exists", + newVal))); + + /* OK, make a writable copy of old tuple */ + enum_tup = heap_copytuple(old_tup); + en = (Form_pg_enum) GETSTRUCT(enum_tup); + + ReleaseCatCacheList(list); + + /* Update the pg_enum entry */ + namestrcpy(&en->enumlabel, newVal); + CatalogTupleUpdate(pg_enum, &enum_tup->t_self, enum_tup); heap_freetuple(enum_tup); heap_close(pg_enum, RowExclusiveLock); @@ -509,9 +593,7 @@ RenumberEnumType(Relation pg_enum, HeapTuple *existing, int nelems) { en->enumsortorder = newsortorder; - simple_heap_update(pg_enum, &newtup->t_self, newtup); - - CatalogUpdateIndexes(pg_enum, newtup); + CatalogTupleUpdate(pg_enum, &newtup->t_self, newtup); } heap_freetuple(newtup); @@ -522,20 +604,6 @@ RenumberEnumType(Relation pg_enum, HeapTuple *existing, int nelems) } -/* qsort comparison function for oids */ -static int -oid_cmp(const void *p1, const void *p2) -{ - Oid v1 = *((const Oid *) p1); - Oid v2 = *((const Oid *) p2); - - if (v1 < v2) - return -1; - if (v1 > v2) - return 1; - return 0; -} - /* qsort comparison function for tuples by sort order */ static int sort_order_cmp(const void *p1, const void *p2) diff --git a/src/backend/catalog/pg_inherits.c b/src/backend/catalog/pg_inherits.c index 00f2ae0bbb..e5fb52cfbf 100644 --- a/src/backend/catalog/pg_inherits.c +++ b/src/backend/catalog/pg_inherits.c @@ -8,7 +8,7 @@ * Perhaps someday that code should be moved here, but it'd have to be * disentangled from other stuff such as pg_depend updates. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -27,12 +27,20 @@ #include "catalog/pg_inherits_fn.h" #include "parser/parse_type.h" #include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/memutils.h" #include "utils/syscache.h" #include "utils/tqual.h" -static int oid_cmp(const void *p1, const void *p2); - +/* + * Entry of a hash table used in find_all_inheritors. See below. + */ +typedef struct SeenRelsEntry +{ + Oid rel_id; /* relation oid */ + ListCell *numparents_cell; /* corresponding list cell */ +} SeenRelsEntry; /* * find_inheritance_children @@ -158,10 +166,23 @@ find_inheritance_children(Oid parentrelId, LOCKMODE lockmode) List * find_all_inheritors(Oid parentrelId, LOCKMODE lockmode, List **numparents) { + /* hash table for O(1) rel_oid -> rel_numparents cell lookup */ + HTAB *seen_rels; + HASHCTL ctl; List *rels_list, *rel_numparents; ListCell *l; + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(SeenRelsEntry); + ctl.hcxt = CurrentMemoryContext; + + seen_rels = hash_create("find_all_inheritors temporary table", + 32, /* start small and extend */ + &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + /* * We build a list starting with the given rel and adding all direct and * indirect children. 
We can use a single list as both the record of @@ -191,26 +212,21 @@ find_all_inheritors(Oid parentrelId, LOCKMODE lockmode, List **numparents) foreach(lc, currentchildren) { Oid child_oid = lfirst_oid(lc); - bool found = false; - ListCell *lo; - ListCell *li; + bool found; + SeenRelsEntry *hash_entry; - /* if the rel is already there, bump number-of-parents counter */ - forboth(lo, rels_list, li, rel_numparents) + hash_entry = hash_search(seen_rels, &child_oid, HASH_ENTER, &found); + if (found) { - if (lfirst_oid(lo) == child_oid) - { - lfirst_int(li)++; - found = true; - break; - } + /* if the rel is already there, bump number-of-parents counter */ + lfirst_int(hash_entry->numparents_cell)++; } - - /* if it's not there, add it. expect 1 parent, initially. */ - if (!found) + else { + /* if it's not there, add it. expect 1 parent, initially. */ rels_list = lappend_oid(rels_list, child_oid); rel_numparents = lappend_int(rel_numparents, 1); + hash_entry->numparents_cell = rel_numparents->tail; } } } @@ -219,6 +235,9 @@ find_all_inheritors(Oid parentrelId, LOCKMODE lockmode, List **numparents) *numparents = rel_numparents; else list_free(rel_numparents); + + hash_destroy(seen_rels); + return rels_list; } @@ -357,18 +376,3 @@ typeInheritsFrom(Oid subclassTypeId, Oid superclassTypeId) return result; } - - -/* qsort comparison function */ -static int -oid_cmp(const void *p1, const void *p2) -{ - Oid v1 = *((const Oid *) p1); - Oid v2 = *((const Oid *) p2); - - if (v1 < v2) - return -1; - if (v1 > v2) - return 1; - return 0; -} diff --git a/src/backend/catalog/pg_largeobject.c b/src/backend/catalog/pg_largeobject.c index d08b94e28f..fc4f4f8c9b 100644 --- a/src/backend/catalog/pg_largeobject.c +++ b/src/backend/catalog/pg_largeobject.c @@ -3,7 +3,7 @@ * pg_largeobject.c * routines to support manipulation of the pg_largeobject relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -63,11 +63,9 @@ LargeObjectCreate(Oid loid) if (OidIsValid(loid)) HeapTupleSetOid(ntup, loid); - loid_new = simple_heap_insert(pg_lo_meta, ntup); + loid_new = CatalogTupleInsert(pg_lo_meta, ntup); Assert(!OidIsValid(loid) || loid == loid_new); - CatalogUpdateIndexes(pg_lo_meta, ntup); - heap_freetuple(ntup); heap_close(pg_lo_meta, RowExclusiveLock); @@ -112,7 +110,7 @@ LargeObjectDrop(Oid loid) (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("large object %u does not exist", loid))); - simple_heap_delete(pg_lo_meta, &tuple->t_self); + CatalogTupleDelete(pg_lo_meta, &tuple->t_self); systable_endscan(scan); @@ -129,7 +127,7 @@ LargeObjectDrop(Oid loid) NULL, 1, skey); while (HeapTupleIsValid(tuple = systable_getnext(scan))) { - simple_heap_delete(pg_largeobject, &tuple->t_self); + CatalogTupleDelete(pg_largeobject, &tuple->t_self); } systable_endscan(scan); diff --git a/src/backend/catalog/pg_namespace.c b/src/backend/catalog/pg_namespace.c index e5eed79237..3e20d051c2 100644 --- a/src/backend/catalog/pg_namespace.c +++ b/src/backend/catalog/pg_namespace.c @@ -3,7 +3,7 @@ * pg_namespace.c * routines to support manipulation of the pg_namespace relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -31,10 +31,11 @@ * Create a namespace (schema) with the given name and owner 
OID. * * If isTemp is true, this schema is a per-backend schema for holding - * temporary tables. Currently, the only effect of that is to prevent it - * from being linked as a member of any active extension. (If someone - * does CREATE TEMP TABLE in an extension script, we don't want the temp - * schema to become part of the extension.) + * temporary tables. Currently, this is used to prevent the schema from + * being linked as a member of any active extension. (If someone does + * CREATE TEMP TABLE in an extension script, we don't want the temp schema + * to become part of the extension.) It also lets us skip checking for a + * default ACL for the temp namespace, which is not necessary. * --------------- */ Oid @@ -49,6 +50,7 @@ NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp) TupleDesc tupDesc; ObjectAddress myself; int i; + Acl *nspacl; /* sanity checks */ if (!nspName) @@ -60,6 +62,12 @@ NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp) (errcode(ERRCODE_DUPLICATE_SCHEMA), errmsg("schema \"%s\" already exists", nspName))); + if (!isTemp) + nspacl = get_user_default_acl(ACL_OBJECT_NAMESPACE, ownerId, + InvalidOid); + else + nspacl = NULL; + /* initialize nulls and values */ for (i = 0; i < Natts_pg_namespace; i++) { @@ -69,18 +77,19 @@ NamespaceCreate(const char *nspName, Oid ownerId, bool isTemp) namestrcpy(&nname, nspName); values[Anum_pg_namespace_nspname - 1] = NameGetDatum(&nname); values[Anum_pg_namespace_nspowner - 1] = ObjectIdGetDatum(ownerId); - nulls[Anum_pg_namespace_nspacl - 1] = true; + if (nspacl != NULL) + values[Anum_pg_namespace_nspacl - 1] = PointerGetDatum(nspacl); + else + nulls[Anum_pg_namespace_nspacl - 1] = true; nspdesc = heap_open(NamespaceRelationId, RowExclusiveLock); tupDesc = nspdesc->rd_att; tup = heap_form_tuple(tupDesc, values, nulls); - nspoid = simple_heap_insert(nspdesc, tup); + nspoid = CatalogTupleInsert(nspdesc, tup); Assert(OidIsValid(nspoid)); - CatalogUpdateIndexes(nspdesc, tup); - heap_close(nspdesc, RowExclusiveLock); /* Record dependencies */ diff --git a/src/backend/catalog/pg_operator.c b/src/backend/catalog/pg_operator.c index 5b5cd3fc01..b5cbc04889 100644 --- a/src/backend/catalog/pg_operator.c +++ b/src/backend/catalog/pg_operator.c @@ -3,7 +3,7 @@ * pg_operator.c * routines to support manipulation of the pg_operator relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -262,9 +262,7 @@ OperatorShellMake(const char *operatorName, /* * insert our "shell" operator tuple */ - operatorObjectId = simple_heap_insert(pg_operator_desc, tup); - - CatalogUpdateIndexes(pg_operator_desc, tup); + operatorObjectId = CatalogTupleInsert(pg_operator_desc, tup); /* Add dependencies for the entry */ makeOperatorDependencies(tup, false); @@ -526,7 +524,7 @@ OperatorCreate(const char *operatorName, nulls, replaces); - simple_heap_update(pg_operator_desc, &tup->t_self, tup); + CatalogTupleUpdate(pg_operator_desc, &tup->t_self, tup); } else { @@ -535,12 +533,9 @@ OperatorCreate(const char *operatorName, tup = heap_form_tuple(RelationGetDescr(pg_operator_desc), values, nulls); - operatorObjectId = simple_heap_insert(pg_operator_desc, tup); + operatorObjectId = CatalogTupleInsert(pg_operator_desc, tup); } - /* Must update the indexes in either case */ - CatalogUpdateIndexes(pg_operator_desc, tup); - /* Add dependencies for the entry */ + address = makeOperatorDependencies(tup,
isUpdate); @@ -695,8 +690,7 @@ OperatorUpd(Oid baseId, Oid commId, Oid negId, bool isDelete) /* If any columns were found to need modification, update tuple. */ if (update_commutator) { - simple_heap_update(pg_operator_desc, &tup->t_self, tup); - CatalogUpdateIndexes(pg_operator_desc, tup); + CatalogTupleUpdate(pg_operator_desc, &tup->t_self, tup); /* * Do CCI to make the updated tuple visible. We must do this in @@ -741,8 +735,7 @@ OperatorUpd(Oid baseId, Oid commId, Oid negId, bool isDelete) /* If any columns were found to need modification, update tuple. */ if (update_negator) { - simple_heap_update(pg_operator_desc, &tup->t_self, tup); - CatalogUpdateIndexes(pg_operator_desc, tup); + CatalogTupleUpdate(pg_operator_desc, &tup->t_self, tup); /* * In the deletion case, do CCI to make the updated tuple visible. diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index 75621bd6e3..0f7ab80f65 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -4,7 +4,7 @@ * routines to support manipulation of the pg_proc relation * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -38,6 +38,7 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/regproc.h" #include "utils/rel.h" #include "utils/syscache.h" #ifdef PGXC @@ -47,10 +48,6 @@ #endif -Datum fmgr_internal_validator(PG_FUNCTION_ARGS); -Datum fmgr_c_validator(PG_FUNCTION_ARGS); -Datum fmgr_sql_validator(PG_FUNCTION_ARGS); - typedef struct { char *proname; @@ -519,8 +516,7 @@ ProcedureCreate(const char *procedureName, Anum_pg_proc_proargdefaults, &isnull); Assert(!isnull); - oldDefaults = (List *) stringToNode(TextDatumGetCString(proargdefaults)); - Assert(IsA(oldDefaults, List)); + oldDefaults = castNode(List, stringToNode(TextDatumGetCString(proargdefaults))); Assert(list_length(oldDefaults) == oldproc->pronargdefaults); /* new list can have more defaults than old, advance over 'em */ @@ -582,7 +578,7 @@ ProcedureCreate(const char *procedureName, /* Okay, do it... 
*/ tup = heap_modify_tuple(oldtup, tupDesc, values, nulls, replaces); - simple_heap_update(rel, &tup->t_self, tup); + CatalogTupleUpdate(rel, &tup->t_self, tup); ReleaseSysCache(oldtup); is_update = true; @@ -600,12 +596,10 @@ ProcedureCreate(const char *procedureName, nulls[Anum_pg_proc_proacl - 1] = true; tup = heap_form_tuple(tupDesc, values, nulls); - simple_heap_insert(rel, tup); + CatalogTupleInsert(rel, tup); is_update = false; } - /* Need to update indexes for either the insert or update case */ - CatalogUpdateIndexes(rel, tup); retval = HeapTupleGetOid(tup); @@ -940,7 +934,7 @@ fmgr_sql_validator(PG_FUNCTION_ARGS) querytree_list = NIL; foreach(lc, raw_parsetree_list) { - Node *parsetree = (Node *) lfirst(lc); + RawStmt *parsetree = lfirst_node(RawStmt, lc); List *querytree_sublist; #ifdef PGXC @@ -954,9 +948,8 @@ fmgr_sql_validator(PG_FUNCTION_ARGS) querytree_sublist = pg_analyze_and_rewrite_params(parsetree, prosrc, (ParserSetupHook) sql_fn_parser_setup, - pinfo); - - + pinfo, + NULL); querytree_list = list_concat(querytree_list, querytree_sublist); } diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c new file mode 100644 index 0000000000..17105f4f2c --- /dev/null +++ b/src/backend/catalog/pg_publication.c @@ -0,0 +1,465 @@ +/*------------------------------------------------------------------------- + * + * pg_publication.c + * publication C API manipulation + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/catalog/pg_publication.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" + +#include "access/genam.h" +#include "access/hash.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" + +#include "catalog/catalog.h" +#include "catalog/dependency.h" +#include "catalog/index.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include "catalog/objectaccess.h" +#include "catalog/objectaddress.h" +#include "catalog/pg_type.h" +#include "catalog/pg_publication.h" +#include "catalog/pg_publication_rel.h" + +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/fmgroids.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +/* + * Check whether the relation can be added to the given publication, and + * throw an appropriate error if not.
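+ * + * (The last of the checks below relies on RelationNeedsWAL(): logical + * decoding works off WAL, so changes to relations that are not WAL-logged, + * that is temporary and unlogged tables, can never reach a subscriber.)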
+ */ +static void +check_publication_add_relation(Relation targetrel) +{ + /* Give more specific error for partitioned tables */ + if (RelationGetForm(targetrel)->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("\"%s\" is a partitioned table", + RelationGetRelationName(targetrel)), + errdetail("Adding partitioned tables to publications is not supported."), + errhint("You can add the table partitions individually."))); + + /* Must be table */ + if (RelationGetForm(targetrel)->relkind != RELKIND_RELATION) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" is not a table", + RelationGetRelationName(targetrel)), + errdetail("Only tables can be added to publications."))); + + /* Can't be system table */ + if (IsCatalogRelation(targetrel)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" is a system table", + RelationGetRelationName(targetrel)), + errdetail("System tables cannot be added to publications."))); + + /* UNLOGGED and TEMP relations cannot be part of publication. */ + if (!RelationNeedsWAL(targetrel)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("table \"%s\" cannot be replicated", + RelationGetRelationName(targetrel)), + errdetail("Temporary and unlogged relations cannot be replicated."))); +} + +/* + * Returns whether the relation represented by the given oid and + * Form_pg_class entry is publishable. + * + * Does the same checks as above, but does not need the relation to be + * opened and does not throw errors. + * + * Note this also excludes all tables with relid < FirstNormalObjectId, + * ie all tables created during initdb. This mainly affects the preinstalled + * information_schema. (IsCatalogClass() only checks for these inside + * pg_catalog and toast schemas.) + */ +static bool +is_publishable_class(Oid relid, Form_pg_class reltuple) +{ + return reltuple->relkind == RELKIND_RELATION && + !IsCatalogClass(relid, reltuple) && + reltuple->relpersistence == RELPERSISTENCE_PERMANENT && + relid >= FirstNormalObjectId; +} + +/* + * Insert new publication / relation mapping. + */ +ObjectAddress +publication_add_relation(Oid pubid, Relation targetrel, + bool if_not_exists) +{ + Relation rel; + HeapTuple tup; + Datum values[Natts_pg_publication_rel]; + bool nulls[Natts_pg_publication_rel]; + Oid relid = RelationGetRelid(targetrel); + Oid prrelid; + Publication *pub = GetPublication(pubid); + ObjectAddress myself, + referenced; + + rel = heap_open(PublicationRelRelationId, RowExclusiveLock); + + /* + * Check for duplicates. Note that this does not really prevent + * duplicates; it is here just to provide a nicer error message in the + * common case. The real protection is the unique key on the catalog. + */ + if (SearchSysCacheExists2(PUBLICATIONRELMAP, ObjectIdGetDatum(relid), + ObjectIdGetDatum(pubid))) + { + heap_close(rel, RowExclusiveLock); + + if (if_not_exists) + return InvalidObjectAddress; + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("relation \"%s\" is already member of publication \"%s\"", + RelationGetRelationName(targetrel), pub->name))); + } + + check_publication_add_relation(targetrel); + + /* Form a tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + + values[Anum_pg_publication_rel_prpubid - 1] = + ObjectIdGetDatum(pubid); + values[Anum_pg_publication_rel_prrelid - 1] = + ObjectIdGetDatum(relid); + + tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + /* Insert tuple into catalog.
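CatalogTupleInsert() also brings the catalog's indexes up to date; it is the one-call replacement for the simple_heap_insert() plus CatalogUpdateIndexes() pairs removed elsewhere in this patch.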
*/ + prrelid = CatalogTupleInsert(rel, tup); + heap_freetuple(tup); + + ObjectAddressSet(myself, PublicationRelRelationId, prrelid); + + /* Add dependency on the publication */ + ObjectAddressSet(referenced, PublicationRelationId, pubid); + recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); + + /* Add dependency on the relation */ + ObjectAddressSet(referenced, RelationRelationId, relid); + recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); + + /* Close the table. */ + heap_close(rel, RowExclusiveLock); + + /* Invalidate relcache so that publication info is rebuilt. */ + CacheInvalidateRelcache(targetrel); + + return myself; +} + + +/* + * Gets list of publication oids for a relation oid. + */ +List * +GetRelationPublications(Oid relid) +{ + List *result = NIL; + CatCList *pubrellist; + int i; + + /* Find all publications associated with the relation. */ + pubrellist = SearchSysCacheList1(PUBLICATIONRELMAP, + ObjectIdGetDatum(relid)); + for (i = 0; i < pubrellist->n_members; i++) + { + HeapTuple tup = &pubrellist->members[i]->tuple; + Oid pubid = ((Form_pg_publication_rel) GETSTRUCT(tup))->prpubid; + + result = lappend_oid(result, pubid); + } + + ReleaseSysCacheList(pubrellist); + + return result; +} + +/* + * Gets list of relation oids for a publication. + * + * This should only be used for normal publications; FOR ALL TABLES + * publications should use GetAllTablesPublicationRelations(). + */ +List * +GetPublicationRelations(Oid pubid) +{ + List *result; + Relation pubrelsrel; + ScanKeyData scankey; + SysScanDesc scan; + HeapTuple tup; + + /* Find all relations in the publication. */ + pubrelsrel = heap_open(PublicationRelRelationId, AccessShareLock); + + ScanKeyInit(&scankey, + Anum_pg_publication_rel_prpubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(pubid)); + + scan = systable_beginscan(pubrelsrel, PublicationRelPrrelidPrpubidIndexId, + true, NULL, 1, &scankey); + + result = NIL; + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_publication_rel pubrel; + + pubrel = (Form_pg_publication_rel) GETSTRUCT(tup); + + result = lappend_oid(result, pubrel->prrelid); + } + + systable_endscan(scan); + heap_close(pubrelsrel, AccessShareLock); + + return result; +} + +/* + * Gets list of publication oids for publications marked as FOR ALL TABLES. + */ +List * +GetAllTablesPublications(void) +{ + List *result; + Relation rel; + ScanKeyData scankey; + SysScanDesc scan; + HeapTuple tup; + + /* Find all publications that are marked as for all tables. */ + rel = heap_open(PublicationRelationId, AccessShareLock); + + ScanKeyInit(&scankey, + Anum_pg_publication_puballtables, + BTEqualStrategyNumber, F_BOOLEQ, + BoolGetDatum(true)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, 1, &scankey); + + result = NIL; + while (HeapTupleIsValid(tup = systable_getnext(scan))) + result = lappend_oid(result, HeapTupleGetOid(tup)); + + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return result; +} + +/* + * Gets the list of all relations published by FOR ALL TABLES publication(s).
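+ * + * These are collected with a sequential scan over pg_class rather than the + * syscache, since every plain table must be considered; + * is_publishable_class() then filters out catalogs, non-permanent + * relations, and initdb-created objects.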
+ */ +List * +GetAllTablesPublicationRelations(void) +{ + Relation classRel; + ScanKeyData key[1]; + HeapScanDesc scan; + HeapTuple tuple; + List *result = NIL; + + classRel = heap_open(RelationRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_class_relkind, + BTEqualStrategyNumber, F_CHAREQ, + CharGetDatum(RELKIND_RELATION)); + + scan = heap_beginscan_catalog(classRel, 1, key); + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Oid relid = HeapTupleGetOid(tuple); + Form_pg_class relForm = (Form_pg_class) GETSTRUCT(tuple); + + if (is_publishable_class(relid, relForm)) + result = lappend_oid(result, relid); + } + + heap_endscan(scan); + heap_close(classRel, AccessShareLock); + + return result; +} + +/* + * Get publication using oid + * + * The Publication struct and its data are palloc'ed here. + */ +Publication * +GetPublication(Oid pubid) +{ + HeapTuple tup; + Publication *pub; + Form_pg_publication pubform; + + tup = SearchSysCache1(PUBLICATIONOID, ObjectIdGetDatum(pubid)); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for publication %u", pubid); + + pubform = (Form_pg_publication) GETSTRUCT(tup); + + pub = (Publication *) palloc(sizeof(Publication)); + pub->oid = pubid; + pub->name = pstrdup(NameStr(pubform->pubname)); + pub->alltables = pubform->puballtables; + pub->pubactions.pubinsert = pubform->pubinsert; + pub->pubactions.pubupdate = pubform->pubupdate; + pub->pubactions.pubdelete = pubform->pubdelete; + + ReleaseSysCache(tup); + + return pub; +} + + +/* + * Get Publication using name. + */ +Publication * +GetPublicationByName(const char *pubname, bool missing_ok) +{ + Oid oid; + + oid = GetSysCacheOid1(PUBLICATIONNAME, CStringGetDatum(pubname)); + if (!OidIsValid(oid)) + { + if (missing_ok) + return NULL; + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("publication \"%s\" does not exist", pubname))); + } + + return GetPublication(oid); +} + +/* + * get_publication_oid - given a publication name, look up the OID + * + * If missing_ok is false, throw an error if name not found. If true, just + * return InvalidOid. + */ +Oid +get_publication_oid(const char *pubname, bool missing_ok) +{ + Oid oid; + + oid = GetSysCacheOid1(PUBLICATIONNAME, CStringGetDatum(pubname)); + if (!OidIsValid(oid) && !missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("publication \"%s\" does not exist", pubname))); + return oid; +} + +/* + * get_publication_name - given a publication Oid, look up the name + */ +char * +get_publication_name(Oid pubid) +{ + HeapTuple tup; + char *pubname; + Form_pg_publication pubform; + + tup = SearchSysCache1(PUBLICATIONOID, ObjectIdGetDatum(pubid)); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for publication %u", pubid); + + pubform = (Form_pg_publication) GETSTRUCT(tup); + pubname = pstrdup(NameStr(pubform->pubname)); + + ReleaseSysCache(tup); + + return pubname; +} + +/* + * Returns Oids of tables in a publication. 
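+ * + * This is a set-returning function; the list of relation OIDs is built in + * the multi_call_memory_ctx on the first call so that it survives across + * calls, and one OID is handed back per call via SRF_RETURN_NEXT.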
+ */ +Datum +pg_get_publication_tables(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + char *pubname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + Publication *publication; + List *tables; + ListCell **lcp; + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + publication = GetPublicationByName(pubname, false); + if (publication->alltables) + tables = GetAllTablesPublicationRelations(); + else + tables = GetPublicationRelations(publication->oid); + lcp = (ListCell **) palloc(sizeof(ListCell *)); + *lcp = list_head(tables); + funcctx->user_fctx = (void *) lcp; + + MemoryContextSwitchTo(oldcontext); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + lcp = (ListCell **) funcctx->user_fctx; + + while (*lcp != NULL) + { + Oid relid = lfirst_oid(*lcp); + + *lcp = lnext(*lcp); + SRF_RETURN_NEXT(funcctx, ObjectIdGetDatum(relid)); + } + + SRF_RETURN_DONE(funcctx); +} diff --git a/src/backend/catalog/pg_range.c b/src/backend/catalog/pg_range.c index 84e7733e74..a3b0fb8838 100644 --- a/src/backend/catalog/pg_range.c +++ b/src/backend/catalog/pg_range.c @@ -3,7 +3,7 @@ * pg_range.c * routines to support manipulation of the pg_range relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -58,8 +58,7 @@ RangeCreate(Oid rangeTypeOid, Oid rangeSubType, Oid rangeCollation, tup = heap_form_tuple(RelationGetDescr(pg_range), values, nulls); - simple_heap_insert(pg_range, tup); - CatalogUpdateIndexes(pg_range, tup); + CatalogTupleInsert(pg_range, tup); heap_freetuple(tup); /* record type's dependencies on range-related items */ @@ -130,7 +129,7 @@ RangeDelete(Oid rangeTypeOid) while (HeapTupleIsValid(tup = systable_getnext(scan))) { - simple_heap_delete(pg_range, &tup->t_self); + CatalogTupleDelete(pg_range, &tup->t_self); } systable_endscan(scan); diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c index 65ecc45d49..d28a8afb47 100644 --- a/src/backend/catalog/pg_shdepend.c +++ b/src/backend/catalog/pg_shdepend.c @@ -3,7 +3,7 @@ * pg_shdepend.c * routines to support manipulation of the pg_shdepend relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -39,6 +39,8 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" #include "catalog/pg_shdepend.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" @@ -53,7 +55,9 @@ #include "commands/extension.h" #include "commands/policy.h" #include "commands/proclang.h" +#include "commands/publicationcmds.h" #include "commands/schemacmds.h" +#include "commands/subscriptioncmds.h" #include "commands/tablecmds.h" #include "commands/typecmds.h" #include "storage/lmgr.h" @@ -246,7 +250,7 @@ shdepChangeDep(Relation sdepRel, { /* No new entry needed, so just delete existing entry if any */ if (oldtup) - 
simple_heap_delete(sdepRel, &oldtup->t_self); + CatalogTupleDelete(sdepRel, &oldtup->t_self); } else if (oldtup) { @@ -257,10 +261,7 @@ shdepChangeDep(Relation sdepRel, shForm->refclassid = refclassid; shForm->refobjid = refobjid; - simple_heap_update(sdepRel, &oldtup->t_self, oldtup); - - /* keep indexes current */ - CatalogUpdateIndexes(sdepRel, oldtup); + CatalogTupleUpdate(sdepRel, &oldtup->t_self, oldtup); } else { @@ -284,10 +285,7 @@ shdepChangeDep(Relation sdepRel, * it's certainly a new tuple */ oldtup = heap_form_tuple(RelationGetDescr(sdepRel), values, nulls); - simple_heap_insert(sdepRel, oldtup); - - /* keep indexes current */ - CatalogUpdateIndexes(sdepRel, oldtup); + CatalogTupleInsert(sdepRel, oldtup); } if (oldtup) @@ -756,10 +754,7 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId) HeapTuple newtup; newtup = heap_modify_tuple(tup, sdepDesc, values, nulls, replace); - simple_heap_insert(sdepRel, newtup); - - /* Keep indexes current */ - CatalogIndexInsert(indstate, newtup); + CatalogTupleInsertWithInfo(sdepRel, newtup, indstate); heap_freetuple(newtup); } @@ -801,7 +796,7 @@ dropDatabaseDependencies(Oid databaseId) while (HeapTupleIsValid(tup = systable_getnext(scan))) { - simple_heap_delete(sdepRel, &tup->t_self); + CatalogTupleDelete(sdepRel, &tup->t_self); } systable_endscan(scan); @@ -879,10 +874,7 @@ shdepAddDependency(Relation sdepRel, tup = heap_form_tuple(sdepRel->rd_att, values, nulls); - simple_heap_insert(sdepRel, tup); - - /* keep indexes current */ - CatalogUpdateIndexes(sdepRel, tup); + CatalogTupleInsert(sdepRel, tup); /* clean up */ heap_freetuple(tup); @@ -957,7 +949,7 @@ shdepDropDependency(Relation sdepRel, continue; /* OK, delete it */ - simple_heap_delete(sdepRel, &tup->t_self); + CatalogTupleDelete(sdepRel, &tup->t_self); } systable_endscan(scan); @@ -1406,6 +1398,14 @@ shdepReassignOwned(List *roleids, Oid newrole) AlterEventTriggerOwner_oid(sdepForm->objid, newrole); break; + case PublicationRelationId: + AlterPublicationOwner_oid(sdepForm->objid, newrole); + break; + + case SubscriptionRelationId: + AlterSubscriptionOwner_oid(sdepForm->objid, newrole); + break; + /* Generic alter owner cases */ case CollationRelationId: case ConversionRelationId: @@ -1416,6 +1416,7 @@ shdepReassignOwned(List *roleids, Oid newrole) case OperatorFamilyRelationId: case OperatorClassRelationId: case ExtensionRelationId: + case StatisticExtRelationId: case TableSpaceRelationId: case DatabaseRelationId: case TSConfigRelationId: diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c new file mode 100644 index 0000000000..ab5f3719fc --- /dev/null +++ b/src/backend/catalog/pg_subscription.c @@ -0,0 +1,504 @@ +/*------------------------------------------------------------------------- + * + * pg_subscription.c + * replication subscriptions + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/catalog/pg_subscription.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" + +#include "catalog/indexing.h" +#include "catalog/pg_type.h" +#include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" + +#include "nodes/makefuncs.h" + +#include "utils/array.h" +#include "utils/builtins.h" 
+#include "utils/fmgroids.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/syscache.h" + + +static List *textarray_to_stringlist(ArrayType *textarray); + +/* + * Fetch the subscription from the syscache. + */ +Subscription * +GetSubscription(Oid subid, bool missing_ok) +{ + HeapTuple tup; + Subscription *sub; + Form_pg_subscription subform; + Datum datum; + bool isnull; + + tup = SearchSysCache1(SUBSCRIPTIONOID, ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) + { + if (missing_ok) + return NULL; + + elog(ERROR, "cache lookup failed for subscription %u", subid); + } + + subform = (Form_pg_subscription) GETSTRUCT(tup); + + sub = (Subscription *) palloc(sizeof(Subscription)); + sub->oid = subid; + sub->dbid = subform->subdbid; + sub->name = pstrdup(NameStr(subform->subname)); + sub->owner = subform->subowner; + sub->enabled = subform->subenabled; + + /* Get conninfo */ + datum = SysCacheGetAttr(SUBSCRIPTIONOID, + tup, + Anum_pg_subscription_subconninfo, + &isnull); + Assert(!isnull); + sub->conninfo = TextDatumGetCString(datum); + + /* Get slotname */ + datum = SysCacheGetAttr(SUBSCRIPTIONOID, + tup, + Anum_pg_subscription_subslotname, + &isnull); + if (!isnull) + sub->slotname = pstrdup(NameStr(*DatumGetName(datum))); + else + sub->slotname = NULL; + + /* Get synccommit */ + datum = SysCacheGetAttr(SUBSCRIPTIONOID, + tup, + Anum_pg_subscription_subsynccommit, + &isnull); + Assert(!isnull); + sub->synccommit = TextDatumGetCString(datum); + + /* Get publications */ + datum = SysCacheGetAttr(SUBSCRIPTIONOID, + tup, + Anum_pg_subscription_subpublications, + &isnull); + Assert(!isnull); + sub->publications = textarray_to_stringlist(DatumGetArrayTypeP(datum)); + + ReleaseSysCache(tup); + + return sub; +} + +/* + * Return number of subscriptions defined in given database. + * Used by dropdb() to check if database can indeed be dropped. + */ +int +CountDBSubscriptions(Oid dbid) +{ + int nsubs = 0; + Relation rel; + ScanKeyData scankey; + SysScanDesc scan; + HeapTuple tup; + + rel = heap_open(SubscriptionRelationId, RowExclusiveLock); + + ScanKeyInit(&scankey, + Anum_pg_subscription_subdbid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(dbid)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, 1, &scankey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + nsubs++; + + systable_endscan(scan); + + heap_close(rel, NoLock); + + return nsubs; +} + +/* + * Free memory allocated by subscription struct. + */ +void +FreeSubscription(Subscription *sub) +{ + pfree(sub->name); + pfree(sub->conninfo); + if (sub->slotname) + pfree(sub->slotname); + list_free_deep(sub->publications); + pfree(sub); +} + +/* + * get_subscription_oid - given a subscription name, look up the OID + * + * If missing_ok is false, throw an error if name not found. If true, just + * return InvalidOid. 
+ */ +Oid +get_subscription_oid(const char *subname, bool missing_ok) +{ + Oid oid; + + oid = GetSysCacheOid2(SUBSCRIPTIONNAME, MyDatabaseId, + CStringGetDatum(subname)); + if (!OidIsValid(oid) && !missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("subscription \"%s\" does not exist", subname))); + return oid; +} + +/* + * get_subscription_name - given a subscription OID, look up the name + */ +char * +get_subscription_name(Oid subid) +{ + HeapTuple tup; + char *subname; + Form_pg_subscription subform; + + tup = SearchSysCache1(SUBSCRIPTIONOID, ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for subscription %u", subid); + + subform = (Form_pg_subscription) GETSTRUCT(tup); + subname = pstrdup(NameStr(subform->subname)); + + ReleaseSysCache(tup); + + return subname; +} + +/* + * Convert text array to list of strings. + * + * Note: the resulting list of strings is palloc'ed here. + */ +static List * +textarray_to_stringlist(ArrayType *textarray) +{ + Datum *elems; + int nelems, + i; + List *res = NIL; + + deconstruct_array(textarray, + TEXTOID, -1, false, 'i', + &elems, NULL, &nelems); + + if (nelems == 0) + return NIL; + + for (i = 0; i < nelems; i++) + res = lappend(res, makeString(TextDatumGetCString(elems[i]))); + + return res; +} + +/* + * Set the state of a subscription table. + * + * The insert-or-update logic in this function is not concurrency safe, so + * it might raise an error in rare circumstances. But if we took a stronger + * lock such as ShareRowExclusiveLock, we would risk more deadlocks. + */ +Oid +SetSubscriptionRelState(Oid subid, Oid relid, char state, + XLogRecPtr sublsn) +{ + Relation rel; + HeapTuple tup; + Oid subrelid; + bool nulls[Natts_pg_subscription_rel]; + Datum values[Natts_pg_subscription_rel]; + + rel = heap_open(SubscriptionRelRelationId, RowExclusiveLock); + + /* Try finding existing mapping. */ + tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, + ObjectIdGetDatum(relid), + ObjectIdGetDatum(subid)); + + /* + * If the record for the given table does not exist yet, create a new + * one; otherwise update the existing record. + */ + if (!HeapTupleIsValid(tup)) + { + /* Form the tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_subscription_rel_srsubid - 1] = ObjectIdGetDatum(subid); + values[Anum_pg_subscription_rel_srrelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + /* Insert tuple into catalog. */ + subrelid = CatalogTupleInsert(rel, tup); + + heap_freetuple(tup); + } + else + { + bool replaces[Natts_pg_subscription_rel]; + + /* Update the tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + replaces[Anum_pg_subscription_rel_srsubstate - 1] = true; + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + + replaces[Anum_pg_subscription_rel_srsublsn - 1] = true; + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); + + /* Update the catalog.
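CatalogTupleUpdate() performs the heap update and the index maintenance together, mirroring the insert path above.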
*/ + CatalogTupleUpdate(rel, &tup->t_self, tup); + + subrelid = HeapTupleGetOid(tup); + } + + /* Cleanup. */ + heap_close(rel, NoLock); + + return subrelid; +} + +/* + * Get state of subscription table. + * + * Returns SUBREL_STATE_UNKNOWN when not found and missing_ok is true. + */ +char +GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn, + bool missing_ok) +{ + Relation rel; + HeapTuple tup; + char substate; + bool isnull; + Datum d; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + /* Try finding the mapping. */ + tup = SearchSysCache2(SUBSCRIPTIONRELMAP, + ObjectIdGetDatum(relid), + ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) + { + if (missing_ok) + { + heap_close(rel, AccessShareLock); + *sublsn = InvalidXLogRecPtr; + return SUBREL_STATE_UNKNOWN; + } + + elog(ERROR, "subscription table %u in subscription %u does not exist", + relid, subid); + } + + /* Get the state. */ + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, + Anum_pg_subscription_rel_srsubstate, &isnull); + Assert(!isnull); + substate = DatumGetChar(d); + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, + Anum_pg_subscription_rel_srsublsn, &isnull); + if (isnull) + *sublsn = InvalidXLogRecPtr; + else + *sublsn = DatumGetLSN(d); + + /* Cleanup */ + ReleaseSysCache(tup); + heap_close(rel, AccessShareLock); + + return substate; +} + +/* + * Drop subscription relation mapping. These can be for a particular + * subscription, or for a particular relation, or both. + */ +void +RemoveSubscriptionRel(Oid subid, Oid relid) +{ + Relation rel; + HeapScanDesc scan; + ScanKeyData skey[2]; + HeapTuple tup; + int nkeys = 0; + + rel = heap_open(SubscriptionRelRelationId, RowExclusiveLock); + + if (OidIsValid(subid)) + { + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(subid)); + } + + if (OidIsValid(relid)) + { + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srrelid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relid)); + } + + /* Do the search and delete what we found. */ + scan = heap_beginscan_catalog(rel, nkeys, skey); + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + simple_heap_delete(rel, &tup->t_self); + } + heap_endscan(scan); + + heap_close(rel, RowExclusiveLock); +} + + +/* + * Get all relations for subscription. + * + * Returned list is palloc'ed in current memory context. + */ +List * +GetSubscriptionRelations(Oid subid) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + relstate = (SubscriptionRelState *) palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + relstate->lsn = subrel->srsublsn; + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} + +/* + * Get all relations for subscription that are not in a ready state. + * + * Returned list is palloc'ed in current memory context. 
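+ * + * The srsubstate condition is an inequality (F_CHARNE), which no index can + * serve; like GetSubscriptionRelations() above, the scan therefore passes + * InvalidOid and walks pg_subscription_rel sequentially.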
+ */ +List * +GetSubscriptionNotReadyRelations(Oid subid) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubstate, + BTEqualStrategyNumber, F_CHARNE, + CharGetDatum(SUBREL_STATE_READY)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + relstate = (SubscriptionRelState *) palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + relstate->lsn = subrel->srsublsn; + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} diff --git a/src/backend/catalog/pg_type.c b/src/backend/catalog/pg_type.c index 4b2d281f2c..6b0e4f4729 100644 --- a/src/backend/catalog/pg_type.c +++ b/src/backend/catalog/pg_type.c @@ -3,7 +3,7 @@ * pg_type.c * routines to support manipulation of the pg_type relation * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -142,9 +142,7 @@ TypeShellMake(const char *typeName, Oid typeNamespace, Oid ownerId) /* * insert the tuple in the relation and get the tuple's oid. */ - typoid = simple_heap_insert(pg_type_desc, tup); - - CatalogUpdateIndexes(pg_type_desc, tup); + typoid = CatalogTupleInsert(pg_type_desc, tup); /* * Create dependencies. We can/must skip this in bootstrap mode. @@ -430,7 +428,7 @@ TypeCreate(Oid newTypeOid, nulls, replaces); - simple_heap_update(pg_type_desc, &tup->t_self, tup); + CatalogTupleUpdate(pg_type_desc, &tup->t_self, tup); typeObjectId = HeapTupleGetOid(tup); @@ -458,12 +456,9 @@ TypeCreate(Oid newTypeOid, } /* else allow system to assign oid */ - typeObjectId = simple_heap_insert(pg_type_desc, tup); + typeObjectId = CatalogTupleInsert(pg_type_desc, tup); } - /* Update indexes */ - CatalogUpdateIndexes(pg_type_desc, tup); - /* * Create dependencies. We can/must skip this in bootstrap mode. */ @@ -700,6 +695,7 @@ RenameTypeInternal(Oid typeOid, const char *newTypeName, Oid typeNamespace) HeapTuple tuple; Form_pg_type typ; Oid arrayOid; + Oid oldTypeOid; pg_type_desc = heap_open(TypeRelationId, RowExclusiveLock); @@ -713,29 +709,45 @@ RenameTypeInternal(Oid typeOid, const char *newTypeName, Oid typeNamespace) arrayOid = typ->typarray; - /* Just to give a more friendly error than unique-index violation */ - if (SearchSysCacheExists2(TYPENAMENSP, - CStringGetDatum(newTypeName), - ObjectIdGetDatum(typeNamespace))) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("type \"%s\" already exists", newTypeName))); + /* Check for a conflicting type name. */ + oldTypeOid = GetSysCacheOid2(TYPENAMENSP, + CStringGetDatum(newTypeName), + ObjectIdGetDatum(typeNamespace)); + + /* + * If there is one, see if it's an autogenerated array type, and if so + * rename it out of the way. (But we must skip that for a shell type + * because moveArrayTypeName will do the wrong thing in that case.) 
+ * Otherwise, we can at least give a more friendly error than unique-index + * violation. + */ + if (OidIsValid(oldTypeOid)) + { + if (get_typisdefined(oldTypeOid) && + moveArrayTypeName(oldTypeOid, newTypeName, typeNamespace)) + /* successfully dodged the problem */ ; + else + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("type \"%s\" already exists", newTypeName))); + } /* OK, do the rename --- tuple is a copy, so OK to scribble on it */ namestrcpy(&(typ->typname), newTypeName); - simple_heap_update(pg_type_desc, &tuple->t_self, tuple); - - /* update the system catalog indexes */ - CatalogUpdateIndexes(pg_type_desc, tuple); + CatalogTupleUpdate(pg_type_desc, &tuple->t_self, tuple); InvokeObjectPostAlterHook(TypeRelationId, typeOid, 0); heap_freetuple(tuple); heap_close(pg_type_desc, RowExclusiveLock); - /* If the type has an array type, recurse to handle that */ - if (OidIsValid(arrayOid)) + /* + * If the type has an array type, recurse to handle that. But we don't + * need to do anything more if we already renamed that array type above + * (which would happen when, eg, renaming "foo" to "_foo"). + */ + if (OidIsValid(arrayOid) && arrayOid != oldTypeOid) { char *arrname = makeArrayTypeName(newTypeName, typeNamespace); diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c index 297010be9f..a35cf7866d 100644 --- a/src/backend/catalog/pgxc_class.c +++ b/src/backend/catalog/pgxc_class.c @@ -79,9 +79,7 @@ PgxcClassCreate(Oid pcrelid, htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls); - (void) simple_heap_insert(pgxcclassrel, htup); - - CatalogUpdateIndexes(pgxcclassrel, htup); + CatalogTupleInsert(pgxcclassrel, htup); heap_close(pgxcclassrel, RowExclusiveLock); } @@ -176,8 +174,7 @@ PgxcClassAlter(Oid pcrelid, newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel), new_record, new_record_nulls, new_record_repl); - simple_heap_update(rel, &oldtup->t_self, newtup); - CatalogUpdateIndexes(rel, newtup); + CatalogTupleUpdate(rel, &oldtup->t_self, newtup); heap_close(rel, RowExclusiveLock); } diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt index 8956ba9304..2821b9b702 100644 --- a/src/backend/catalog/sql_features.txt +++ b/src/backend/catalog/sql_features.txt @@ -200,7 +200,7 @@ F181 Multiple module support NO F191 Referential delete actions YES F200 TRUNCATE TABLE statement YES F201 CAST function YES -F202 TRUNCATE TABLE: identity column restart option NO +F202 TRUNCATE TABLE: identity column restart option YES F221 Explicit defaults YES F222 INSERT statement: DEFAULT VALUES clause YES F231 Privilege tables YES @@ -241,9 +241,9 @@ F381 Extended schema manipulation 02 ALTER TABLE statement: ADD CONSTRAINT claus F381 Extended schema manipulation 03 ALTER TABLE statement: DROP CONSTRAINT clause YES F382 Alter column data type YES F383 Set column not null clause YES -F384 Drop identity property clause NO +F384 Drop identity property clause YES F385 Drop column generation expression clause NO -F386 Set identity column generation clause NO +F386 Set identity column generation clause YES F391 Long identifiers YES F392 Unicode escapes in identifiers YES F393 Unicode escapes in literals YES @@ -420,11 +420,11 @@ T152 DISTINCT predicate with negation YES T171 LIKE clause in table definition YES T172 AS subquery clause in table definition YES T173 Extended LIKE clause in table definition YES -T174 Identity columns NO +T174 Identity columns YES T175 Generated columns NO T176 Sequence generator support NO -T177 
Sequence generator support: simple restart option NO -T178 Identity columns: simple restart option NO +T177 Sequence generator support: simple restart option YES +T178 Identity columns: simple restart option YES T180 System-versioned tables NO T181 Application-time period tables NO T191 Referential action RESTRICT YES diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index a759e16c72..d5c4754d01 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -4,7 +4,7 @@ * code to create and destroy physical storage for relations * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 4fc5d5a065..0fdad0c119 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1,7 +1,7 @@ /* * PostgreSQL System Views * - * Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Copyright (c) 1996-2017, PostgreSQL Global Development Group * * src/backend/catalog/system_views.sql * @@ -76,6 +76,12 @@ CREATE VIEW pg_policies AS C.relname AS tablename, pol.polname AS policyname, CASE + WHEN pol.polpermissive THEN + 'PERMISSIVE' + ELSE + 'RESTRICTIVE' + END AS permissive, + CASE WHEN pol.polroles = '{0}' THEN string_to_array('public', '') ELSE @@ -130,7 +136,7 @@ CREATE VIEW pg_tables AS C.relrowsecurity AS rowsecurity FROM pg_class C LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) LEFT JOIN pg_tablespace T ON (T.oid = C.reltablespace) - WHERE C.relkind = 'r'; + WHERE C.relkind IN ('r', 'p'); CREATE VIEW pg_matviews AS SELECT @@ -158,6 +164,28 @@ CREATE VIEW pg_indexes AS LEFT JOIN pg_tablespace T ON (T.oid = I.reltablespace) WHERE C.relkind IN ('r', 'm') AND I.relkind = 'i'; +CREATE OR REPLACE VIEW pg_sequences AS + SELECT + N.nspname AS schemaname, + C.relname AS sequencename, + pg_get_userbyid(C.relowner) AS sequenceowner, + S.seqtypid::regtype AS data_type, + S.seqstart AS start_value, + S.seqmin AS min_value, + S.seqmax AS max_value, + S.seqincrement AS increment_by, + S.seqcycle AS cycle, + S.seqcache AS cache_size, + CASE + WHEN has_sequence_privilege(C.oid, 'SELECT,USAGE'::text) + THEN pg_sequence_last_value(C.oid) + ELSE NULL + END AS last_value + FROM pg_sequence S JOIN pg_class C ON (C.oid = S.seqrelid) + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE NOT pg_is_other_temp_schema(N.oid) + AND relkind = 'S'; + CREATE VIEW pg_stats WITH (security_barrier) AS SELECT nspname AS schemaname, @@ -225,6 +253,15 @@ CREATE VIEW pg_stats WITH (security_barrier) AS REVOKE ALL on pg_statistic FROM public; +CREATE VIEW pg_publication_tables AS + SELECT + P.pubname AS pubname, + N.nspname AS schemaname, + C.relname AS tablename + FROM pg_publication P, pg_class C + JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE C.oid IN (SELECT relid FROM pg_get_publication_tables(P.pubname)); + CREATE VIEW pg_locks AS SELECT * FROM pg_lock_status() AS L; @@ -257,7 +294,7 @@ CREATE VIEW pg_prepared_statements AS CREATE VIEW pg_seclabels AS SELECT l.objoid, l.classoid, l.objsubid, - CASE WHEN rel.relkind = 'r' THEN 'table'::text + CASE WHEN rel.relkind IN ('r', 'p') THEN 'table'::text WHEN rel.relkind = 'v' THEN 'view'::text WHEN rel.relkind = 'm' THEN 'materialized view'::text WHEN rel.relkind = 'S' 
THEN 'sequence'::text @@ -378,6 +415,28 @@ WHERE l.objsubid = 0 UNION ALL SELECT + l.objoid, l.classoid, l.objsubid, + 'publication'::text AS objtype, + NULL::oid AS objnamespace, + quote_ident(p.pubname) AS objname, + l.provider, l.label +FROM + pg_seclabel l + JOIN pg_publication p ON l.classoid = p.tableoid AND l.objoid = p.oid +WHERE + l.objsubid = 0 +UNION ALL +SELECT + l.objoid, l.classoid, 0::int4 AS objsubid, + 'subscription'::text AS objtype, + NULL::oid AS objnamespace, + quote_ident(s.subname) AS objname, + l.provider, l.label +FROM + pg_shseclabel l + JOIN pg_subscription s ON l.classoid = s.tableoid AND l.objoid = s.oid +UNION ALL +SELECT l.objoid, l.classoid, 0::int4 AS objsubid, 'database'::text AS objtype, NULL::oid AS objnamespace, @@ -427,6 +486,12 @@ CREATE VIEW pg_file_settings AS REVOKE ALL on pg_file_settings FROM PUBLIC; REVOKE EXECUTE ON FUNCTION pg_show_all_file_settings() FROM PUBLIC; +CREATE VIEW pg_hba_file_rules AS + SELECT * FROM pg_hba_file_rules() AS A; + +REVOKE ALL on pg_hba_file_rules FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION pg_hba_file_rules() FROM PUBLIC; + CREATE VIEW pg_timezone_abbrevs AS SELECT * FROM pg_timezone_abbrevs(); @@ -641,10 +706,11 @@ CREATE VIEW pg_stat_activity AS S.state, S.backend_xid, s.backend_xmin, - S.query - FROM pg_database D, pg_stat_get_activity(NULL) AS S, pg_authid U - WHERE S.datid = D.oid AND - S.usesysid = U.oid; + S.query, + S.backend_type + FROM pg_stat_get_activity(NULL) AS S + LEFT JOIN pg_database AS D ON (S.datid = D.oid) + LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid); CREATE VIEW pg_stat_replication AS SELECT @@ -658,16 +724,18 @@ CREATE VIEW pg_stat_replication AS S.backend_start, S.backend_xmin, W.state, - W.sent_location, - W.write_location, - W.flush_location, - W.replay_location, + W.sent_lsn, + W.write_lsn, + W.flush_lsn, + W.replay_lsn, + W.write_lag, + W.flush_lag, + W.replay_lag, W.sync_priority, W.sync_state - FROM pg_stat_get_activity(NULL) AS S, pg_authid U, - pg_stat_get_wal_senders() AS W - WHERE S.usesysid = U.oid AND - S.pid = W.pid; + FROM pg_stat_get_activity(NULL) AS S + JOIN pg_stat_get_wal_senders() AS W ON (S.pid = W.pid) + LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid); CREATE VIEW pg_stat_wal_receiver AS SELECT @@ -686,6 +754,21 @@ CREATE VIEW pg_stat_wal_receiver AS FROM pg_stat_get_wal_receiver() s WHERE s.pid IS NOT NULL; +CREATE VIEW pg_stat_subscription AS + SELECT + su.oid AS subid, + su.subname, + st.pid, + st.relid, + st.received_lsn, + st.last_msg_send_time, + st.last_msg_receipt_time, + st.latest_end_lsn, + st.latest_end_time + FROM pg_subscription su + LEFT JOIN pg_stat_get_subscription(NULL) st + ON (st.subid = su.oid); + CREATE VIEW pg_stat_ssl AS SELECT S.pid, @@ -704,6 +787,7 @@ CREATE VIEW pg_replication_slots AS L.slot_type, L.datoid, D.datname AS database, + L.temporary, L.active, L.active_pid, L.xmin, @@ -813,7 +897,7 @@ CREATE VIEW pg_stat_progress_vacuum AS S.param4 AS heap_blks_vacuumed, S.param5 AS index_vacuum_count, S.param6 AS max_dead_tuples, S.param7 AS num_dead_tuples FROM pg_stat_get_progress_info('VACUUM') AS S - JOIN pg_database D ON S.datid = D.oid; + LEFT JOIN pg_database D ON S.datid = D.oid; CREATE VIEW pg_user_mappings AS SELECT @@ -826,24 +910,29 @@ CREATE VIEW pg_user_mappings AS ELSE A.rolname END AS usename, - CASE WHEN pg_has_role(S.srvowner, 'USAGE') OR has_server_privilege(S.oid, 'USAGE') THEN - U.umoptions - ELSE - NULL - END AS umoptions + CASE WHEN (U.umuser <> 0 AND A.rolname = current_user) + OR (U.umuser = 0 AND pg_has_role(S.srvowner, 
'USAGE')) + OR (SELECT rolsuper FROM pg_authid WHERE rolname = current_user) + THEN U.umoptions + ELSE NULL END AS umoptions FROM pg_user_mapping U - LEFT JOIN pg_authid A ON (A.oid = U.umuser) JOIN - pg_foreign_server S ON (U.umserver = S.oid); + JOIN pg_foreign_server S ON (U.umserver = S.oid) + LEFT JOIN pg_authid A ON (A.oid = U.umuser); REVOKE ALL on pg_user_mapping FROM public; - CREATE VIEW pg_replication_origin_status AS SELECT * FROM pg_show_replication_origin_status(); REVOKE ALL ON pg_replication_origin_status FROM public; +-- All columns of pg_subscription except subconninfo are readable. +REVOKE ALL ON pg_subscription FROM public; +GRANT SELECT (subdbid, subname, subowner, subenabled, subslotname, subpublications) + ON pg_subscription TO public; + + -- -- We have a few function definitions in here, too. -- At some point there might be enough to justify breaking them out into @@ -926,6 +1015,12 @@ CREATE OR REPLACE FUNCTION RETURNS pg_lsn STRICT VOLATILE LANGUAGE internal AS 'pg_start_backup' PARALLEL RESTRICTED; +CREATE OR REPLACE FUNCTION pg_stop_backup ( + exclusive boolean, wait_for_archive boolean DEFAULT true, + OUT lsn pg_lsn, OUT labelfile text, OUT spcmapfile text) + RETURNS SETOF record STRICT VOLATILE LANGUAGE internal as 'pg_stop_backup_v2' + PARALLEL RESTRICTED; + -- legacy definition for compatibility with 9.3 CREATE OR REPLACE FUNCTION json_populate_record(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) @@ -938,7 +1033,7 @@ CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION pg_logical_slot_get_changes( IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT location pg_lsn, OUT xid xid, OUT data text) + OUT lsn pg_lsn, OUT xid xid, OUT data text) RETURNS SETOF RECORD LANGUAGE INTERNAL VOLATILE ROWS 1000 COST 1000 @@ -946,7 +1041,7 @@ AS 'pg_logical_slot_get_changes'; CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes( IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT location pg_lsn, OUT xid xid, OUT data text) + OUT lsn pg_lsn, OUT xid xid, OUT data text) RETURNS SETOF RECORD LANGUAGE INTERNAL VOLATILE ROWS 1000 COST 1000 @@ -954,7 +1049,7 @@ AS 'pg_logical_slot_peek_changes'; CREATE OR REPLACE FUNCTION pg_logical_slot_get_binary_changes( IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT location pg_lsn, OUT xid xid, OUT data bytea) + OUT lsn pg_lsn, OUT xid xid, OUT data bytea) RETURNS SETOF RECORD LANGUAGE INTERNAL VOLATILE ROWS 1000 COST 1000 @@ -962,7 +1057,7 @@ AS 'pg_logical_slot_get_binary_changes'; CREATE OR REPLACE FUNCTION pg_logical_slot_peek_binary_changes( IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT location pg_lsn, OUT xid xid, OUT data bytea) + OUT lsn pg_lsn, OUT xid xid, OUT data bytea) RETURNS SETOF RECORD LANGUAGE INTERNAL VOLATILE ROWS 1000 COST 1000 @@ -970,12 +1065,22 @@ AS 'pg_logical_slot_peek_binary_changes'; CREATE OR REPLACE FUNCTION pg_create_physical_replication_slot( IN slot_name name, IN immediately_reserve boolean DEFAULT false, - OUT slot_name name, OUT xlog_position pg_lsn) + IN temporary boolean DEFAULT false, + OUT slot_name name, OUT lsn pg_lsn) RETURNS RECORD LANGUAGE INTERNAL STRICT VOLATILE AS 'pg_create_physical_replication_slot'; +CREATE OR REPLACE FUNCTION pg_create_logical_replication_slot( + IN slot_name name, IN plugin name, + IN temporary boolean DEFAULT false, + OUT 
slot_name text, OUT lsn pg_lsn) +RETURNS RECORD +LANGUAGE INTERNAL +STRICT VOLATILE +AS 'pg_create_logical_replication_slot'; + CREATE OR REPLACE FUNCTION make_interval(years int4 DEFAULT 0, months int4 DEFAULT 0, weeks int4 DEFAULT 0, days int4 DEFAULT 0, hours int4 DEFAULT 0, mins int4 DEFAULT 0, @@ -1016,15 +1121,26 @@ AS 'jsonb_insert'; -- available to superuser / cluster owner, if they choose. REVOKE EXECUTE ON FUNCTION pg_start_backup(text, boolean, boolean) FROM public; REVOKE EXECUTE ON FUNCTION pg_stop_backup() FROM public; -REVOKE EXECUTE ON FUNCTION pg_stop_backup(boolean) FROM public; +REVOKE EXECUTE ON FUNCTION pg_stop_backup(boolean, boolean) FROM public; REVOKE EXECUTE ON FUNCTION pg_create_restore_point(text) FROM public; -REVOKE EXECUTE ON FUNCTION pg_switch_xlog() FROM public; -REVOKE EXECUTE ON FUNCTION pg_xlog_replay_pause() FROM public; -REVOKE EXECUTE ON FUNCTION pg_xlog_replay_resume() FROM public; +REVOKE EXECUTE ON FUNCTION pg_switch_wal() FROM public; +REVOKE EXECUTE ON FUNCTION pg_wal_replay_pause() FROM public; +REVOKE EXECUTE ON FUNCTION pg_wal_replay_resume() FROM public; REVOKE EXECUTE ON FUNCTION pg_rotate_logfile() FROM public; REVOKE EXECUTE ON FUNCTION pg_reload_conf() FROM public; +REVOKE EXECUTE ON FUNCTION pg_current_logfile() FROM public; +REVOKE EXECUTE ON FUNCTION pg_current_logfile(text) FROM public; REVOKE EXECUTE ON FUNCTION pg_stat_reset() FROM public; REVOKE EXECUTE ON FUNCTION pg_stat_reset_shared(text) FROM public; REVOKE EXECUTE ON FUNCTION pg_stat_reset_single_table_counters(oid) FROM public; REVOKE EXECUTE ON FUNCTION pg_stat_reset_single_function_counters(oid) FROM public; + +REVOKE EXECUTE ON FUNCTION pg_ls_logdir() FROM public; +REVOKE EXECUTE ON FUNCTION pg_ls_waldir() FROM public; +GRANT EXECUTE ON FUNCTION pg_ls_logdir() TO pg_monitor; +GRANT EXECUTE ON FUNCTION pg_ls_waldir() TO pg_monitor; + +GRANT pg_read_all_settings TO pg_monitor; +GRANT pg_read_all_stats TO pg_monitor; +GRANT pg_stat_scan_tables TO pg_monitor; diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 564e10e3a2..29756eb14e 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -4,7 +4,7 @@ * This file contains routines to support creation of toast tables * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -307,7 +307,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, indexInfo->ii_Expressions = NIL; indexInfo->ii_ExpressionsState = NIL; indexInfo->ii_Predicate = NIL; - indexInfo->ii_PredicateState = NIL; + indexInfo->ii_PredicateState = NULL; indexInfo->ii_ExclusionOps = NULL; indexInfo->ii_ExclusionProcs = NULL; indexInfo->ii_ExclusionStrats = NULL; @@ -315,6 +315,8 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; indexInfo->ii_BrokenHotChain = false; + indexInfo->ii_AmCache = NULL; + indexInfo->ii_Context = CurrentMemoryContext; collationObjectId[0] = InvalidOid; collationObjectId[1] = InvalidOid; @@ -350,10 +352,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, if (!IsBootstrapProcessingMode()) { /* normal case, use a transactional update */ - simple_heap_update(class_rel, &reltup->t_self, reltup); - - /* Keep catalog indexes current */ - CatalogUpdateIndexes(class_rel, reltup); + 
CatalogTupleUpdate(class_rel, &reltup->t_self, reltup); } else { diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index 6b3742c0a0..4a6c99e090 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -17,9 +17,9 @@ OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \ dbcommands.o define.o discard.o dropcmds.o \ event_trigger.o explain.o extension.o foreigncmds.o functioncmds.o \ indexcmds.o lockcmds.o matview.o operatorcmds.o opclasscmds.o \ - policy.o portalcmds.o prepare.o proclang.o \ - schemacmds.o seclabel.o sequence.o tablecmds.o tablespace.o trigger.o \ - tsearchcmds.o typecmds.o user.o vacuum.o vacuumlazy.o \ - variable.o view.o + policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \ + schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \ + tablecmds.o tablespace.o trigger.o tsearchcmds.o typecmds.o user.o \ + vacuum.o vacuumlazy.o variable.o view.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/commands/aggregatecmds.c b/src/backend/commands/aggregatecmds.c index d34c82c5ba..a84c61493f 100644 --- a/src/backend/commands/aggregatecmds.c +++ b/src/backend/commands/aggregatecmds.c @@ -4,7 +4,7 @@ * * Routines for aggregate-manipulation commands * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -22,7 +22,6 @@ */ #include "postgres.h" -#include "access/heapam.h" #include "access/htup_details.h" #include "catalog/dependency.h" #include "catalog/indexing.h" @@ -52,8 +51,7 @@ * "parameters" is a list of DefElem representing the agg's definition clauses. 
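 * * The ParseState argument (which in this patch replaces the raw query * string) presumably exists so that interpret_function_parameter_list() can * attach error cursor positions to complaints about the parameter list.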
*/ ObjectAddress -DefineAggregate(List *name, List *args, bool oldstyle, List *parameters, - const char *queryString) +DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle, List *parameters) { char *aggName; Oid aggNamespace; @@ -111,13 +109,13 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters, aggKind = AGGKIND_ORDERED_SET; else numDirectArgs = 0; - args = (List *) linitial(args); + args = linitial_node(List, args); } /* Examine aggregate's definition clauses */ foreach(pl, parameters) { - DefElem *defel = (DefElem *) lfirst(pl); + DefElem *defel = lfirst_node(DefElem, pl); /* * sfunc1, stype1, and initcond1 are accepted as obsolete spellings @@ -287,10 +285,10 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters, errmsg("basetype is redundant with aggregate input type specification"))); numArgs = list_length(args); - interpret_function_parameter_list(args, + interpret_function_parameter_list(pstate, + args, InvalidOid, true, /* is an aggregate */ - queryString, ¶meterTypes, &allParameterTypes, ¶meterModes, diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index 1301bcb5e8..4d3fe8c745 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -3,7 +3,7 @@ * alter.c * Drivers for generic alter commands * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -32,6 +32,8 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" +#include "catalog/pg_subscription.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" #include "catalog/pg_ts_parser.h" @@ -45,7 +47,9 @@ #include "commands/extension.h" #include "commands/policy.h" #include "commands/proclang.h" +#include "commands/publicationcmds.h" #include "commands/schemacmds.h" +#include "commands/subscriptioncmds.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" #include "commands/trigger.h" @@ -88,6 +92,12 @@ report_name_conflict(Oid classId, const char *name) case LanguageRelationId: msgfmt = gettext_noop("language \"%s\" already exists"); break; + case PublicationRelationId: + msgfmt = gettext_noop("publication \"%s\" already exists"); + break; + case SubscriptionRelationId: + msgfmt = gettext_noop("subscription \"%s\" already exists"); + break; default: elog(ERROR, "unsupported object class %u", classId); break; @@ -111,6 +121,10 @@ report_namespace_conflict(Oid classId, const char *name, Oid nspOid) Assert(OidIsValid(nspOid)); msgfmt = gettext_noop("conversion \"%s\" already exists in schema \"%s\""); break; + case StatisticExtRelationId: + Assert(OidIsValid(nspOid)); + msgfmt = gettext_noop("statistics object \"%s\" already exists in schema \"%s\""); + break; case TSParserRelationId: Assert(OidIsValid(nspOid)); msgfmt = gettext_noop("text search parser \"%s\" already exists in schema \"%s\""); @@ -254,6 +268,12 @@ AlterObjectRename_internal(Relation rel, Oid objectId, const char *new_name) IsThereOpFamilyInNamespace(new_name, opf->opfmethod, opf->opfnamespace); } + else if (classId == SubscriptionRelationId) + { + if (SearchSysCacheExists2(SUBSCRIPTIONNAME, MyDatabaseId, + CStringGetDatum(new_name))) + report_name_conflict(classId, new_name); + } else if (nameCacheId >= 0) { if (OidIsValid(namespaceId)) @@ -282,8 +302,7 @@ 
AlterObjectRename_internal(Relation rel, Oid objectId, const char *new_name) values, nulls, replaces); /* Perform actual update */ - simple_heap_update(rel, &oldtup->t_self, newtup); - CatalogUpdateIndexes(rel, newtup); + CatalogTupleUpdate(rel, &oldtup->t_self, newtup); InvokeObjectPostAlterHook(classId, objectId, 0); @@ -359,17 +378,20 @@ ExecRenameStmt(RenameStmt *stmt) case OBJECT_OPCLASS: case OBJECT_OPFAMILY: case OBJECT_LANGUAGE: + case OBJECT_STATISTIC_EXT: case OBJECT_TSCONFIGURATION: case OBJECT_TSDICTIONARY: case OBJECT_TSPARSER: case OBJECT_TSTEMPLATE: + case OBJECT_PUBLICATION: + case OBJECT_SUBSCRIPTION: { ObjectAddress address; Relation catalog; Relation relation; address = get_object_address(stmt->renameType, - stmt->object, stmt->objarg, + stmt->object, &relation, AccessExclusiveLock, false); Assert(relation == NULL); @@ -405,8 +427,8 @@ ExecAlterObjectDependsStmt(AlterObjectDependsStmt *stmt, ObjectAddress *refAddre Relation rel; address = - get_object_address_rv(stmt->objectType, stmt->relation, stmt->objname, - stmt->objargs, &rel, AccessExclusiveLock, false); + get_object_address_rv(stmt->objectType, stmt->relation, (List *) stmt->object, + &rel, AccessExclusiveLock, false); /* * If a relation was involved, it would have been opened and locked. We @@ -415,8 +437,8 @@ ExecAlterObjectDependsStmt(AlterObjectDependsStmt *stmt, ObjectAddress *refAddre if (rel) heap_close(rel, NoLock); - refAddr = get_object_address(OBJECT_EXTENSION, list_make1(stmt->extname), - NULL, &rel, AccessExclusiveLock, false); + refAddr = get_object_address(OBJECT_EXTENSION, (Node *) stmt->extname, + &rel, AccessExclusiveLock, false); Assert(rel == NULL); if (refAddress) *refAddress = refAddr; @@ -445,7 +467,7 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, switch (stmt->objectType) { case OBJECT_EXTENSION: - address = AlterExtensionNamespace(stmt->object, stmt->newschema, + address = AlterExtensionNamespace(strVal((Value *) stmt->object), stmt->newschema, oldSchemaAddr ? &oldNspOid : NULL); break; @@ -460,7 +482,7 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, case OBJECT_DOMAIN: case OBJECT_TYPE: - address = AlterTypeNamespace(stmt->object, stmt->newschema, + address = AlterTypeNamespace(castNode(List, stmt->object), stmt->newschema, stmt->objectType, oldSchemaAddr ? &oldNspOid : NULL); break; @@ -473,6 +495,7 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, case OBJECT_OPERATOR: case OBJECT_OPCLASS: case OBJECT_OPFAMILY: + case OBJECT_STATISTIC_EXT: case OBJECT_TSCONFIGURATION: case OBJECT_TSDICTIONARY: case OBJECT_TSPARSER: @@ -485,7 +508,6 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, address = get_object_address(stmt->objectType, stmt->object, - stmt->objarg, &relation, AccessExclusiveLock, false); @@ -521,7 +543,8 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, * so it only needs to cover object types that can be members of an * extension, and it doesn't have to deal with certain special cases * such as not wanting to process array types --- those should never - * be direct members of an extension anyway. + * be direct members of an extension anyway. Nonetheless, we insist + * on listing all OCLASS types in the switch. * * Returns the OID of the object's previous namespace, or InvalidOid if * object doesn't have a schema. 
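The "listing all OCLASS types in the switch" convention mentioned in the comment above is a defensive C idiom: when a switch over an enum handles every enumerator and omits the default label, a compiler with -Wswitch warns as soon as a new enumerator appears without a matching case, so "new OCLASS added but this switch forgotten" becomes a compile-time diagnostic instead of a silent fall-through. A minimal, self-contained illustration of the idiom, with hypothetical names that are not part of this patch:

typedef enum DemoClass
{
	DEMO_A,
	DEMO_B,
	DEMO_C
} DemoClass;

static const char *
demo_class_label(DemoClass c)
{
	switch (c)
	{
		case DEMO_A:
			return "a";
		case DEMO_B:
			return "b";
		case DEMO_C:
			return "c";

			/*
			 * There's intentionally no default: case here, so a compiler
			 * that supports -Wswitch will warn as soon as a new DemoClass
			 * value appears without a matching case above.
			 */
	}
	return "unrecognized";		/* not reached; keeps -Wreturn-type quiet */
}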
@@ -556,12 +579,13 @@ AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid, oldNspOid = AlterTypeNamespace_oid(objid, nspOid, objsMoved); break; + case OCLASS_PROC: case OCLASS_COLLATION: case OCLASS_CONVERSION: case OCLASS_OPERATOR: case OCLASS_OPCLASS: case OCLASS_OPFAMILY: - case OCLASS_PROC: + case OCLASS_STATISTIC_EXT: case OCLASS_TSPARSER: case OCLASS_TSDICT: case OCLASS_TSTEMPLATE: @@ -578,8 +602,38 @@ AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid, } break; - default: + case OCLASS_CAST: + case OCLASS_CONSTRAINT: + case OCLASS_DEFAULT: + case OCLASS_LANGUAGE: + case OCLASS_LARGEOBJECT: + case OCLASS_AM: + case OCLASS_AMOP: + case OCLASS_AMPROC: + case OCLASS_REWRITE: + case OCLASS_TRIGGER: + case OCLASS_SCHEMA: + case OCLASS_ROLE: + case OCLASS_DATABASE: + case OCLASS_TBLSPACE: + case OCLASS_FDW: + case OCLASS_FOREIGN_SERVER: + case OCLASS_USER_MAPPING: + case OCLASS_DEFACL: + case OCLASS_EXTENSION: + case OCLASS_EVENT_TRIGGER: + case OCLASS_POLICY: + case OCLASS_PUBLICATION: + case OCLASS_PUBLICATION_REL: + case OCLASS_SUBSCRIPTION: + case OCLASS_TRANSFORM: + /* ignore object types that don't have schema-qualified names */ break; + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new OCLASS hasn't been handled above. + */ } return oldNspOid; @@ -720,8 +774,7 @@ AlterObjectNamespace_internal(Relation rel, Oid objid, Oid nspOid) values, nulls, replaces); /* Perform actual update */ - simple_heap_update(rel, &tup->t_self, newtup); - CatalogUpdateIndexes(rel, newtup); + CatalogTupleUpdate(rel, &tup->t_self, newtup); /* Release memory */ pfree(values); @@ -749,26 +802,34 @@ ExecAlterOwnerStmt(AlterOwnerStmt *stmt) switch (stmt->objectType) { case OBJECT_DATABASE: - return AlterDatabaseOwner(strVal(linitial(stmt->object)), newowner); + return AlterDatabaseOwner(strVal((Value *) stmt->object), newowner); case OBJECT_SCHEMA: - return AlterSchemaOwner(strVal(linitial(stmt->object)), newowner); + return AlterSchemaOwner(strVal((Value *) stmt->object), newowner); case OBJECT_TYPE: case OBJECT_DOMAIN: /* same as TYPE */ - return AlterTypeOwner(stmt->object, newowner, stmt->objectType); + return AlterTypeOwner(castNode(List, stmt->object), newowner, stmt->objectType); break; case OBJECT_FDW: - return AlterForeignDataWrapperOwner(strVal(linitial(stmt->object)), + return AlterForeignDataWrapperOwner(strVal((Value *) stmt->object), newowner); case OBJECT_FOREIGN_SERVER: - return AlterForeignServerOwner(strVal(linitial(stmt->object)), + return AlterForeignServerOwner(strVal((Value *) stmt->object), newowner); case OBJECT_EVENT_TRIGGER: - return AlterEventTriggerOwner(strVal(linitial(stmt->object)), + return AlterEventTriggerOwner(strVal((Value *) stmt->object), + newowner); + + case OBJECT_PUBLICATION: + return AlterPublicationOwner(strVal((Value *) stmt->object), + newowner); + + case OBJECT_SUBSCRIPTION: + return AlterSubscriptionOwner(strVal((Value *) stmt->object), newowner); /* Generic cases */ @@ -781,6 +842,7 @@ ExecAlterOwnerStmt(AlterOwnerStmt *stmt) case OBJECT_OPERATOR: case OBJECT_OPCLASS: case OBJECT_OPFAMILY: + case OBJECT_STATISTIC_EXT: case OBJECT_TABLESPACE: case OBJECT_TSDICTIONARY: case OBJECT_TSCONFIGURATION: @@ -792,7 +854,6 @@ ExecAlterOwnerStmt(AlterOwnerStmt *stmt) address = get_object_address(stmt->objectType, stmt->object, - stmt->objarg, &relation, AccessExclusiveLock, false); @@ -945,8 +1006,7 @@ AlterObjectOwner_internal(Relation rel, Oid objectId, Oid new_ownerId) values, nulls, replaces); /* Perform 
actual update */ - simple_heap_update(rel, &newtup->t_self, newtup); - CatalogUpdateIndexes(rel, newtup); + CatalogTupleUpdate(rel, &newtup->t_self, newtup); /* Update owner dependency reference */ if (classId == LargeObjectMetadataRelationId) diff --git a/src/backend/commands/amcmds.c b/src/backend/commands/amcmds.c index 9ac930ea8b..7e0a9aa0fd 100644 --- a/src/backend/commands/amcmds.c +++ b/src/backend/commands/amcmds.c @@ -3,7 +3,7 @@ * amcmds.c * Routines for SQL commands that manipulate access methods. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -34,7 +34,7 @@ static const char *get_am_type_string(char amtype); /* - * CreateAcessMethod + * CreateAccessMethod * Registers a new access method. */ ObjectAddress @@ -87,8 +87,7 @@ CreateAccessMethod(CreateAmStmt *stmt) tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); - amoid = simple_heap_insert(rel, tup); - CatalogUpdateIndexes(rel, tup); + amoid = CatalogTupleInsert(rel, tup); heap_freetuple(tup); myself.classId = AccessMethodRelationId; @@ -129,7 +128,7 @@ RemoveAccessMethodById(Oid amOid) if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for access method %u", amOid); - simple_heap_delete(relation, &tup->t_self); + CatalogTupleDelete(relation, &tup->t_self); ReleaseSysCache(tup); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 14aad4fd7c..67e4146c6c 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -4,7 +4,7 @@ * the Postgres statistics generator * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -18,6 +18,7 @@ #include <math.h> #include "access/multixact.h" +#include "access/sysattr.h" #include "access/transam.h" #include "access/tupconvert.h" #include "access/tuptoaster.h" @@ -29,6 +30,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_statistic_ext.h" #include "commands/dbcommands.h" #include "commands/tablecmds.h" #include "commands/vacuum.h" @@ -40,13 +42,17 @@ #include "parser/parse_relation.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" #include "utils/attoptcache.h" +#include "utils/builtins.h" #include "utils/datum.h" +#include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -246,6 +252,12 @@ analyze_rel(Oid relid, RangeVar *relation, int options, return; } } + else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + /* + * For partitioned tables, we want to do the recursive ANALYZE below. + */ + } else { /* No need for a WARNING if we already complained during VACUUM */ @@ -265,10 +277,12 @@ analyze_rel(Oid relid, RangeVar *relation, int options, LWLockRelease(ProcArrayLock); /* - * Do the normal non-recursive ANALYZE. + * Do the normal non-recursive ANALYZE. We can skip this for partitioned + * tables, which don't contain any rows. 
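The AlterObjectNamespace_oid hunk above now lists every OCLASS value explicitly and, per its comment, intentionally omits a default: label. A standalone, compilable illustration of that idiom (toy code, not PostgreSQL's): with -Wswitch, gcc and clang warn whenever an enumerator has no matching case, so a newly added object class becomes a compile-time diagnostic instead of a silently ignored one.

	#include <stdio.h>

	typedef enum
	{
		OCLASS_CLASS,
		OCLASS_PROC,
		OCLASS_TYPE
	} ObjectClass;				/* toy stand-in for the real enum */

	static const char *
	oclass_name(ObjectClass oclass)
	{
		switch (oclass)
		{
			case OCLASS_CLASS:
				return "relation";
			case OCLASS_PROC:
				return "function";
			case OCLASS_TYPE:
				return "type";

				/*
				 * No default: adding an enumerator without a case here draws
				 * a -Wswitch warning at compile time.
				 */
		}
		return "???";			/* unreachable; silences -Wreturn-type */
	}

	int
	main(void)
	{
		printf("%s\n", oclass_name(OCLASS_PROC));
		return 0;
	}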
*/ - do_analyze_rel(onerel, options, params, va_cols, acquirefunc, relpages, - false, in_outer_xact, elevel); + if (onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + do_analyze_rel(onerel, options, params, va_cols, acquirefunc, + relpages, false, in_outer_xact, elevel); /* * If there are child tables, do recursive ANALYZE. @@ -345,9 +359,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, */ anl_context = AllocSetContextCreate(CurrentMemoryContext, "Analyze", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); caller_context = MemoryContextSwitchTo(anl_context); /* @@ -541,9 +553,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, col_context = AllocSetContextCreate(anl_context, "Analyze Column", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); old_context = MemoryContextSwitchTo(col_context); for (i = 0; i < attr_cnt; i++) @@ -599,6 +609,10 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, update_attstats(RelationGetRelid(Irel[ind]), false, thisdata->attr_cnt, thisdata->vacattrstats); } + + /* Build extended statistics (if there are any). */ + BuildRelationExtStatistics(onerel, totalrows, numrows, rows, attr_cnt, + vacattrstats); } /* @@ -731,9 +745,7 @@ compute_index_stats(Relation onerel, double totalrows, ind_context = AllocSetContextCreate(anl_context, "Analyze Index", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); old_context = MemoryContextSwitchTo(ind_context); for (ind = 0; ind < nindexes; ind++) @@ -744,7 +756,7 @@ compute_index_stats(Relation onerel, double totalrows, TupleTableSlot *slot; EState *estate; ExprContext *econtext; - List *predicate; + ExprState *predicate; Datum *exprvals; bool *exprnulls; int numindexrows, @@ -770,9 +782,7 @@ compute_index_stats(Relation onerel, double totalrows, econtext->ecxt_scantuple = slot; /* Set up execution state for predicate. */ - predicate = (List *) - ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, - estate); + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); /* Compute and save index expression values */ exprvals = (Datum *) palloc(numrows * attr_cnt * sizeof(Datum)); @@ -795,9 +805,9 @@ compute_index_stats(Relation onerel, double totalrows, ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); /* If index is partial, check predicate */ - if (predicate != NIL) + if (predicate != NULL) { - if (!ExecQual(predicate, econtext, false)) + if (!ExecQual(predicate, econtext)) continue; } numindexrows++; @@ -1041,7 +1051,7 @@ acquire_sample_rows(Relation onerel, int elevel, totalblocks = RelationGetNumberOfBlocks(onerel); /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestXmin(onerel, true); + OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM); /* Prepare for sampling block numbers */ BlockSampler_Init(&bs, totalblocks, targrows, random()); @@ -1308,6 +1318,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, nrels, i; ListCell *lc; + bool has_child; /* * Find all members of inheritance set. 
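Several hunks in this file (and more below) collapse the trio of AllocSetContextCreate size arguments into ALLOCSET_DEFAULT_SIZES. As far as I know, that macro in utils/memutils.h is plain shorthand for the same three constants, so the rewritten calls are behavior-neutral:

	/* assumed definition, per utils/memutils.h */
	#define ALLOCSET_DEFAULT_SIZES \
		ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE

Sibling macros (e.g. ALLOCSET_SMALL_SIZES) cover the other common configurations.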
We only need AccessShareLock on @@ -1345,6 +1356,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, relblocks = (double *) palloc(list_length(tableOIDs) * sizeof(double)); totalblocks = 0; nrels = 0; + has_child = false; foreach(lc, tableOIDs) { Oid childOID = lfirst_oid(lc); @@ -1398,13 +1410,20 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, } else { - /* ignore, but release the lock on it */ - Assert(childrel != onerel); - heap_close(childrel, AccessShareLock); + /* + * ignore, but release the lock on it. don't try to unlock the + * passed-in relation + */ + Assert(childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + if (childrel != onerel) + heap_close(childrel, AccessShareLock); + else + heap_close(childrel, NoLock); continue; } /* OK, we'll process this child */ + has_child = true; rels[nrels] = childrel; acquirefuncs[nrels] = acquirefunc; relblocks[nrels] = (double) relpages; @@ -1413,9 +1432,10 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, } /* - * If we don't have at least two tables to consider, fail. + * If we don't have at least one child table to consider, fail. If the + * relation is a partitioned table, it's not counted as a child table. */ - if (nrels < 2) + if (!has_child) { ereport(elevel, (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no analyzable child tables", @@ -1636,18 +1656,15 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) nulls, replaces); ReleaseSysCache(oldtup); - simple_heap_update(sd, &stup->t_self, stup); + CatalogTupleUpdate(sd, &stup->t_self, stup); } else { /* No, insert new tuple */ stup = heap_form_tuple(RelationGetDescr(sd), values, nulls); - simple_heap_insert(sd, stup); + CatalogTupleInsert(sd, stup); } - /* update indexes too */ - CatalogUpdateIndexes(sd, stup); - heap_freetuple(stup); } @@ -1717,19 +1734,6 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) */ typedef struct { - Oid eqopr; /* '=' operator for datatype, if any */ - Oid eqfunc; /* and associated function */ - Oid ltopr; /* '<' operator for datatype, if any */ -} StdAnalyzeData; - -typedef struct -{ - Datum value; /* a data value */ - int tupno; /* position index for tuple it came from */ -} ScalarItem; - -typedef struct -{ int count; /* # of duplicates */ int first; /* values[] index of first occurrence */ } ScalarMCVItem; diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 716f1c3318..87b215d8d3 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -3,7 +3,7 @@ * async.c * Asynchronous notification: NOTIFY, LISTEN, UNLISTEN * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -1636,7 +1636,7 @@ AtSubCommit_Notify(void) List *parentPendingActions; List *parentPendingNotifies; - parentPendingActions = (List *) linitial(upperPendingActions); + parentPendingActions = linitial_node(List, upperPendingActions); upperPendingActions = list_delete_first(upperPendingActions); Assert(list_length(upperPendingActions) == @@ -1647,7 +1647,7 @@ AtSubCommit_Notify(void) */ pendingActions = list_concat(parentPendingActions, pendingActions); - parentPendingNotifies = (List *) linitial(upperPendingNotifies); + parentPendingNotifies = linitial_node(List, upperPendingNotifies); upperPendingNotifies = 
list_delete_first(upperPendingNotifies); Assert(list_length(upperPendingNotifies) == @@ -1679,13 +1679,13 @@ AtSubAbort_Notify(void) */ while (list_length(upperPendingActions) > my_level - 2) { - pendingActions = (List *) linitial(upperPendingActions); + pendingActions = linitial_node(List, upperPendingActions); upperPendingActions = list_delete_first(upperPendingActions); } while (list_length(upperPendingNotifies) > my_level - 2) { - pendingNotifies = (List *) linitial(upperPendingNotifies); + pendingNotifies = linitial_node(List, upperPendingNotifies); upperPendingNotifies = list_delete_first(upperPendingNotifies); } } diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index ca59799248..9a08a07319 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -6,7 +6,7 @@ * There is hardly anything left of Paul Brown's original implementation... * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * * @@ -205,9 +205,7 @@ cluster(ClusterStmt *stmt, bool isTopLevel) */ cluster_context = AllocSetContextCreate(PortalContext, "Cluster", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); /* * Build the list of relations to cluster. Note that this lives in @@ -526,8 +524,7 @@ mark_index_clustered(Relation rel, Oid indexOid, bool is_internal) if (indexForm->indisclustered) { indexForm->indisclustered = false; - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); } else if (thisIndexOid == indexOid) { @@ -535,8 +532,7 @@ mark_index_clustered(Relation rel, Oid indexOid, bool is_internal) if (!IndexIsValid(indexForm)) elog(ERROR, "cannot cluster on invalid index %u", indexOid); indexForm->indisclustered = true; - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); } InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0, @@ -562,6 +558,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) Oid tableOid = RelationGetRelid(OldHeap); Oid tableSpace = OldHeap->rd_rel->reltablespace; Oid OIDNewHeap; + char relpersistence; bool is_system_catalog; bool swap_toast_by_content; TransactionId frozenXid; @@ -571,7 +568,8 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) if (OidIsValid(indexOid)) mark_index_clustered(OldHeap, indexOid, true); - /* Remember if it's a system catalog */ + /* Remember info about rel before closing OldHeap */ + relpersistence = OldHeap->rd_rel->relpersistence; is_system_catalog = IsSystemRelation(OldHeap); /* Close relcache entry, but keep lock until transaction commit */ @@ -579,7 +577,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) /* Create the transient table that will receive the re-ordered data */ OIDNewHeap = make_new_heap(tableOid, tableSpace, - OldHeap->rd_rel->relpersistence, + relpersistence, AccessExclusiveLock); /* Copy the heap data into the new table in the desired order */ @@ -593,7 +591,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, swap_toast_by_content, false, true, frozenXid, cutoffMulti, - 
OldHeap->rd_rel->relpersistence); + relpersistence); } @@ -1060,11 +1058,10 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, for (;;) { HeapTuple tuple; - bool shouldfree; CHECK_FOR_INTERRUPTS(); - tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree); + tuple = tuplesort_getheaptuple(tuplesort, true); if (tuple == NULL) break; @@ -1072,9 +1069,6 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, oldTupDesc, newTupDesc, values, isnull, NewHeap->rd_rel->relhasoids, rwstate); - - if (shouldfree) - heap_freetuple(tuple); } tuplesort_end(tuplesort); @@ -1150,7 +1144,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, relfilenode2; Oid swaptemp; char swptmpchr; - CatalogIndexState indstate; /* We need writable copies of both pg_class tuples. */ relRelation = heap_open(RelationRelationId, RowExclusiveLock); @@ -1301,13 +1294,13 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, */ if (!target_is_pg_class) { - simple_heap_update(relRelation, &reltup1->t_self, reltup1); - simple_heap_update(relRelation, &reltup2->t_self, reltup2); + CatalogIndexState indstate; - /* Keep system catalogs current */ indstate = CatalogOpenIndexes(relRelation); - CatalogIndexInsert(indstate, reltup1); - CatalogIndexInsert(indstate, reltup2); + CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1, + indstate); + CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2, + indstate); CatalogCloseIndexes(indstate); } else @@ -1572,8 +1565,7 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, relform->relfrozenxid = frozenXid; relform->relminmxid = cutoffMulti; - simple_heap_update(relRelation, &reltup->t_self, reltup); - CatalogUpdateIndexes(relRelation, reltup); + CatalogTupleUpdate(relRelation, &reltup->t_self, reltup); heap_close(relRelation, RowExclusiveLock); } diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index e4ebb654a6..110fb7ef65 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -3,7 +3,7 @@ * collationcmds.c * collation-related commands support code * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,10 +20,12 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/namespace.h" +#include "catalog/objectaccess.h" #include "catalog/pg_collation.h" #include "catalog/pg_collation_fn.h" #include "commands/alter.h" #include "commands/collationcmds.h" +#include "commands/comment.h" #include "commands/dbcommands.h" #include "commands/defrem.h" #include "mb/pg_wchar.h" @@ -34,11 +36,12 @@ #include "utils/rel.h" #include "utils/syscache.h" + /* * CREATE COLLATION */ ObjectAddress -DefineCollation(List *names, List *parameters) +DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_exists) { char *collName; Oid collNamespace; @@ -48,8 +51,14 @@ DefineCollation(List *names, List *parameters) DefElem *localeEl = NULL; DefElem *lccollateEl = NULL; DefElem *lcctypeEl = NULL; + DefElem *providerEl = NULL; + DefElem *versionEl = NULL; char *collcollate = NULL; char *collctype = NULL; + char *collproviderstr = NULL; + int collencoding; + char collprovider = 0; + char *collversion = NULL; Oid newoid; ObjectAddress address; @@ -62,7 +71,7 @@ DefineCollation(List *names, List *parameters) foreach(pl, 
parameters) { - DefElem *defel = (DefElem *) lfirst(pl); + DefElem *defel = lfirst_node(DefElem, pl); DefElem **defelp; if (pg_strcasecmp(defel->defname, "from") == 0) @@ -73,12 +82,17 @@ DefineCollation(List *names, List *parameters) defelp = &lccollateEl; else if (pg_strcasecmp(defel->defname, "lc_ctype") == 0) defelp = &lcctypeEl; + else if (pg_strcasecmp(defel->defname, "provider") == 0) + defelp = &providerEl; + else if (pg_strcasecmp(defel->defname, "version") == 0) + defelp = &versionEl; else { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("collation attribute \"%s\" not recognized", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); break; } @@ -103,6 +117,7 @@ DefineCollation(List *names, List *parameters) collcollate = pstrdup(NameStr(((Form_pg_collation) GETSTRUCT(tp))->collcollate)); collctype = pstrdup(NameStr(((Form_pg_collation) GETSTRUCT(tp))->collctype)); + collprovider = ((Form_pg_collation) GETSTRUCT(tp))->collprovider; ReleaseSysCache(tp); } @@ -119,6 +134,27 @@ DefineCollation(List *names, List *parameters) if (lcctypeEl) collctype = defGetString(lcctypeEl); + if (providerEl) + collproviderstr = defGetString(providerEl); + + if (versionEl) + collversion = defGetString(versionEl); + + if (collproviderstr) + { + if (pg_strcasecmp(collproviderstr, "icu") == 0) + collprovider = COLLPROVIDER_ICU; + else if (pg_strcasecmp(collproviderstr, "libc") == 0) + collprovider = COLLPROVIDER_LIBC; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("unrecognized collation provider: %s", + collproviderstr))); + } + else if (!fromEl) + collprovider = COLLPROVIDER_LIBC; + if (!collcollate) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -129,14 +165,29 @@ DefineCollation(List *names, List *parameters) (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("parameter \"lc_ctype\" must be specified"))); - check_encoding_locale_matches(GetDatabaseEncoding(), collcollate, collctype); + if (collprovider == COLLPROVIDER_ICU) + collencoding = -1; + else + { + collencoding = GetDatabaseEncoding(); + check_encoding_locale_matches(collencoding, collcollate, collctype); + } + + if (!collversion) + collversion = get_collation_actual_version(collprovider, collcollate); newoid = CollationCreate(collName, collNamespace, GetUserId(), - GetDatabaseEncoding(), + collprovider, + collencoding, collcollate, - collctype); + collctype, + collversion, + if_not_exists); + + if (!OidIsValid(newoid)) + return InvalidObjectAddress; ObjectAddressSet(address, CollationRelationId, newoid); @@ -177,3 +228,390 @@ IsThereCollationInNamespace(const char *collname, Oid nspOid) errmsg("collation \"%s\" already exists in schema \"%s\"", collname, get_namespace_name(nspOid)))); } + +/* + * ALTER COLLATION + */ +ObjectAddress +AlterCollation(AlterCollationStmt *stmt) +{ + Relation rel; + Oid collOid; + HeapTuple tup; + Form_pg_collation collForm; + Datum collversion; + bool isnull; + char *oldversion; + char *newversion; + ObjectAddress address; + + rel = heap_open(CollationRelationId, RowExclusiveLock); + collOid = get_collation_oid(stmt->collname, false); + + if (!pg_collation_ownercheck(collOid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_COLLATION, + NameListToString(stmt->collname)); + + tup = SearchSysCacheCopy1(COLLOID, ObjectIdGetDatum(collOid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for collation %u", collOid); + + collForm = (Form_pg_collation) GETSTRUCT(tup); + collversion = 
SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collversion, + &isnull); + oldversion = isnull ? NULL : TextDatumGetCString(collversion); + + newversion = get_collation_actual_version(collForm->collprovider, NameStr(collForm->collcollate)); + + /* cannot change from NULL to non-NULL or vice versa */ + if ((!oldversion && newversion) || (oldversion && !newversion)) + elog(ERROR, "invalid collation version change"); + else if (oldversion && newversion && strcmp(newversion, oldversion) != 0) + { + bool nulls[Natts_pg_collation]; + bool replaces[Natts_pg_collation]; + Datum values[Natts_pg_collation]; + + ereport(NOTICE, + (errmsg("changing version from %s to %s", + oldversion, newversion))); + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + values[Anum_pg_collation_collversion - 1] = CStringGetTextDatum(newversion); + replaces[Anum_pg_collation_collversion - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), + values, nulls, replaces); + } + else + ereport(NOTICE, + (errmsg("version has not changed"))); + + CatalogTupleUpdate(rel, &tup->t_self, tup); + + InvokeObjectPostAlterHook(CollationRelationId, collOid, 0); + + ObjectAddressSet(address, CollationRelationId, collOid); + + heap_freetuple(tup); + heap_close(rel, NoLock); + + return address; +} + + +Datum +pg_collation_actual_version(PG_FUNCTION_ARGS) +{ + Oid collid = PG_GETARG_OID(0); + HeapTuple tp; + char *collcollate; + char collprovider; + char *version; + + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); + if (!HeapTupleIsValid(tp)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("collation with OID %u does not exist", collid))); + + collcollate = pstrdup(NameStr(((Form_pg_collation) GETSTRUCT(tp))->collcollate)); + collprovider = ((Form_pg_collation) GETSTRUCT(tp))->collprovider; + + ReleaseSysCache(tp); + + version = get_collation_actual_version(collprovider, collcollate); + + if (version) + PG_RETURN_TEXT_P(cstring_to_text(version)); + else + PG_RETURN_NULL(); +} + + +/* + * "Normalize" a libc locale name, stripping off encoding tags such as + * ".utf8" (e.g., "en_US.utf8" -> "en_US", but "br_FR.iso885915@euro" + * -> "br_FR@euro"). Return true if a new, different name was + * generated. 
+ */ +pg_attribute_unused() +static bool +normalize_libc_locale_name(char *new, const char *old) +{ + char *n = new; + const char *o = old; + bool changed = false; + + while (*o) + { + if (*o == '.') + { + /* skip over encoding tag such as ".utf8" or ".UTF-8" */ + o++; + while ((*o >= 'A' && *o <= 'Z') + || (*o >= 'a' && *o <= 'z') + || (*o >= '0' && *o <= '9') + || (*o == '-')) + o++; + changed = true; + } + else + *n++ = *o++; + } + *n = '\0'; + + return changed; +} + + +#ifdef USE_ICU +static char * +get_icu_language_tag(const char *localename) +{ + char buf[ULOC_FULLNAME_CAPACITY]; + UErrorCode status; + + status = U_ZERO_ERROR; + uloc_toLanguageTag(localename, buf, sizeof(buf), TRUE, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not convert locale name \"%s\" to language tag: %s", + localename, u_errorName(status)))); + + return pstrdup(buf); +} + + +static char * +get_icu_locale_comment(const char *localename) +{ + UErrorCode status; + UChar displayname[128]; + int32 len_uchar; + char *result; + + status = U_ZERO_ERROR; + len_uchar = uloc_getDisplayName(localename, "en", &displayname[0], sizeof(displayname) / sizeof(displayname[0]), &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not get display name for locale \"%s\": %s", + localename, u_errorName(status)))); + + icu_from_uchar(&result, displayname, len_uchar); + + return result; +} +#endif /* USE_ICU */ + + +Datum +pg_import_system_collations(PG_FUNCTION_ARGS) +{ +#if defined(HAVE_LOCALE_T) && !defined(WIN32) + bool if_not_exists = PG_GETARG_BOOL(0); + Oid nspid = PG_GETARG_OID(1); + + FILE *locale_a_handle; + char localebuf[NAMEDATALEN]; /* we assume ASCII so this is fine */ + int count = 0; + List *aliaslist = NIL; + List *localelist = NIL; + List *enclist = NIL; + ListCell *lca, + *lcl, + *lce; +#endif + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to import system collations")))); + +#if defined(HAVE_LOCALE_T) && !defined(WIN32) + locale_a_handle = OpenPipeStream("locale -a", "r"); + if (locale_a_handle == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not execute command \"%s\": %m", + "locale -a"))); + + while (fgets(localebuf, sizeof(localebuf), locale_a_handle)) + { + int i; + size_t len; + int enc; + bool skip; + char alias[NAMEDATALEN]; + + len = strlen(localebuf); + + if (len == 0 || localebuf[len - 1] != '\n') + { + elog(DEBUG1, "locale name too long, skipped: \"%s\"", localebuf); + continue; + } + localebuf[len - 1] = '\0'; + + /* + * Some systems have locale names that don't consist entirely of ASCII + * letters (such as "bokmål" or "français"). This is + * pretty silly, since we need the locale itself to interpret the + * non-ASCII characters. We can't do much with those, so we filter + * them out.
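As a quick check on the normalize_libc_locale_name helper above, here is a hypothetical standalone harness (not part of the patch) that reuses the same stripping loop and exercises the two cases from the function's comment:

	#include <stdbool.h>
	#include <stdio.h>

	static bool
	normalize(char *dst, const char *src)
	{
		char	   *n = dst;
		const char *o = src;
		bool		changed = false;

		while (*o)
		{
			if (*o == '.')
			{
				/* skip over encoding tag such as ".utf8" or ".UTF-8" */
				o++;
				while ((*o >= 'A' && *o <= 'Z') || (*o >= 'a' && *o <= 'z') ||
					   (*o >= '0' && *o <= '9') || (*o == '-'))
					o++;
				changed = true;
			}
			else
				*n++ = *o++;
		}
		*n = '\0';
		return changed;
	}

	int
	main(void)
	{
		char		buf[64];

		normalize(buf, "en_US.utf8");
		printf("%s\n", buf);		/* prints "en_US" */
		normalize(buf, "br_FR.iso885915@euro");
		printf("%s\n", buf);		/* prints "br_FR@euro" */
		return 0;
	}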
+ */ + skip = false; + for (i = 0; i < len; i++) + { + if (IS_HIGHBIT_SET(localebuf[i])) + { + skip = true; + break; + } + } + if (skip) + { + elog(DEBUG1, "locale name has non-ASCII characters, skipped: \"%s\"", localebuf); + continue; + } + + enc = pg_get_encoding_from_locale(localebuf, false); + if (enc < 0) + { + /* error message printed by pg_get_encoding_from_locale() */ + continue; + } + if (!PG_VALID_BE_ENCODING(enc)) + continue; /* ignore locales for client-only encodings */ + if (enc == PG_SQL_ASCII) + continue; /* C/POSIX are already in the catalog */ + + count++; + + CollationCreate(localebuf, nspid, GetUserId(), COLLPROVIDER_LIBC, enc, + localebuf, localebuf, + get_collation_actual_version(COLLPROVIDER_LIBC, localebuf), + if_not_exists); + + CommandCounterIncrement(); + + /* + * Generate aliases such as "en_US" in addition to "en_US.utf8" for + * ease of use. Note that collation names are unique per encoding + * only, so this doesn't clash with "en_US" for LATIN1, say. + * + * However, it might conflict with a name we'll see later in the + * "locale -a" output. So save up the aliases and try to add them + * after we've read all the output. + */ + if (normalize_libc_locale_name(alias, localebuf)) + { + aliaslist = lappend(aliaslist, pstrdup(alias)); + localelist = lappend(localelist, pstrdup(localebuf)); + enclist = lappend_int(enclist, enc); + } + } + + ClosePipeStream(locale_a_handle); + + /* Now try to add any aliases we created */ + forthree(lca, aliaslist, lcl, localelist, lce, enclist) + { + char *alias = (char *) lfirst(lca); + char *locale = (char *) lfirst(lcl); + int enc = lfirst_int(lce); + + CollationCreate(alias, nspid, GetUserId(), COLLPROVIDER_LIBC, enc, + locale, locale, + get_collation_actual_version(COLLPROVIDER_LIBC, locale), + true); + CommandCounterIncrement(); + } + + if (count == 0) + ereport(WARNING, + (errmsg("no usable system locales were found"))); +#endif /* not HAVE_LOCALE_T && not WIN32 */ + +#ifdef USE_ICU + if (!is_encoding_supported_by_icu(GetDatabaseEncoding())) + { + ereport(NOTICE, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("encoding \"%s\" not supported by ICU", + pg_encoding_to_char(GetDatabaseEncoding())))); + } + else + { + int i; + + /* + * Start the loop at -1 to sneak in the root locale without too much + * code duplication. + */ + for (i = -1; i < ucol_countAvailable(); i++) + { + const char *name; + char *langtag; + const char *collcollate; + UEnumeration *en; + UErrorCode status; + const char *val; + Oid collid; + + if (i == -1) + name = ""; /* ICU root locale */ + else + name = ucol_getAvailable(i); + + langtag = get_icu_language_tag(name); + collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name; + collid = CollationCreate(psprintf("%s-x-icu", langtag), + nspid, GetUserId(), COLLPROVIDER_ICU, -1, + collcollate, collcollate, + get_collation_actual_version(COLLPROVIDER_ICU, collcollate), + if_not_exists); + + CreateComments(collid, CollationRelationId, 0, + get_icu_locale_comment(name)); + + /* + * Add keyword variants + */ + status = U_ZERO_ERROR; + en = ucol_getKeywordValuesForLocale("collation", name, TRUE, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not get keyword values for locale \"%s\": %s", + name, u_errorName(status)))); + + status = U_ZERO_ERROR; + uenum_reset(en, &status); + while ((val = uenum_next(en, NULL, &status))) + { + char *localeid = psprintf("%s@collation=%s", name, val); + + langtag = get_icu_language_tag(localeid); + collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? 
langtag : localeid; + collid = CollationCreate(psprintf("%s-x-icu", langtag), + nspid, GetUserId(), COLLPROVIDER_ICU, -1, + collcollate, collcollate, + get_collation_actual_version(COLLPROVIDER_ICU, collcollate), + if_not_exists); + CreateComments(collid, CollationRelationId, 0, + get_icu_locale_comment(localeid)); + } + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not get keyword values for locale \"%s\": %s", + name, u_errorName(status)))); + uenum_close(en); + } + } +#endif + + PG_RETURN_VOID(); +} diff --git a/src/backend/commands/comment.c b/src/backend/commands/comment.c index f45da2d914..236b582f7c 100644 --- a/src/backend/commands/comment.c +++ b/src/backend/commands/comment.c @@ -5,7 +5,7 @@ * PostgreSQL object comments utility code. * * Portions Copyright (c) 2010-2012 Postgres-XC Development Group - * Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Copyright (c) 1996-2017, PostgreSQL Global Development Group * * IDENTIFICATION * src/backend/commands/comment.c @@ -49,12 +49,11 @@ CommentObject(CommentStmt *stmt) * (which is really pg_restore's fault, but for now we will work around * the problem here). Consensus is that the best fix is to treat wrong * database name as a WARNING not an ERROR; hence, the following special - * case. (If the length of stmt->objname is not 1, get_object_address - * will throw an error below; that's OK.) + * case. */ - if (stmt->objtype == OBJECT_DATABASE && list_length(stmt->objname) == 1) + if (stmt->objtype == OBJECT_DATABASE) { - char *database = strVal(linitial(stmt->objname)); + char *database = strVal((Value *) stmt->object); if (!OidIsValid(get_database_oid(database, true))) { @@ -71,12 +70,12 @@ CommentObject(CommentStmt *stmt) * does not exist, and will also acquire a lock on the target to guard * against concurrent DROP operations. */ - address = get_object_address(stmt->objtype, stmt->objname, stmt->objargs, + address = get_object_address(stmt->objtype, stmt->object, &relation, ShareUpdateExclusiveLock, false); /* Require ownership of the target object. */ check_object_ownership(GetUserId(), stmt->objtype, address, - stmt->objname, stmt->objargs, relation); + stmt->object, relation); /* Perform other integrity checks as needed. 
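Stepping back to the pg_import_system_collations hunk above: every ICU collation is registered under a "<langtag>-x-icu" name. The "-x-" segment is a BCP 47 private-use extension, so, on my reading of the scheme, the generated names cannot collide with libc locale names such as "en_US.utf8". A trivial standalone illustration with a hypothetical tag:

	#include <stdio.h>

	int
	main(void)
	{
		const char *langtag = "de-AT";	/* hypothetical ICU language tag */
		char		collname[64];

		snprintf(collname, sizeof(collname), "%s-x-icu", langtag);
		printf("%s\n", collname);	/* prints "de-AT-x-icu" */
		return 0;
	}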
*/ switch (stmt->objtype) @@ -96,7 +95,8 @@ CommentObject(CommentStmt *stmt) relation->rd_rel->relkind != RELKIND_VIEW && relation->rd_rel->relkind != RELKIND_MATVIEW && relation->rd_rel->relkind != RELKIND_COMPOSITE_TYPE && - relation->rd_rel->relkind != RELKIND_FOREIGN_TABLE) + relation->rd_rel->relkind != RELKIND_FOREIGN_TABLE && + relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table, view, materialized view, composite type, or foreign table", @@ -195,12 +195,12 @@ CreateComments(Oid oid, Oid classoid, int32 subid, char *comment) /* Found the old tuple, so delete or update it */ if (comment == NULL) - simple_heap_delete(description, &oldtuple->t_self); + CatalogTupleDelete(description, &oldtuple->t_self); else { newtuple = heap_modify_tuple(oldtuple, RelationGetDescr(description), values, nulls, replaces); - simple_heap_update(description, &oldtuple->t_self, newtuple); + CatalogTupleUpdate(description, &oldtuple->t_self, newtuple); } break; /* Assume there can be only one match */ @@ -214,15 +214,11 @@ CreateComments(Oid oid, Oid classoid, int32 subid, char *comment) { newtuple = heap_form_tuple(RelationGetDescr(description), values, nulls); - simple_heap_insert(description, newtuple); + CatalogTupleInsert(description, newtuple); } - /* Update indexes, if necessary */ if (newtuple != NULL) - { - CatalogUpdateIndexes(description, newtuple); heap_freetuple(newtuple); - } /* Done */ @@ -289,12 +285,12 @@ CreateSharedComments(Oid oid, Oid classoid, char *comment) /* Found the old tuple, so delete or update it */ if (comment == NULL) - simple_heap_delete(shdescription, &oldtuple->t_self); + CatalogTupleDelete(shdescription, &oldtuple->t_self); else { newtuple = heap_modify_tuple(oldtuple, RelationGetDescr(shdescription), values, nulls, replaces); - simple_heap_update(shdescription, &oldtuple->t_self, newtuple); + CatalogTupleUpdate(shdescription, &oldtuple->t_self, newtuple); } break; /* Assume there can be only one match */ @@ -308,15 +304,11 @@ CreateSharedComments(Oid oid, Oid classoid, char *comment) { newtuple = heap_form_tuple(RelationGetDescr(shdescription), values, nulls); - simple_heap_insert(shdescription, newtuple); + CatalogTupleInsert(shdescription, newtuple); } - /* Update indexes, if necessary */ if (newtuple != NULL) - { - CatalogUpdateIndexes(shdescription, newtuple); heap_freetuple(newtuple); - } /* Done */ @@ -367,7 +359,7 @@ DeleteComments(Oid oid, Oid classoid, int32 subid) NULL, nkeys, skey); while ((oldtuple = systable_getnext(sd)) != NULL) - simple_heap_delete(description, &oldtuple->t_self); + CatalogTupleDelete(description, &oldtuple->t_self); /* Done */ @@ -403,7 +395,7 @@ DeleteSharedComments(Oid oid, Oid classoid) NULL, 2, skey); while ((oldtuple = systable_getnext(sd)) != NULL) - simple_heap_delete(shdescription, &oldtuple->t_self); + CatalogTupleDelete(shdescription, &oldtuple->t_self); /* Done */ diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 26f9114f55..e2544e51ed 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -3,7 +3,7 @@ * constraint.c * PostgreSQL CONSTRAINT support code. 
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -37,7 +37,7 @@ Datum unique_key_recheck(PG_FUNCTION_ARGS) { - TriggerData *trigdata = (TriggerData *) fcinfo->context; + TriggerData *trigdata = castNode(TriggerData, fcinfo->context); const char *funcname = "unique_key_recheck"; HeapTuple new_row; ItemPointerData tmptid; @@ -165,7 +165,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) * index will know about. */ index_insert(indexRel, values, isnull, &(new_row->t_self), - trigdata->tg_relation, UNIQUE_CHECK_EXISTING); + trigdata->tg_relation, UNIQUE_CHECK_EXISTING, + indexInfo); } else { diff --git a/src/backend/commands/conversioncmds.c b/src/backend/commands/conversioncmds.c index 175d4ab685..9861d3df22 100644 --- a/src/backend/commands/conversioncmds.c +++ b/src/backend/commands/conversioncmds.c @@ -3,7 +3,7 @@ * conversioncmds.c * conversion creation command support code * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,7 +14,6 @@ */ #include "postgres.h" -#include "access/heapam.h" #include "access/htup_details.h" #include "catalog/dependency.h" #include "catalog/indexing.h" diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 461e94ed0b..5d5e409c7d 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -4,7 +4,7 @@ * Implements the COPY utility command * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -56,6 +56,7 @@ #endif #include "nodes/makefuncs.h" #include "optimizer/pgxcship.h" +#include "parser/parse_relation.h" #include "rewrite/rewriteHandler.h" #include "storage/fd.h" #include "tcop/tcopprot.h" @@ -79,10 +80,11 @@ typedef enum CopyDest { COPY_FILE, /* to/from file (or a piped program) */ COPY_OLD_FE, /* to/from frontend (2.0 protocol) */ - COPY_NEW_FE /* to/from frontend (3.0 protocol) */ + COPY_NEW_FE, /* to/from frontend (3.0 protocol) */ #ifdef PGXC - ,COPY_BUFFER /* Do not send, just prepare */ + COPY_BUFFER, /* Do not send, just prepare */ #endif + COPY_CALLBACK /* to/from callback function */ } CopyDest; /* @@ -131,6 +133,7 @@ typedef struct CopyStateData List *attnumlist; /* integer list of attnums to copy */ char *filename; /* filename, or NULL for STDIN/STDOUT */ bool is_program; /* is 'filename' a program to popen? */ + copy_data_source_cb data_source_cb; /* function for reading data */ bool binary; /* binary format? */ bool oids; /* include OIDs? */ bool freeze; /* freeze rows on loading? */ @@ -184,6 +187,13 @@ typedef struct CopyStateData bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; + PartitionDispatch *partition_dispatch_info; + int num_dispatch; /* Number of entries in the above array */ + int num_partitions; /* Number of members in the following arrays */ + ResultRelInfo *partitions; /* Per partition result relation */ + TupleConversionMap **partition_tupconv_maps; + TupleTableSlot *partition_tuple_slot; + /* * These variables are used to reduce overhead in textual COPY FROM. 
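A pattern running through this whole section: bare casts such as (DefElem *) lfirst(pl) and (TriggerData *) fcinfo->context give way to the typed accessors lfirst_node, linitial_node, and castNode. A rough sketch of the idea, assuming the conventional IsA node-tag check; the real macros in nodes/nodes.h and nodes/pg_list.h differ in detail:

	/* sketch: cast plus a node-tag assertion in assert-enabled builds */
	#define castNode(_type_, nodeptr) \
		(AssertMacro((nodeptr) == NULL || IsA((nodeptr), _type_)), \
		 (_type_ *) (nodeptr))

	#define lfirst_node(_type_, lc)		castNode(_type_, lfirst(lc))
	#define linitial_node(_type_, l)	castNode(_type_, linitial(l))

The payoff is that a list cell holding an unexpected node type fails an assertion at the access site rather than being silently misinterpreted.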
* @@ -305,20 +315,19 @@ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; /* non-export function prototypes */ -static CopyState BeginCopy(bool is_from, Relation rel, Node *raw_query, - const char *queryString, const Oid queryRelId, List *attnamelist, +static CopyState BeginCopy(ParseState *pstate, bool is_from, Relation rel, + RawStmt *raw_query, Oid queryRelId, List *attnamelist, List *options); static void EndCopy(CopyState cstate); static void ClosePipeToProgram(CopyState cstate); -static CopyState BeginCopyTo(Relation rel, Node *query, const char *queryString, - const Oid queryRelId, const char *filename, bool is_program, +static CopyState BeginCopyTo(ParseState *pstate, Relation rel, RawStmt *query, + Oid queryRelId, const char *filename, bool is_program, List *attnamelist, List *options); static void EndCopyTo(CopyState cstate); static uint64 DoCopyTo(CopyState cstate); static uint64 CopyTo(CopyState cstate); static void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum *values, bool *nulls); -static uint64 CopyFrom(CopyState cstate); static void CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, int hi_options, ResultRelInfo *resultRelInfo, TupleTableSlot *myslot, @@ -384,7 +393,7 @@ SendCopyBegin(CopyState cstate) pq_endmessage(&buf); cstate->copy_dest = COPY_NEW_FE; } - else if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 2) + else { /* old way */ if (cstate->binary) @@ -396,18 +405,6 @@ SendCopyBegin(CopyState cstate) pq_startcopyout(); cstate->copy_dest = COPY_OLD_FE; } - else - { - /* very old way */ - if (cstate->binary) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("COPY BINARY is not supported to stdout or from stdin"))); - pq_putemptymessage('B'); - /* grottiness needed for old COPY OUT protocol */ - pq_startcopyout(); - cstate->copy_dest = COPY_OLD_FE; - } } static void @@ -430,7 +427,7 @@ ReceiveCopyBegin(CopyState cstate) cstate->copy_dest = COPY_NEW_FE; cstate->fe_msgbuf = makeStringInfo(); } - else if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 2) + else { /* old way */ if (cstate->binary) @@ -442,18 +439,6 @@ ReceiveCopyBegin(CopyState cstate) pq_startmsgread(); cstate->copy_dest = COPY_OLD_FE; } - else - { - /* very old way */ - if (cstate->binary) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("COPY BINARY is not supported to stdout or from stdin"))); - pq_putemptymessage('D'); - /* any error in old protocol will make us lose sync */ - pq_startmsgread(); - cstate->copy_dest = COPY_OLD_FE; - } /* We *must* flush here to ensure FE knows it can send. */ pq_flush(); } @@ -582,6 +567,9 @@ CopySendEndOfRow(CopyState cstate) /* Do not send yet anywhere, just return */ return; #endif + case COPY_CALLBACK: + Assert(false); /* Not yet supported. */ + break; } resetStringInfo(fe_msgbuf); @@ -701,6 +689,9 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread) elog(ERROR, "COPY_BUFFER not allowed in this context"); break; #endif + case COPY_CALLBACK: + bytesread = cstate->data_source_cb(databuf, minread, maxread); + break; } return bytesread; @@ -826,16 +817,17 @@ CopyLoadRawBuf(CopyState cstate) * Do not allow the copy if user doesn't have proper permission to access * the table or the specifically requested columns. 
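The new COPY_CALLBACK destination and the data_source_cb field let COPY FROM pull its input from an arbitrary producer rather than a file or the frontend; the CopyGetData hunk below shows the callback being handed a buffer and byte bounds and returning how much it produced. A hypothetical callback serving rows from an in-memory buffer might look like this (the signature is my reading of copy.h's copy_data_source_cb typedef; the toy version ignores minread):

	#include <string.h>

	static const char *src_data = "1\tone\n2\ttwo\n";
	static int	src_pos = 0;

	/* hypothetical copy_data_source_cb implementation */
	static int
	buffer_source_cb(void *outbuf, int minread, int maxread)
	{
		int			avail = (int) strlen(src_data) - src_pos;
		int			n = (avail < maxread) ? avail : maxread;

		memcpy(outbuf, src_data + src_pos, n);
		src_pos += n;
		return n;				/* 0 signals end of data */
	}

Presumably the NULL argument added to the BeginCopyFrom call in the DoCopy hunk above is this callback slot, left empty for ordinary file/frontend copies.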
*/ -Oid -DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) +void +DoCopy(ParseState *pstate, const CopyStmt *stmt, + int stmt_location, int stmt_len, + uint64 *processed) { CopyState cstate; bool is_from = stmt->is_from; bool pipe = (stmt->filename == NULL); Relation rel; Oid relid; - Node *query = NULL; - List *range_table = NIL; + RawStmt *query = NULL; /* Disallow COPY to/from file or program except to superusers. */ if (!pipe && !superuser()) @@ -857,7 +849,6 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) if (stmt->relation) { TupleDesc tupDesc; - AclMode required_access = (is_from ? ACL_INSERT : ACL_SELECT); List *attnums; ListCell *cur; RangeTblEntry *rte; @@ -870,12 +861,8 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) relid = RelationGetRelid(rel); - rte = makeNode(RangeTblEntry); - rte->rtekind = RTE_RELATION; - rte->relid = RelationGetRelid(rel); - rte->relkind = rel->rd_rel->relkind; - rte->requiredPerms = required_access; - range_table = list_make1(rte); + rte = addRangeTableEntryForRelation(pstate, rel, NULL, false, false); + rte->requiredPerms = (is_from ? ACL_INSERT : ACL_SELECT); tupDesc = RelationGetDescr(rel); attnums = CopyGetAttnums(tupDesc, rel, stmt->attlist); @@ -889,7 +876,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) else rte->selectedCols = bms_add_member(rte->selectedCols, attno); } - ExecCheckRTPerms(range_table, true); + ExecCheckRTPerms(pstate->p_rtable, true); /* * Permission check for row security policies. @@ -911,6 +898,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) ColumnRef *cr; ResTarget *target; RangeVar *from; + List *targetList = NIL; if (is_from) ereport(ERROR, @@ -918,35 +906,77 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) errmsg("COPY FROM not supported with row-level security"), errhint("Use INSERT statements instead."))); - /* Build target list */ - cr = makeNode(ColumnRef); - + /* + * Build target list + * + * If no columns are specified in the attribute list of the COPY + * command, then the target list is 'all' columns. Therefore, '*' + * should be used as the target list for the resulting SELECT + * statement. + * + * In the case that columns are specified in the attribute list, + * create a ColumnRef and ResTarget for each column and add them + * to the target list for the resulting SELECT statement. + */ if (!stmt->attlist) + { + cr = makeNode(ColumnRef); cr->fields = list_make1(makeNode(A_Star)); - else - cr->fields = stmt->attlist; + cr->location = -1; + + target = makeNode(ResTarget); + target->name = NULL; + target->indirection = NIL; + target->val = (Node *) cr; + target->location = -1; - cr->location = 1; + targetList = list_make1(target); + } + else + { + ListCell *lc; - target = makeNode(ResTarget); - target->name = NULL; - target->indirection = NIL; - target->val = (Node *) cr; - target->location = 1; + foreach(lc, stmt->attlist) + { + /* + * Build the ColumnRef for each column. The ColumnRef + * 'fields' property is a String 'Value' node (see + * nodes/value.h) that corresponds to the column name + * respectively. + */ + cr = makeNode(ColumnRef); + cr->fields = list_make1(lfirst(lc)); + cr->location = -1; + + /* Build the ResTarget and add the ColumnRef to it. 
*/ + target = makeNode(ResTarget); + target->name = NULL; + target->indirection = NIL; + target->val = (Node *) cr; + target->location = -1; + + /* Add each column to the SELECT statement's target list */ + targetList = lappend(targetList, target); + } + } /* * Build RangeVar for from clause, fully qualified based on the * relation which we have opened and locked. */ from = makeRangeVar(get_namespace_name(RelationGetNamespace(rel)), - RelationGetRelationName(rel), -1); + pstrdup(RelationGetRelationName(rel)), + -1); /* Build query */ select = makeNode(SelectStmt); - select->targetList = list_make1(target); + select->targetList = targetList; select->fromClause = list_make1(from); - query = (Node *) select; + query = makeNode(RawStmt); + query->stmt = (Node *) select; + query->stmt_location = stmt_location; + query->stmt_len = stmt_len; /* * Close the relation for now, but keep the lock on it to prevent @@ -962,7 +992,11 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) { Assert(stmt->query); - query = stmt->query; + query = makeNode(RawStmt); + query->stmt = stmt->query; + query->stmt_location = stmt_location; + query->stmt_len = stmt_len; + relid = InvalidOid; rel = NULL; } @@ -976,9 +1010,8 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) PreventCommandIfReadOnly("COPY FROM"); PreventCommandIfParallelMode("COPY FROM"); - cstate = BeginCopyFrom(rel, stmt->filename, stmt->is_program, - stmt->attlist, stmt->options); - cstate->range_table = range_table; + cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program, + NULL, stmt->attlist, stmt->options); *processed = CopyFrom(cstate); /* copy from file to database */ #ifdef XCP /* @@ -993,7 +1026,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) } else { - cstate = BeginCopyTo(rel, query, queryString, relid, + cstate = BeginCopyTo(pstate, rel, query, relid, stmt->filename, stmt->is_program, stmt->attlist, stmt->options); *processed = DoCopyTo(cstate); /* copy from database to file */ @@ -1007,8 +1040,6 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) */ if (rel != NULL) heap_close(rel, (is_from ? NoLock : AccessShareLock)); - - return relid; } /* @@ -1029,7 +1060,8 @@ DoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) * self-consistency of the options list. 
*/ void -ProcessCopyOptions(CopyState cstate, +ProcessCopyOptions(ParseState *pstate, + CopyState cstate, bool is_from, List *options) { @@ -1045,7 +1077,7 @@ ProcessCopyOptions(CopyState cstate, /* Extract options from the statement node tree */ foreach(option, options) { - DefElem *defel = (DefElem *) lfirst(option); + DefElem *defel = lfirst_node(DefElem, option); if (strcmp(defel->defname, "format") == 0) { @@ -1054,7 +1086,8 @@ ProcessCopyOptions(CopyState cstate, if (format_specified) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); format_specified = true; if (strcmp(fmt, "text") == 0) /* default format */ ; @@ -1065,14 +1098,16 @@ ProcessCopyOptions(CopyState cstate, else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("COPY format \"%s\" not recognized", fmt))); + errmsg("COPY format \"%s\" not recognized", fmt), + parser_errposition(pstate, defel->location))); } else if (strcmp(defel->defname, "oids") == 0) { if (cstate->oids) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->oids = defGetBoolean(defel); } else if (strcmp(defel->defname, "freeze") == 0) @@ -1080,7 +1115,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate->freeze) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->freeze = defGetBoolean(defel); } else if (strcmp(defel->defname, "delimiter") == 0) @@ -1088,7 +1124,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate->delim) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->delim = defGetString(defel); } else if (strcmp(defel->defname, "null") == 0) @@ -1096,7 +1133,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate->null_print) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->null_print = defGetString(defel); } else if (strcmp(defel->defname, "header") == 0) @@ -1104,7 +1142,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate->header_line) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->header_line = defGetBoolean(defel); } else if (strcmp(defel->defname, "quote") == 0) @@ -1112,7 +1151,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate->quote) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->quote = defGetString(defel); } else if (strcmp(defel->defname, "escape") == 0) @@ -1120,7 +1160,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate->escape) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->escape = defGetString(defel); } else if (strcmp(defel->defname, "force_quote") == 0) @@ 
-1128,30 +1169,34 @@ ProcessCopyOptions(CopyState cstate, if (cstate->force_quote || cstate->force_quote_all) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); if (defel->arg && IsA(defel->arg, A_Star)) cstate->force_quote_all = true; else if (defel->arg && IsA(defel->arg, List)) - cstate->force_quote = (List *) defel->arg; + cstate->force_quote = castNode(List, defel->arg); else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument to option \"%s\" must be a list of column names", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); } else if (strcmp(defel->defname, "force_not_null") == 0) { if (cstate->force_notnull) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); if (defel->arg && IsA(defel->arg, List)) - cstate->force_notnull = (List *) defel->arg; + cstate->force_notnull = castNode(List, defel->arg); else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument to option \"%s\" must be a list of column names", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); } else if (strcmp(defel->defname, "force_null") == 0) { @@ -1160,12 +1205,13 @@ ProcessCopyOptions(CopyState cstate, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"))); if (defel->arg && IsA(defel->arg, List)) - cstate->force_null = (List *) defel->arg; + cstate->force_null = castNode(List, defel->arg); else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument to option \"%s\" must be a list of column names", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); } else if (strcmp(defel->defname, "convert_selectively") == 0) { @@ -1177,34 +1223,39 @@ ProcessCopyOptions(CopyState cstate, if (cstate->convert_selectively) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->convert_selectively = true; if (defel->arg == NULL || IsA(defel->arg, List)) - cstate->convert_select = (List *) defel->arg; + cstate->convert_select = castNode(List, defel->arg); else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument to option \"%s\" must be a list of column names", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); } else if (strcmp(defel->defname, "encoding") == 0) { if (cstate->file_encoding >= 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); cstate->file_encoding = pg_char_to_encoding(defGetString(defel)); if (cstate->file_encoding < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument to option \"%s\" must be a valid encoding name", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("option \"%s\" not recognized", - defel->defname))); + defel->defname), + parser_errposition(pstate, defel->location))); } /* @@ -1367,11 +1418,11 @@ ProcessCopyOptions(CopyState cstate, * NULL values as <null_print>. 
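The dominant change in the ProcessCopyOptions hunks above is mechanical: each ereport gains a parser_errposition(pstate, defel->location) item, so the error can carry a cursor position into the original statement text. The resulting shape (option name illustrative):

	ereport(ERROR,
			(errcode(ERRCODE_SYNTAX_ERROR),
			 errmsg("conflicting or redundant options"),
			 parser_errposition(pstate, defel->location)));

This only works because DoCopy now threads a ParseState (with p_sourcetext) down through BeginCopy into ProcessCopyOptions, which is the main reason these signatures changed.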
*/ static CopyState -BeginCopy(bool is_from, +BeginCopy(ParseState *pstate, + bool is_from, Relation rel, - Node *raw_query, - const char *queryString, - const Oid queryRelId, + RawStmt *raw_query, + Oid queryRelId, List *attnamelist, List *options) { @@ -1389,14 +1440,12 @@ BeginCopy(bool is_from, */ cstate->copycontext = AllocSetContextCreate(CurrentMemoryContext, "COPY", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldcontext = MemoryContextSwitchTo(cstate->copycontext); /* Extract options from the statement node tree */ - ProcessCopyOptions(cstate, is_from, options); + ProcessCopyOptions(pstate, cstate, is_from, options); /* Process the source/target relation or query */ if (rel) @@ -1413,6 +1462,30 @@ BeginCopy(bool is_from, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("table \"%s\" does not have OIDs", RelationGetRelationName(cstate->rel)))); + + /* Initialize state for CopyFrom tuple routing. */ + if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + PartitionDispatch *partition_dispatch_info; + ResultRelInfo *partitions; + TupleConversionMap **partition_tupconv_maps; + TupleTableSlot *partition_tuple_slot; + int num_parted, + num_partitions; + + ExecSetupPartitionTupleRouting(rel, + &partition_dispatch_info, + &partitions, + &partition_tupconv_maps, + &partition_tuple_slot, + &num_parted, &num_partitions); + cstate->partition_dispatch_info = partition_dispatch_info; + cstate->num_dispatch = num_parted; + cstate->partitions = partitions; + cstate->num_partitions = num_partitions; + cstate->partition_tupconv_maps = partition_tupconv_maps; + cstate->partition_tuple_slot = partition_tuple_slot; + } #ifdef PGXC /* Get copy statement and execution node information */ if (IS_PGXC_COORDINATOR) @@ -1493,8 +1566,9 @@ BeginCopy(bool is_from, * function and is executed repeatedly. (See also the same hack in * DECLARE CURSOR and PREPARE.) XXX FIXME someday. */ - rewritten = pg_analyze_and_rewrite((Node *) copyObject(raw_query), - queryString, NULL, 0); + rewritten = pg_analyze_and_rewrite(copyObject(raw_query), + pstate->p_sourcetext, NULL, 0, + NULL); /* check that we got back something we can work with */ if (rewritten == NIL) @@ -1510,7 +1584,7 @@ BeginCopy(bool is_from, /* examine queries to determine which error message to issue */ foreach(lc, rewritten) { - Query *q = (Query *) lfirst(lc); + Query *q = lfirst_node(Query, lc); if (q->querySource == QSRC_QUAL_INSTEAD_RULE) ereport(ERROR, @@ -1527,7 +1601,7 @@ BeginCopy(bool is_from, errmsg("multi-statement DO INSTEAD rules are not supported for COPY"))); } - query = (Query *) linitial(rewritten); + query = linitial_node(Query, rewritten); Assert(query->utilityStmt == NULL); @@ -1587,10 +1661,10 @@ BeginCopy(bool is_from, ((DR_copy *) dest)->cstate = cstate; /* Create a QueryDesc requesting no output */ - cstate->queryDesc = CreateQueryDesc(plan, queryString, + cstate->queryDesc = CreateQueryDesc(plan, pstate->p_sourcetext, GetActiveSnapshot(), InvalidSnapshot, - dest, NULL, 0); + dest, NULL, NULL, 0); /* * Call ExecutorStart to prepare the plan for execution. @@ -1799,10 +1873,10 @@ EndCopy(CopyState cstate) * Setup CopyState to read tuples from a table or a query for COPY TO. 
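
The AllocSetContextCreate calls in this and later hunks shrink because ALLOCSET_DEFAULT_SIZES is a macro that expands to the usual minsize/initsize/maxsize triple the old calls spelled out. A one-function sketch of the new spelling (the context name is illustrative):

    #include "postgres.h"
    #include "utils/memutils.h"

    static MemoryContext
    make_demo_context(void)
    {
        /* ALLOCSET_DEFAULT_SIZES supplies the three block-size parameters */
        return AllocSetContextCreate(CurrentMemoryContext,
                                     "demo context",
                                     ALLOCSET_DEFAULT_SIZES);
    }
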
*/ static CopyState -BeginCopyTo(Relation rel, - Node *query, - const char *queryString, - const Oid queryRelId, +BeginCopyTo(ParseState *pstate, + Relation rel, + RawStmt *query, + Oid queryRelId, const char *filename, bool is_program, List *attnamelist, @@ -1837,6 +1911,12 @@ BeginCopyTo(Relation rel, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot copy from sequence \"%s\"", RelationGetRelationName(rel)))); + else if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot copy from partitioned table \"%s\"", + RelationGetRelationName(rel)), + errhint("Try the COPY (SELECT ...) TO variant."))); else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -1844,7 +1924,7 @@ BeginCopyTo(Relation rel, RelationGetRelationName(rel)))); } - cstate = BeginCopy(false, rel, query, queryString, queryRelId, attnamelist, + cstate = BeginCopy(pstate, false, rel, query, queryRelId, attnamelist, options); oldcontext = MemoryContextSwitchTo(cstate->copycontext); @@ -1886,10 +1966,18 @@ BeginCopyTo(Relation rel, cstate->copy_file = AllocateFile(cstate->filename, PG_BINARY_W); umask(oumask); if (cstate->copy_file == NULL) + { + /* copy errno because ereport subfunctions might change it */ + int save_errno = errno; + ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" for writing: %m", - cstate->filename))); + cstate->filename), + (save_errno == ENOENT || save_errno == EACCES) ? + errhint("COPY TO instructs the PostgreSQL server process to write a file. " + "You may want a client-side facility such as psql's \\copy.") : 0)); + } if (fstat(fileno(cstate->copy_file), &st)) ereport(ERROR, @@ -2014,9 +2102,7 @@ CopyTo(CopyState cstate) */ cstate->rowcontext = AllocSetContextCreate(CurrentMemoryContext, "COPY TO", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); if (cstate->binary) { @@ -2127,7 +2213,7 @@ CopyTo(CopyState cstate) else { /* run the plan --- the dest receiver will send tuples */ - ExecutorRun(cstate->queryDesc, ForwardScanDirection, 0L); + ExecutorRun(cstate->queryDesc, ForwardScanDirection, 0L, true); processed = ((DR_copy *) cstate->queryDesc->dest)->processed; } @@ -2358,7 +2444,7 @@ limit_printout_length(const char *str) /* * Copy FROM file to relation. */ -static uint64 +uint64 CopyFrom(CopyState cstate) { HeapTuple tuple; @@ -2366,6 +2452,7 @@ CopyFrom(CopyState cstate) Datum *values; bool *nulls; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ ExprContext *econtext; TupleTableSlot *myslot; @@ -2378,6 +2465,7 @@ CopyFrom(CopyState cstate) uint64 processed = 0; bool useHeapMultiInsert; int nBufferedTuples = 0; + int prev_leaf_part_index = -1; #define MAX_BUFFERED_TUPLES 1000 HeapTuple *bufferedTuples = NULL; /* initialize to silence warning */ @@ -2386,13 +2474,22 @@ CopyFrom(CopyState cstate) Assert(cstate->rel); - if (cstate->rel->rd_rel->relkind != RELKIND_RELATION) + /* + * The target must be a plain relation or have an INSTEAD OF INSERT row + * trigger. (Currently, such triggers are only allowed on views, so we + * only hint about them in the view case.) 
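
Both new AllocateFile error paths stash errno before entering ereport, because functions evaluated in the ereport argument list (errcode_for_file_access among them) may clobber it before the conditional errhint tests it. The idiom reduced to its essentials; the hint text below is a placeholder, not the patch's wording:

    #include "postgres.h"
    #include <errno.h>
    #include "storage/fd.h"

    static FILE *
    open_for_read(const char *path)
    {
        FILE       *f = AllocateFile(path, PG_BINARY_R);

        if (f == NULL)
        {
            /* copy errno first: ereport subfunctions might change it */
            int         save_errno = errno;

            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\" for reading: %m", path),
                     (save_errno == ENOENT || save_errno == EACCES) ?
                     errhint("Check the file's path and permissions.") : 0));
        }
        return f;
    }
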
+ */ + if (cstate->rel->rd_rel->relkind != RELKIND_RELATION && + cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && + !(cstate->rel->trigdesc && + cstate->rel->trigdesc->trig_insert_instead_row)) { if (cstate->rel->rd_rel->relkind == RELKIND_VIEW) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot copy to view \"%s\"", - RelationGetRelationName(cstate->rel)))); + RelationGetRelationName(cstate->rel)), + errhint("To enable copying to a view, provide an INSTEAD OF INSERT trigger."))); else if (cstate->rel->rd_rel->relkind == RELKIND_MATVIEW) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -2467,7 +2564,7 @@ CopyFrom(CopyState cstate) * earlier scan or command. This ensures that if this subtransaction * aborts then the frozen rows won't be visible after xact cleanup. Note * that the stronger test of exactly which subtransaction created it is - * crucial for correctness of this optimisation. + * crucial for correctness of this optimization. */ if (cstate->freeze) { @@ -2494,6 +2591,7 @@ CopyFrom(CopyState cstate) InitResultRelInfo(resultRelInfo, cstate->rel, 1, /* dummy rangetable index */ + NULL, 0); ExecOpenIndices(resultRelInfo, false); @@ -2516,11 +2614,13 @@ CopyFrom(CopyState cstate) * BEFORE/INSTEAD OF triggers, or we need to evaluate volatile default * expressions. Such triggers or expressions might query the table we're * inserting to, and act differently if the tuples that have already been - * processed and prepared for insertion are not there. + * processed and prepared for insertion are not there. We also can't do + * it if the table is partitioned. */ if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || + cstate->partition_dispatch_info != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2628,6 +2728,81 @@ CopyFrom(CopyState cstate) slot = myslot; ExecStoreTuple(tuple, slot, InvalidBuffer, false); + /* Determine the partition to heap_insert the tuple into */ + if (cstate->partition_dispatch_info) + { + int leaf_part_index; + TupleConversionMap *map; + + /* + * Away we go ... If we end up not finding a partition after all, + * ExecFindPartition() does not return and errors out instead. + * Otherwise, the returned value is to be used as an index into + * arrays mt_partitions[] and mt_partition_tupconv_maps[] that + * will get us the ResultRelInfo and TupleConversionMap for the + * partition, respectively. + */ + leaf_part_index = ExecFindPartition(resultRelInfo, + cstate->partition_dispatch_info, + slot, + estate); + Assert(leaf_part_index >= 0 && + leaf_part_index < cstate->num_partitions); + + /* + * If this tuple is mapped to a partition that is not same as the + * previous one, we'd better make the bulk insert mechanism gets a + * new buffer. + */ + if (prev_leaf_part_index != leaf_part_index) + { + ReleaseBulkInsertStatePin(bistate); + prev_leaf_part_index = leaf_part_index; + } + + /* + * Save the old ResultRelInfo and switch to the one corresponding + * to the selected partition. 
+ */ + saved_resultRelInfo = resultRelInfo; + resultRelInfo = cstate->partitions + leaf_part_index; + + /* We do not yet have a way to insert into a foreign partition */ + if (resultRelInfo->ri_FdwRoutine) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot route inserted tuples to a foreign table"))); + + /* + * For ExecInsertIndexTuples() to work on the partition's indexes + */ + estate->es_result_relation_info = resultRelInfo; + + /* + * We might need to convert from the parent rowtype to the + * partition rowtype. + */ + map = cstate->partition_tupconv_maps[leaf_part_index]; + if (map) + { + Relation partrel = resultRelInfo->ri_RelationDesc; + + tuple = do_convert_tuple(tuple, map); + + /* + * We must use the partition's tuple descriptor from this + * point on. Use a dedicated slot from this point on until + * we're finished dealing with the partition. + */ + slot = cstate->partition_tuple_slot; + Assert(slot != NULL); + ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); + ExecStoreTuple(tuple, slot, InvalidBuffer, true); + } + + tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + } + skip_tuple = false; /* BEFORE ROW INSERT Triggers */ @@ -2644,52 +2819,66 @@ CopyFrom(CopyState cstate) if (!skip_tuple) { - /* Check the constraints of the tuple */ - if (cstate->rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); - - if (useHeapMultiInsert) + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_instead_row) { - /* Add this tuple to the tuple buffer */ - if (nBufferedTuples == 0) - firstBufferedLineNo = cstate->cur_lineno; - bufferedTuples[nBufferedTuples++] = tuple; - bufferedTuplesSize += tuple->t_len; - - /* - * If the buffer filled up, flush it. Also flush if the total - * size of all the tuples in the buffer becomes large, to - * avoid using large amounts of memory for the buffers when - * the tuples are exceptionally wide. - */ - if (nBufferedTuples == MAX_BUFFERED_TUPLES || - bufferedTuplesSize > 65535) - { - CopyFromInsertBatch(cstate, estate, mycid, hi_options, - resultRelInfo, myslot, bistate, - nBufferedTuples, bufferedTuples, - firstBufferedLineNo); - nBufferedTuples = 0; - bufferedTuplesSize = 0; - } + /* Pass the data to the INSTEAD ROW INSERT trigger */ + ExecIRInsertTriggers(estate, resultRelInfo, slot); } else { - List *recheckIndexes = NIL; + /* Check the constraints of the tuple */ + if (cstate->rel->rd_att->constr || + resultRelInfo->ri_PartitionCheck) + ExecConstraints(resultRelInfo, slot, estate); + + if (useHeapMultiInsert) + { + /* Add this tuple to the tuple buffer */ + if (nBufferedTuples == 0) + firstBufferedLineNo = cstate->cur_lineno; + bufferedTuples[nBufferedTuples++] = tuple; + bufferedTuplesSize += tuple->t_len; + + /* + * If the buffer filled up, flush it. Also flush if the + * total size of all the tuples in the buffer becomes + * large, to avoid using large amounts of memory for the + * buffer when the tuples are exceptionally wide. 
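
For readers skimming the CopyFrom hunks, the per-row work for a partitioned target boils down to: find the leaf partition, point es_result_relation_info at it, convert the tuple's rowtype if the leaf differs from the parent, and stamp t_tableOid. A condensed restatement under the patch's own PG10-era APIs; it only compiles inside copy.c, where CopyStateData's fields are visible, and it elides the BulkInsertState, trigger, and error handling shown above:

    /* Sketch: route one tuple to its leaf partition (fields per this patch). */
    static HeapTuple
    route_one_tuple(CopyState cstate, ResultRelInfo *rootRelInfo,
                    TupleTableSlot **slotp, HeapTuple tuple, EState *estate)
    {
        int         part = ExecFindPartition(rootRelInfo,
                                             cstate->partition_dispatch_info,
                                             *slotp, estate);
        ResultRelInfo *leaf = cstate->partitions + part;
        TupleConversionMap *map = cstate->partition_tupconv_maps[part];

        estate->es_result_relation_info = leaf;     /* for index insertion */
        if (map != NULL)
        {
            /* convert parent rowtype to leaf rowtype in the dedicated slot */
            tuple = do_convert_tuple(tuple, map);
            *slotp = cstate->partition_tuple_slot;
            ExecSetSlotDescriptor(*slotp, RelationGetDescr(leaf->ri_RelationDesc));
            ExecStoreTuple(tuple, *slotp, InvalidBuffer, true);
        }
        tuple->t_tableOid = RelationGetRelid(leaf->ri_RelationDesc);
        return tuple;
    }
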
+ */ + if (nBufferedTuples == MAX_BUFFERED_TUPLES || + bufferedTuplesSize > 65535) + { + CopyFromInsertBatch(cstate, estate, mycid, hi_options, + resultRelInfo, myslot, bistate, + nBufferedTuples, bufferedTuples, + firstBufferedLineNo); + nBufferedTuples = 0; + bufferedTuplesSize = 0; + } + } + else + { + List *recheckIndexes = NIL; - /* OK, store the tuple and create index entries for it */ - heap_insert(cstate->rel, tuple, mycid, hi_options, bistate); + /* OK, store the tuple and create index entries for it */ + heap_insert(resultRelInfo->ri_RelationDesc, tuple, mycid, + hi_options, bistate); - if (resultRelInfo->ri_NumIndices > 0) - recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), - estate, false, NULL, - NIL); + if (resultRelInfo->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(slot, + &(tuple->t_self), + estate, + false, + NULL, + NIL); - /* AFTER ROW INSERT Triggers */ - ExecARInsertTriggers(estate, resultRelInfo, tuple, - recheckIndexes); + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, tuple, + recheckIndexes); - list_free(recheckIndexes); + list_free(recheckIndexes); + } } /* @@ -2698,6 +2887,12 @@ CopyFrom(CopyState cstate) * tuples inserted by an INSERT command. */ processed++; + + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } } #ifdef PGXC } @@ -2758,6 +2953,39 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); + /* Close all the partitioned tables, leaf partitions, and their indices */ + if (cstate->partition_dispatch_info) + { + int i; + + /* + * Remember cstate->partition_dispatch_info[0] corresponds to the root + * partitioned table, which we must not try to close, because it is + * the main target table of COPY that will be closed eventually by + * DoCopy(). Also, tupslot is NULL for the root partitioned table. + */ + for (i = 1; i < cstate->num_dispatch; i++) + { + PartitionDispatch pd = cstate->partition_dispatch_info[i]; + + heap_close(pd->reldesc, NoLock); + ExecDropSingleTupleTableSlot(pd->tupslot); + } + for (i = 0; i < cstate->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = cstate->partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } + + /* Release the standalone partition tuple descriptor */ + ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot); + } + + /* Close any trigger target relations */ + ExecCleanUpTriggerState(estate); + FreeExecutorState(estate); /* @@ -2859,9 +3087,11 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, * Returns a CopyState, to be passed to NextCopyFrom and related functions. */ CopyState -BeginCopyFrom(Relation rel, +BeginCopyFrom(ParseState *pstate, + Relation rel, const char *filename, bool is_program, + copy_data_source_cb data_source_cb, List *attnamelist, List *options) { @@ -2880,7 +3110,7 @@ BeginCopyFrom(Relation rel, MemoryContext oldcontext; bool volatile_defexprs; - cstate = BeginCopy(true, rel, NULL, NULL, InvalidOid, attnamelist, options); + cstate = BeginCopy(pstate, true, rel, NULL, InvalidOid, attnamelist, options); oldcontext = MemoryContextSwitchTo(cstate->copycontext); /* Initialize state variables */ @@ -2898,6 +3128,10 @@ BeginCopyFrom(Relation rel, cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1); cstate->raw_buf_index = cstate->raw_buf_len = 0; + /* Assign range table, we'll need it in CopyFrom. 
*/ + if (pstate) + cstate->range_table = pstate->p_rtable; + tupDesc = RelationGetDescr(cstate->rel); attr = tupDesc->attrs; num_phys_attrs = tupDesc->natts; @@ -2998,7 +3232,7 @@ BeginCopyFrom(Relation rel, * the special case of when the default expression is the * nextval() of a sequence which in this specific case is * known to be safe for use with the multi-insert - * optimisation. Hence we use this special case function + * optimization. Hence we use this special case function * checker rather than the standard check for * contain_volatile_functions(). */ @@ -3020,7 +3254,12 @@ BeginCopyFrom(Relation rel, cstate->num_defaults = num_defaults; cstate->is_program = is_program; - if (pipe) + if (data_source_cb) + { + cstate->copy_dest = COPY_CALLBACK; + cstate->data_source_cb = data_source_cb; + } + else if (pipe) { Assert(!is_program); /* the grammar does not allow this */ if (whereToSendOutput == DestRemote) @@ -3047,10 +3286,18 @@ BeginCopyFrom(Relation rel, cstate->copy_file = AllocateFile(cstate->filename, PG_BINARY_R); if (cstate->copy_file == NULL) + { + /* copy errno because ereport subfunctions might change it */ + int save_errno = errno; + ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" for reading: %m", - cstate->filename))); + cstate->filename), + (save_errno == ENOENT || save_errno == EACCES) ? + errhint("COPY FROM instructs the PostgreSQL server process to read a file. " + "You may want a client-side facility such as psql's \\copy.") : 0)); + } if (fstat(fileno(cstate->copy_file), &st)) ereport(ERROR, @@ -3499,7 +3746,7 @@ NextCopyFrom(CopyState cstate, ExprContext *econtext, Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory); values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext, - &nulls[defmap[i]], NULL); + &nulls[defmap[i]]); } #ifdef PGXC diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index 5b4f6affcc..06425cc0eb 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -13,7 +13,7 @@ * we must return a tuples-processed count in the completionTag. (We no * longer do that for CTAS ... WITH NO DATA, however.) * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -112,7 +112,7 @@ create_ctas_internal(List *attrList, IntoClause *into) * Create the relation. (This will error out if there's an existing view, * so we don't need more code to complain if "replace" is false.) */ - intoRelationAddr = DefineRelation(create, relkind, InvalidOid, NULL); + intoRelationAddr = DefineRelation(create, relkind, InvalidOid, NULL, NULL); /* * If necessary, create a TOAST table for the target table. Note that @@ -222,9 +222,10 @@ create_ctas_nodata(List *tlist, IntoClause *into) */ ObjectAddress ExecCreateTableAs(CreateTableAsStmt *stmt, const char *queryString, - ParamListInfo params, char *completionTag) + ParamListInfo params, QueryEnvironment *queryEnv, + char *completionTag) { - Query *query = (Query *) stmt->query; + Query *query = castNode(Query, stmt->query); IntoClause *into = stmt->into; bool is_matview = (into->viewQuery != NULL); DestReceiver *dest; @@ -261,11 +262,10 @@ ExecCreateTableAs(CreateTableAsStmt *stmt, const char *queryString, * The contained Query could be a SELECT, or an EXECUTE utility command. * If the latter, we just pass it off to ExecuteQuery. 
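
BeginCopyFrom's new data_source_cb parameter switches the CopyState to COPY_CALLBACK, so rows come from a caller-supplied function rather than a file or the client connection; logical-replication table sync is the intended consumer. A sketch of such a caller, assuming a NULL ParseState is acceptable (the patch guards the only pstate dereference) and leaving the callback as a stub; demo_read_data and copy_one_table are illustrative names:

    #include "postgres.h"
    #include "commands/copy.h"
    #include "utils/rel.h"

    /* Fill outbuf with minread..maxread bytes of COPY data; return 0 at EOF. */
    static int
    demo_read_data(void *outbuf, int minread, int maxread)
    {
        return 0;               /* stub: report end of data immediately */
    }

    static void
    copy_one_table(Relation rel)
    {
        /* NULL filename plus a callback selects COPY_CALLBACK mode */
        CopyState   cstate = BeginCopyFrom(NULL, rel, NULL, false,
                                           demo_read_data, NIL, NIL);

        /* ... NextCopyFrom() loop, or hand cstate to CopyFrom() ... */
        EndCopyFrom(cstate);
    }
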
*/ - Assert(IsA(query, Query)); if (query->commandType == CMD_UTILITY && IsA(query->utilityStmt, ExecuteStmt)) { - ExecuteStmt *estmt = (ExecuteStmt *) query->utilityStmt; + ExecuteStmt *estmt = castNode(ExecuteStmt, query->utilityStmt); Assert(!is_matview); /* excluded by syntax */ ExecuteQuery(estmt, into, queryString, params, dest, completionTag); @@ -316,17 +316,17 @@ ExecCreateTableAs(CreateTableAsStmt *stmt, const char *queryString, * and is executed repeatedly. (See also the same hack in EXPLAIN and * PREPARE.) */ - rewritten = QueryRewrite((Query *) copyObject(query)); + rewritten = QueryRewrite(copyObject(query)); /* SELECT should never rewrite to more or less than one SELECT query */ if (list_length(rewritten) != 1) elog(ERROR, "unexpected rewrite result for %s", is_matview ? "CREATE MATERIALIZED VIEW" : "CREATE TABLE AS SELECT"); - query = (Query *) linitial(rewritten); + query = linitial_node(Query, rewritten); Assert(query->commandType == CMD_SELECT); - /* plan the query */ + /* plan the query --- note we disallow parallelism */ plan = pg_plan_query(query, 0, params); /* @@ -342,13 +342,13 @@ ExecCreateTableAs(CreateTableAsStmt *stmt, const char *queryString, /* Create a QueryDesc, redirecting output to our tuple receiver */ queryDesc = CreateQueryDesc(plan, queryString, GetActiveSnapshot(), InvalidSnapshot, - dest, params, 0); + dest, params, queryEnv, 0); /* call ExecutorStart to prepare the plan for execution */ ExecutorStart(queryDesc, GetIntoRelEFlags(into)); /* run the plan to completion */ - ExecutorRun(queryDesc, ForwardScanDirection, 0L); + ExecutorRun(queryDesc, ForwardScanDirection, 0L, true); /* save the rowcount if we're given a completionTag to fill */ if (completionTag) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index d87945e4d3..baeb8b591e 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -8,7 +8,7 @@ * stepping on each others' toes. Formerly we used table-level locks * on pg_database, but that's too coarse-grained. 
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,7 +20,6 @@ #include "postgres.h" #include <fcntl.h> -#include <locale.h> #include <unistd.h> #include <sys/stat.h> @@ -37,6 +36,7 @@ #include "catalog/pg_authid.h" #include "catalog/pg_database.h" #include "catalog/pg_db_role_setting.h" +#include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "commands/comment.h" #include "commands/dbcommands.h" @@ -108,7 +108,7 @@ static void movedb_success_callback(Oid db_id, Oid tblspcoid); * CREATE DATABASE */ Oid -createdb(const CreatedbStmt *stmt) +createdb(ParseState *pstate, const CreatedbStmt *stmt) { HeapScanDesc scan; Relation rel; @@ -164,7 +164,8 @@ createdb(const CreatedbStmt *stmt) if (dtablespacename) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dtablespacename = defel; } else if (strcmp(defel->defname, "owner") == 0) @@ -172,7 +173,8 @@ createdb(const CreatedbStmt *stmt) if (downer) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); downer = defel; } else if (strcmp(defel->defname, "template") == 0) @@ -180,7 +182,8 @@ createdb(const CreatedbStmt *stmt) if (dtemplate) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dtemplate = defel; } else if (strcmp(defel->defname, "encoding") == 0) @@ -188,7 +191,8 @@ createdb(const CreatedbStmt *stmt) if (dencoding) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dencoding = defel; } else if (strcmp(defel->defname, "lc_collate") == 0) @@ -196,7 +200,8 @@ createdb(const CreatedbStmt *stmt) if (dcollate) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dcollate = defel; } else if (strcmp(defel->defname, "lc_ctype") == 0) @@ -204,7 +209,8 @@ createdb(const CreatedbStmt *stmt) if (dctype) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dctype = defel; } else if (strcmp(defel->defname, "is_template") == 0) @@ -212,7 +218,8 @@ createdb(const CreatedbStmt *stmt) if (distemplate) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); distemplate = defel; } else if (strcmp(defel->defname, "allow_connections") == 0) @@ -220,7 +227,8 @@ createdb(const CreatedbStmt *stmt) if (dallowconnections) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dallowconnections = defel; } else if (strcmp(defel->defname, "connection_limit") == 0) @@ -228,7 +236,8 @@ 
createdb(const CreatedbStmt *stmt) if (dconnlimit) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dconnlimit = defel; } else if (strcmp(defel->defname, "location") == 0) @@ -236,12 +245,14 @@ createdb(const CreatedbStmt *stmt) ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("LOCATION is not supported anymore"), - errhint("Consider using tablespaces instead."))); + errhint("Consider using tablespaces instead."), + parser_errposition(pstate, defel->location))); } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("option \"%s\" not recognized", defel->defname))); + errmsg("option \"%s\" not recognized", defel->defname), + parser_errposition(pstate, defel->location))); } if (downer && downer->arg) @@ -261,7 +272,8 @@ createdb(const CreatedbStmt *stmt) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("%d is not a valid encoding code", - encoding))); + encoding), + parser_errposition(pstate, dencoding->location))); } else { @@ -271,7 +283,8 @@ createdb(const CreatedbStmt *stmt) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("%s is not a valid encoding name", - encoding_name))); + encoding_name), + parser_errposition(pstate, dencoding->location))); } } if (dcollate && dcollate->arg) @@ -544,10 +557,7 @@ createdb(const CreatedbStmt *stmt) HeapTupleSetOid(tuple, dboid); - simple_heap_insert(pg_database_rel, tuple); - - /* Update indexes */ - CatalogUpdateIndexes(pg_database_rel, tuple); + CatalogTupleInsert(pg_database_rel, tuple); /* * Now generate additional catalog entries associated with the new DB @@ -686,7 +696,7 @@ createdb(const CreatedbStmt *stmt) /* * Force synchronous commit, thus minimizing the window between - * creation of the database files and commital of the transaction. If + * creation of the database files and committal of the transaction. If * we crash before committing, we'll have a DB that's taking up disk * space but is not in pg_database, which is not good. */ @@ -817,6 +827,7 @@ dropdb(const char *dbname, bool missing_ok) int npreparedxacts; int nslots, nslots_active; + int nsubscriptions; /* * Look up the target database's OID, and get exclusive lock on it. We @@ -874,19 +885,22 @@ dropdb(const char *dbname, bool missing_ok) errmsg("cannot drop the currently open database"))); /* - * Check whether there are, possibly unconnected, logical slots that refer - * to the to-be-dropped database. The database lock we are holding - * prevents the creation of new slots using the database. + * Check whether there are active logical slots that refer to the + * to-be-dropped database. The database lock we are holding prevents the + * creation of new slots using the database or existing slots becoming + * active. */ - if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active)) + (void) ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active); + if (nslots_active) + { ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("database \"%s\" is used by a logical replication slot", - dbname), - errdetail_plural("There is %d slot, %d of them active.", - "There are %d slots, %d of them active.", - nslots, - nslots, nslots_active))); + errmsg("database \"%s\" is used by an active logical replication slot", + dbname), + errdetail_plural("There is %d active slot", + "There are %d active slots", + nslots_active, nslots_active))); + } /* * Check for other backends in the target database. 
(Because we hold the @@ -902,13 +916,28 @@ dropdb(const char *dbname, bool missing_ok) errdetail_busy_db(notherbackends, npreparedxacts))); /* + * Check if there are subscriptions defined in the target database. + * + * We can't drop them automatically because they might be holding + * resources in other databases/instances. + */ + if ((nsubscriptions = CountDBSubscriptions(db_id)) > 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being used by logical replication subscription", + dbname), + errdetail_plural("There is %d subscription.", + "There are %d subscriptions.", + nsubscriptions, nsubscriptions))); + + /* * Remove the database's tuple from pg_database. */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(db_id)); if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for database %u", db_id); - simple_heap_delete(pgdbrel, &tup->t_self); + CatalogTupleDelete(pgdbrel, &tup->t_self); ReleaseSysCache(tup); @@ -929,6 +958,11 @@ dropdb(const char *dbname, bool missing_ok) dropDatabaseDependencies(db_id); /* + * Drop db-specific replication slots. + */ + ReplicationSlotsDropDBSlots(db_id); + + /* * Drop pages for this database that are in the shared buffer cache. This * is important to ensure that no remaining backend tries to write out a * dirty buffer to the dead database later... @@ -968,7 +1002,7 @@ dropdb(const char *dbname, bool missing_ok) /* * Force synchronous commit, thus minimizing the window between removal of - * the database files and commital of the transaction. If we crash before + * the database files and committal of the transaction. If we crash before * committing, we'll have a DB that's gone on disk but still there * according to pg_database, which is not good. */ @@ -1057,8 +1091,7 @@ RenameDatabase(const char *oldname, const char *newname) if (!HeapTupleIsValid(newtup)) elog(ERROR, "cache lookup failed for database %u", db_id); namestrcpy(&(((Form_pg_database) GETSTRUCT(newtup))->datname), newname); - simple_heap_update(rel, &newtup->t_self, newtup); - CatalogUpdateIndexes(rel, newtup); + CatalogTupleUpdate(rel, &newtup->t_self, newtup); InvokeObjectPostAlterHook(DatabaseRelationId, db_id, 0); @@ -1325,7 +1358,7 @@ movedb(const char *dbname, const char *tblspcname) ScanKeyInit(&scankey, Anum_pg_database_datname, BTEqualStrategyNumber, F_NAMEEQ, - NameGetDatum(dbname)); + CStringGetDatum(dbname)); sysscan = systable_beginscan(pgdbrel, DatabaseNameIndexId, true, NULL, 1, &scankey); oldtuple = systable_getnext(sysscan); @@ -1344,10 +1377,7 @@ movedb(const char *dbname, const char *tblspcname) newtuple = heap_modify_tuple(oldtuple, RelationGetDescr(pgdbrel), new_record, new_record_nulls, new_record_repl); - simple_heap_update(pgdbrel, &oldtuple->t_self, newtuple); - - /* Update indexes */ - CatalogUpdateIndexes(pgdbrel, newtuple); + CatalogTupleUpdate(pgdbrel, &oldtuple->t_self, newtuple); InvokeObjectPostAlterHook(DatabaseRelationId, HeapTupleGetOid(newtuple), 0); @@ -1364,7 +1394,7 @@ movedb(const char *dbname, const char *tblspcname) /* * Force synchronous commit, thus minimizing the window between - * copying the database files and commital of the transaction. If we + * copying the database files and committal of the transaction. If we * crash before committing, we'll leave an orphaned set of files on * disk, which is not fatal but not good either. */ @@ -1497,7 +1527,7 @@ movedb_failure_callback(int code, Datum arg) * ALTER DATABASE name ... 
*/ Oid -AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) +AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel) { Relation rel; Oid dboid; @@ -1527,7 +1557,8 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) if (distemplate) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); distemplate = defel; } else if (strcmp(defel->defname, "allow_connections") == 0) @@ -1535,7 +1566,8 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) if (dallowconnections) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dallowconnections = defel; } else if (strcmp(defel->defname, "connection_limit") == 0) @@ -1543,7 +1575,8 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) if (dconnlimit) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dconnlimit = defel; } else if (strcmp(defel->defname, "tablespace") == 0) @@ -1551,13 +1584,15 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) if (dtablespace) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); dtablespace = defel; } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("option \"%s\" not recognized", defel->defname))); + errmsg("option \"%s\" not recognized", defel->defname), + parser_errposition(pstate, defel->location))); } if (dtablespace) @@ -1571,7 +1606,8 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("option \"%s\" cannot be specified with other options", - dtablespace->defname))); + dtablespace->defname), + parser_errposition(pstate, dtablespace->location))); /* this case isn't allowed within a transaction block */ #ifdef PGXC /* ... 
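
The repeated mechanical change in dbcommands.c (and later files) replaces the two-step catalog write, a simple_heap_* call followed by CatalogUpdateIndexes, with single CatalogTupleInsert/CatalogTupleUpdate/CatalogTupleDelete calls that perform the heap operation and the index maintenance together. The shape of the change on a generic update, shown as a fragment rather than a complete function:

    /* Before: heap change and index maintenance were separate, easy to forget. */
    simple_heap_update(rel, &tup->t_self, tup);
    CatalogUpdateIndexes(rel, tup);

    /* After: one call keeps the catalog's indexes in sync. */
    CatalogTupleUpdate(rel, &tup->t_self, tup);
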
but we allow it on remote nodes */ @@ -1605,7 +1641,7 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) ScanKeyInit(&scankey, Anum_pg_database_datname, BTEqualStrategyNumber, F_NAMEEQ, - NameGetDatum(stmt->dbname)); + CStringGetDatum(stmt->dbname)); scan = systable_beginscan(rel, DatabaseNameIndexId, true, NULL, 1, &scankey); tuple = systable_getnext(scan); @@ -1656,10 +1692,7 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel) newtuple = heap_modify_tuple(tuple, RelationGetDescr(rel), new_record, new_record_nulls, new_record_repl); - simple_heap_update(rel, &tuple->t_self, newtuple); - - /* Update indexes */ - CatalogUpdateIndexes(rel, newtuple); + CatalogTupleUpdate(rel, &tuple->t_self, newtuple); InvokeObjectPostAlterHook(DatabaseRelationId, HeapTupleGetOid(newtuple), 0); @@ -1722,7 +1755,7 @@ AlterDatabaseOwner(const char *dbname, Oid newOwnerId) ScanKeyInit(&scankey, Anum_pg_database_datname, BTEqualStrategyNumber, F_NAMEEQ, - NameGetDatum(dbname)); + CStringGetDatum(dbname)); scan = systable_beginscan(rel, DatabaseNameIndexId, true, NULL, 1, &scankey); tuple = systable_getnext(scan); @@ -1794,8 +1827,7 @@ AlterDatabaseOwner(const char *dbname, Oid newOwnerId) } newtuple = heap_modify_tuple(tuple, RelationGetDescr(rel), repl_val, repl_null, repl_repl); - simple_heap_update(rel, &newtuple->t_self, newtuple); - CatalogUpdateIndexes(rel, newtuple); + CatalogTupleUpdate(rel, &newtuple->t_self, newtuple); heap_freetuple(newtuple); @@ -1862,7 +1894,7 @@ get_db_info(const char *name, LOCKMODE lockmode, ScanKeyInit(&scanKey, Anum_pg_database_datname, BTEqualStrategyNumber, F_NAMEEQ, - NameGetDatum(name)); + CStringGetDatum(name)); scan = systable_beginscan(relation, DatabaseNameIndexId, true, NULL, 1, &scanKey); @@ -2238,11 +2270,18 @@ dbase_redo(XLogReaderState *record) * InitPostgres() cannot fully re-execute concurrently. This * avoids backends re-connecting automatically to same database, * which can happen in some cases. + * + * This will lock out walsenders trying to connect to db-specific + * slots for logical decoding too, so it's safe for us to drop + * slots. */ LockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock); ResolveRecoveryConflictWithDatabase(xlrec->db_id); } + /* Drop any database-specific replication slots */ + ReplicationSlotsDropDBSlots(xlrec->db_id); + /* Drop pages for this database that are in the shared buffer cache */ DropDatabaseBuffers(xlrec->db_id); diff --git a/src/backend/commands/define.c b/src/backend/commands/define.c index ece803ec4b..3ad4eea59e 100644 --- a/src/backend/commands/define.c +++ b/src/backend/commands/define.c @@ -4,7 +4,7 @@ * Support routines for various kinds of object creation. * * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -40,7 +40,7 @@ #include "nodes/makefuncs.h" #include "parser/parse_type.h" #include "parser/scansup.h" -#include "utils/int8.h" +#include "utils/builtins.h" /* * Extract a string value (otherwise uninterpreted) from a DefElem. @@ -321,10 +321,29 @@ defGetTypeLength(DefElem *def) } /* - * Create a DefElem setting "oids" to the specified value. + * Extract a list of string values (otherwise uninterpreted) from a DefElem. 
*/ -DefElem * -defWithOids(bool value) +List * +defGetStringList(DefElem *def) { - return makeDefElem("oids", (Node *) makeInteger(value)); + ListCell *cell; + + if (def->arg == NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("%s requires a parameter", + def->defname))); + if (nodeTag(def->arg) != T_List) + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(def->arg)); + + foreach(cell, (List *) def->arg) + { + Node *str = (Node *) lfirst(cell); + + if (!IsA(str, String)) + elog(ERROR, "unexpected node type in name list: %d", + (int) nodeTag(str)); + } + + return (List *) def->arg; } diff --git a/src/backend/commands/discard.c b/src/backend/commands/discard.c index 5b8bd67579..f0dcd87fb8 100644 --- a/src/backend/commands/discard.c +++ b/src/backend/commands/discard.c @@ -3,7 +3,7 @@ * discard.c * The implementation of the DISCARD command * - * Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Copyright (c) 1996-2017, PostgreSQL Global Development Group * * * IDENTIFICATION diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c index 61ff8f2190..9e307eb8af 100644 --- a/src/backend/commands/dropcmds.c +++ b/src/backend/commands/dropcmds.c @@ -3,7 +3,7 @@ * dropcmds.c * handle various "DROP" operations * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -30,10 +30,10 @@ static void does_not_exist_skipping(ObjectType objtype, - List *objname, List *objargs); -static bool owningrel_does_not_exist_skipping(List *objname, + Node *object); +static bool owningrel_does_not_exist_skipping(List *object, const char **msg, char **name); -static bool schema_does_not_exist_skipping(List *objname, +static bool schema_does_not_exist_skipping(List *object, const char **msg, char **name); static bool type_in_list_does_not_exist_skipping(List *typenames, const char **msg, char **name); @@ -55,27 +55,19 @@ RemoveObjects(DropStmt *stmt) { ObjectAddresses *objects; ListCell *cell1; - ListCell *cell2 = NULL; objects = new_object_addresses(); foreach(cell1, stmt->objects) { ObjectAddress address; - List *objname = lfirst(cell1); - List *objargs = NIL; + Node *object = lfirst(cell1); Relation relation = NULL; Oid namespaceId; - if (stmt->arguments) - { - cell2 = (!cell2 ? list_head(stmt->arguments) : lnext(cell2)); - objargs = lfirst(cell2); - } - /* Get an ObjectAddress for the object. */ address = get_object_address(stmt->removeType, - objname, objargs, + object, &relation, AccessExclusiveLock, stmt->missing_ok); @@ -88,7 +80,7 @@ RemoveObjects(DropStmt *stmt) if (!OidIsValid(address.objectId)) { Assert(stmt->missing_ok); - does_not_exist_skipping(stmt->removeType, objname, objargs); + does_not_exist_skipping(stmt->removeType, object); continue; } @@ -110,7 +102,7 @@ RemoveObjects(DropStmt *stmt) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an aggregate function", - NameListToString(objname)), + NameListToString(castNode(ObjectWithArgs, object)->objname)), errhint("Use DROP AGGREGATE to drop aggregate functions."))); ReleaseSysCache(tup); @@ -121,7 +113,7 @@ RemoveObjects(DropStmt *stmt) if (!OidIsValid(namespaceId) || !pg_namespace_ownercheck(namespaceId, GetUserId())) check_object_ownership(GetUserId(), stmt->removeType, address, - objname, objargs, relation); + object, relation); /* Release any relcache reference count, but keep lock until commit. 
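
The new defGetStringList helper (added in define.c where defWithOids is removed) verifies that a DefElem's argument is a List of String nodes and errors out otherwise, so callers can walk the names without re-checking node tags. A sketch of a caller; the function name and loop body are illustrative:

    #include "postgres.h"
    #include "commands/defrem.h"
    #include "nodes/value.h"

    static void
    apply_name_option(DefElem *defel)
    {
        List       *names = defGetStringList(defel);
        ListCell   *lc;

        foreach(lc, names)
        {
            char       *name = strVal(lfirst(lc));

            /* ... act on each validated name ... */
            (void) name;
        }
    }
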
*/ if (relation) @@ -147,23 +139,23 @@ RemoveObjects(DropStmt *stmt) * exist, fill the error message format string and name, and return true. */ static bool -owningrel_does_not_exist_skipping(List *objname, const char **msg, char **name) +owningrel_does_not_exist_skipping(List *object, const char **msg, char **name) { - List *parent_objname; + List *parent_object; RangeVar *parent_rel; - parent_objname = list_truncate(list_copy(objname), - list_length(objname) - 1); + parent_object = list_truncate(list_copy(object), + list_length(object) - 1); - if (schema_does_not_exist_skipping(parent_objname, msg, name)) + if (schema_does_not_exist_skipping(parent_object, msg, name)) return true; - parent_rel = makeRangeVarFromNameList(parent_objname); + parent_rel = makeRangeVarFromNameList(parent_object); if (!OidIsValid(RangeVarGetRelid(parent_rel, NoLock, true))) { *msg = gettext_noop("relation \"%s\" does not exist, skipping"); - *name = NameListToString(parent_objname); + *name = NameListToString(parent_object); return true; } @@ -183,11 +175,11 @@ owningrel_does_not_exist_skipping(List *objname, const char **msg, char **name) * specified schema name, and return true. */ static bool -schema_does_not_exist_skipping(List *objname, const char **msg, char **name) +schema_does_not_exist_skipping(List *object, const char **msg, char **name) { RangeVar *rel; - rel = makeRangeVarFromNameList(objname); + rel = makeRangeVarFromNameList(object); if (rel->schemaname != NULL && !OidIsValid(LookupNamespaceNoError(rel->schemaname))) @@ -222,12 +214,10 @@ type_in_list_does_not_exist_skipping(List *typenames, const char **msg, foreach(l, typenames) { - TypeName *typeName = (TypeName *) lfirst(l); + TypeName *typeName = lfirst_node(TypeName, l); if (typeName != NULL) { - Assert(IsA(typeName, TypeName)); - if (!OidIsValid(LookupTypeNameOid(NULL, typeName, true))) { /* type doesn't exist, try to find why */ @@ -254,7 +244,7 @@ type_in_list_does_not_exist_skipping(List *typenames, const char **msg, * get_object_address() in RemoveObjects would have thrown an ERROR. 
*/ static void -does_not_exist_skipping(ObjectType objtype, List *objname, List *objargs) +does_not_exist_skipping(ObjectType objtype, Node *object) { const char *msg = NULL; char *name = NULL; @@ -264,12 +254,12 @@ does_not_exist_skipping(ObjectType objtype, List *objname, List *objargs) { case OBJECT_ACCESS_METHOD: msg = gettext_noop("access method \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); break; case OBJECT_TYPE: case OBJECT_DOMAIN: { - TypeName *typ = linitial(objname); + TypeName *typ = castNode(TypeName, object); if (!schema_does_not_exist_skipping(typ->names, &msg, &name)) { @@ -279,168 +269,191 @@ does_not_exist_skipping(ObjectType objtype, List *objname, List *objargs) } break; case OBJECT_COLLATION: - if (!schema_does_not_exist_skipping(objname, &msg, &name)) + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("collation \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = NameListToString(castNode(List, object)); } break; case OBJECT_CONVERSION: - if (!schema_does_not_exist_skipping(objname, &msg, &name)) + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("conversion \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = NameListToString(castNode(List, object)); } break; case OBJECT_SCHEMA: msg = gettext_noop("schema \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); + break; + case OBJECT_STATISTIC_EXT: + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) + { + msg = gettext_noop("statistics object \"%s\" does not exist, skipping"); + name = NameListToString(castNode(List, object)); + } break; case OBJECT_TSPARSER: - if (!schema_does_not_exist_skipping(objname, &msg, &name)) + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("text search parser \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = NameListToString(castNode(List, object)); } break; case OBJECT_TSDICTIONARY: - if (!schema_does_not_exist_skipping(objname, &msg, &name)) + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("text search dictionary \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = NameListToString(castNode(List, object)); } break; case OBJECT_TSTEMPLATE: - if (!schema_does_not_exist_skipping(objname, &msg, &name)) + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("text search template \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = NameListToString(castNode(List, object)); } break; case OBJECT_TSCONFIGURATION: - if (!schema_does_not_exist_skipping(objname, &msg, &name)) + if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("text search configuration \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = NameListToString(castNode(List, object)); } break; case OBJECT_EXTENSION: msg = gettext_noop("extension \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); break; case OBJECT_FUNCTION: - if (!schema_does_not_exist_skipping(objname, &msg, &name) && - !type_in_list_does_not_exist_skipping(objargs, &msg, &name)) { - msg = gettext_noop("function %s(%s) does not exist, skipping"); - name 
= NameListToString(objname); - args = TypeNameListToString(objargs); + ObjectWithArgs *owa = castNode(ObjectWithArgs, object); + + if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && + !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) + { + msg = gettext_noop("function %s(%s) does not exist, skipping"); + name = NameListToString(owa->objname); + args = TypeNameListToString(owa->objargs); + } + break; } - break; case OBJECT_AGGREGATE: - if (!schema_does_not_exist_skipping(objname, &msg, &name) && - !type_in_list_does_not_exist_skipping(objargs, &msg, &name)) { - msg = gettext_noop("aggregate %s(%s) does not exist, skipping"); - name = NameListToString(objname); - args = TypeNameListToString(objargs); + ObjectWithArgs *owa = castNode(ObjectWithArgs, object); + + if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && + !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) + { + msg = gettext_noop("aggregate %s(%s) does not exist, skipping"); + name = NameListToString(owa->objname); + args = TypeNameListToString(owa->objargs); + } + break; } - break; case OBJECT_OPERATOR: - if (!schema_does_not_exist_skipping(objname, &msg, &name) && - !type_in_list_does_not_exist_skipping(objargs, &msg, &name)) { - msg = gettext_noop("operator %s does not exist, skipping"); - name = NameListToString(objname); + ObjectWithArgs *owa = castNode(ObjectWithArgs, object); + + if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && + !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) + { + msg = gettext_noop("operator %s does not exist, skipping"); + name = NameListToString(owa->objname); + } + break; } - break; case OBJECT_LANGUAGE: msg = gettext_noop("language \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); break; case OBJECT_CAST: { - if (!type_in_list_does_not_exist_skipping(objname, &msg, &name) && - !type_in_list_does_not_exist_skipping(objargs, &msg, &name)) + if (!type_in_list_does_not_exist_skipping(list_make1(linitial(castNode(List, object))), &msg, &name) && + !type_in_list_does_not_exist_skipping(list_make1(lsecond(castNode(List, object))), &msg, &name)) { /* XXX quote or no quote? 
*/ msg = gettext_noop("cast from type %s to type %s does not exist, skipping"); - name = TypeNameToString((TypeName *) linitial(objname)); - args = TypeNameToString((TypeName *) linitial(objargs)); + name = TypeNameToString(linitial_node(TypeName, castNode(List, object))); + args = TypeNameToString(lsecond_node(TypeName, castNode(List, object))); } } break; case OBJECT_TRANSFORM: - if (!type_in_list_does_not_exist_skipping(objname, &msg, &name)) + if (!type_in_list_does_not_exist_skipping(list_make1(linitial(castNode(List, object))), &msg, &name)) { msg = gettext_noop("transform for type %s language \"%s\" does not exist, skipping"); - name = TypeNameToString((TypeName *) linitial(objname)); - args = strVal(linitial(objargs)); + name = TypeNameToString(linitial_node(TypeName, castNode(List, object))); + args = strVal(lsecond(castNode(List, object))); } break; case OBJECT_TRIGGER: - if (!owningrel_does_not_exist_skipping(objname, &msg, &name)) + if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("trigger \"%s\" for relation \"%s\" does not exist, skipping"); - name = strVal(llast(objname)); - args = NameListToString(list_truncate(list_copy(objname), - list_length(objname) - 1)); + name = strVal(llast(castNode(List, object))); + args = NameListToString(list_truncate(list_copy(castNode(List, object)), + list_length(castNode(List, object)) - 1)); } break; case OBJECT_POLICY: - if (!owningrel_does_not_exist_skipping(objname, &msg, &name)) + if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("policy \"%s\" for relation \"%s\" does not exist, skipping"); - name = strVal(llast(objname)); - args = NameListToString(list_truncate(list_copy(objname), - list_length(objname) - 1)); + name = strVal(llast(castNode(List, object))); + args = NameListToString(list_truncate(list_copy(castNode(List, object)), + list_length(castNode(List, object)) - 1)); } break; case OBJECT_EVENT_TRIGGER: msg = gettext_noop("event trigger \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); break; case OBJECT_RULE: - if (!owningrel_does_not_exist_skipping(objname, &msg, &name)) + if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { msg = gettext_noop("rule \"%s\" for relation \"%s\" does not exist, skipping"); - name = strVal(llast(objname)); - args = NameListToString(list_truncate(list_copy(objname), - list_length(objname) - 1)); + name = strVal(llast(castNode(List, object))); + args = NameListToString(list_truncate(list_copy(castNode(List, object)), + list_length(castNode(List, object)) - 1)); } break; case OBJECT_FDW: msg = gettext_noop("foreign-data wrapper \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); break; case OBJECT_FOREIGN_SERVER: msg = gettext_noop("server \"%s\" does not exist, skipping"); - name = NameListToString(objname); + name = strVal((Value *) object); break; case OBJECT_OPCLASS: { - List *opcname = list_copy_tail(objname, 1); + List *opcname = list_copy_tail(castNode(List, object), 1); if (!schema_does_not_exist_skipping(opcname, &msg, &name)) { msg = gettext_noop("operator class \"%s\" does not exist for access method \"%s\", skipping"); name = NameListToString(opcname); - args = strVal(linitial(objname)); + args = strVal(linitial(castNode(List, object))); } } break; case OBJECT_OPFAMILY: { - List *opfname = list_copy_tail(objname, 1); + List *opfname = 
list_copy_tail(castNode(List, object), 1); if (!schema_does_not_exist_skipping(opfname, &msg, &name)) { msg = gettext_noop("operator family \"%s\" does not exist for access method \"%s\", skipping"); name = NameListToString(opfname); - args = strVal(linitial(objname)); + args = strVal(linitial(castNode(List, object))); } } break; + case OBJECT_PUBLICATION: + msg = gettext_noop("publication \"%s\" does not exist, skipping"); + name = strVal((Value *) object); + break; default: elog(ERROR, "unrecognized object type: %d", (int) objtype); break; diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 0b58639229..51d8783fb6 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -3,7 +3,7 @@ * event_trigger.c * PostgreSQL EVENT TRIGGER support code. * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -106,11 +106,14 @@ static event_trigger_support_data event_trigger_support[] = { {"OPERATOR CLASS", true}, {"OPERATOR FAMILY", true}, {"POLICY", true}, + {"PUBLICATION", true}, {"ROLE", false}, {"RULE", true}, {"SCHEMA", true}, {"SEQUENCE", true}, {"SERVER", true}, + {"STATISTICS", true}, + {"SUBSCRIPTION", true}, {"TABLE", true}, {"TABLESPACE", false}, {"TRANSFORM", true}, @@ -403,8 +406,7 @@ insert_event_trigger_tuple(char *trigname, char *eventname, Oid evtOwner, /* Insert heap tuple. */ tuple = heap_form_tuple(tgrel->rd_att, values, nulls); - trigoid = simple_heap_insert(tgrel, tuple); - CatalogUpdateIndexes(tgrel, tuple); + trigoid = CatalogTupleInsert(tgrel, tuple); heap_freetuple(tuple); /* Depend on owner. */ @@ -483,7 +485,7 @@ RemoveEventTriggerById(Oid trigOid) if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for event trigger %u", trigOid); - simple_heap_delete(tgrel, &tup->t_self); + CatalogTupleDelete(tgrel, &tup->t_self); ReleaseSysCache(tup); @@ -522,8 +524,7 @@ AlterEventTrigger(AlterEventTrigStmt *stmt) evtForm = (Form_pg_event_trigger) GETSTRUCT(tup); evtForm->evtenabled = tgenabled; - simple_heap_update(tgrel, &tup->t_self, tup); - CatalogUpdateIndexes(tgrel, tup); + CatalogTupleUpdate(tgrel, &tup->t_self, tup); InvokeObjectPostAlterHook(EventTriggerRelationId, trigoid, 0); @@ -619,8 +620,7 @@ AlterEventTriggerOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerId) errhint("The owner of an event trigger must be a superuser."))); form->evtowner = newOwnerId; - simple_heap_update(rel, &tup->t_self, tup); - CatalogUpdateIndexes(rel, tup); + CatalogTupleUpdate(rel, &tup->t_self, tup); /* Update owner dependency reference */ changeDependencyOnOwner(EventTriggerRelationId, @@ -742,7 +742,7 @@ EventTriggerCommonSetup(Node *parsetree, /* * Filter list of event triggers by command tag, and copy them into our - * memory context. Once we start running the command trigers, or indeed + * memory context. Once we start running the command triggers, or indeed * once we do anything at all that touches the catalogs, an invalidation * might leave cachelist pointing at garbage, so we must do this before we * can do much else. 
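
A second mechanical pattern runs through dropcmds.c: bare casts such as (List *) object become castNode(List, object), and list walks use lfirst_node/linitial_node, all of which Assert the node tag in assert-enabled builds instead of silently reinterpreting memory. The difference in miniature (as_list is an illustrative wrapper):

    #include "postgres.h"
    #include "nodes/pg_list.h"

    static List *
    as_list(Node *node)
    {
        List       *l1 = (List *) node;         /* old style: trusts the caller */
        List       *l2 = castNode(List, node);  /* new: Asserts IsA(node, List) */

        (void) l1;
        return l2;
    }
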
@@ -1018,9 +1018,7 @@ EventTriggerInvoke(List *fn_oid_list, EventTriggerData *trigdata) */ context = AllocSetContextCreate(CurrentMemoryContext, "event trigger context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); oldcontext = MemoryContextSwitchTo(context); /* Call each event trigger. */ @@ -1105,9 +1103,13 @@ EventTriggerSupportsObjectType(ObjectType obtype) case OBJECT_OPERATOR: case OBJECT_OPFAMILY: case OBJECT_POLICY: + case OBJECT_PUBLICATION: + case OBJECT_PUBLICATION_REL: case OBJECT_RULE: case OBJECT_SCHEMA: case OBJECT_SEQUENCE: + case OBJECT_SUBSCRIPTION: + case OBJECT_STATISTIC_EXT: case OBJECT_TABCONSTRAINT: case OBJECT_TABLE: case OBJECT_TRANSFORM: @@ -1120,8 +1122,15 @@ EventTriggerSupportsObjectType(ObjectType obtype) case OBJECT_USER_MAPPING: case OBJECT_VIEW: return true; + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new ObjectType hasn't been handled above. + */ } - return true; + + /* Shouldn't get here, but if we do, say "no support" */ + return false; } /* @@ -1153,12 +1162,13 @@ EventTriggerSupportsObjectClass(ObjectClass objclass) case OCLASS_OPERATOR: case OCLASS_OPCLASS: case OCLASS_OPFAMILY: + case OCLASS_AM: case OCLASS_AMOP: case OCLASS_AMPROC: case OCLASS_REWRITE: case OCLASS_TRIGGER: case OCLASS_SCHEMA: - case OCLASS_TRANSFORM: + case OCLASS_STATISTIC_EXT: case OCLASS_TSPARSER: case OCLASS_TSDICT: case OCLASS_TSTEMPLATE: @@ -1174,11 +1184,20 @@ EventTriggerSupportsObjectClass(ObjectClass objclass) case OCLASS_PGXC_GROUP: #endif case OCLASS_POLICY: - case OCLASS_AM: + case OCLASS_PUBLICATION: + case OCLASS_PUBLICATION_REL: + case OCLASS_SUBSCRIPTION: + case OCLASS_TRANSFORM: return true; + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new OCLASS hasn't been handled above. + */ } - return true; + /* Shouldn't get here, but if we do, say "no support" */ + return false; } bool @@ -1203,10 +1222,15 @@ EventTriggerSupportsGrantObjectType(GrantObjectType objtype) case ACL_OBJECT_NAMESPACE: case ACL_OBJECT_TYPE: return true; - default: - Assert(false); - return true; + + /* + * There's intentionally no default: case here; we want the + * compiler to warn if a new ACL class hasn't been handled above. 
+ */ } + + /* Shouldn't get here, but if we do, say "no support" */ + return false; } /* @@ -1231,9 +1255,7 @@ EventTriggerBeginCompleteQuery(void) cxt = AllocSetContextCreate(TopMemoryContext, "event trigger state", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + ALLOCSET_DEFAULT_SIZES); state = MemoryContextAlloc(cxt, sizeof(EventTriggerQueryState)); state->cxt = cxt; slist_init(&(state->SQLDropList)); @@ -1870,7 +1892,7 @@ EventTriggerCollectAlterOpFam(AlterOpFamilyStmt *stmt, Oid opfamoid, OperatorFamilyRelationId, opfamoid); command->d.opfam.operators = operators; command->d.opfam.procedures = procedures; - command->parsetree = copyObject(stmt); + command->parsetree = (Node *) copyObject(stmt); currentEventTriggerState->commandList = lappend(currentEventTriggerState->commandList, command); @@ -1903,7 +1925,7 @@ EventTriggerCollectCreateOpClass(CreateOpClassStmt *stmt, Oid opcoid, OperatorClassRelationId, opcoid); command->d.createopc.operators = operators; command->d.createopc.procedures = procedures; - command->parsetree = copyObject(stmt); + command->parsetree = (Node *) copyObject(stmt); currentEventTriggerState->commandList = lappend(currentEventTriggerState->commandList, command); @@ -1938,7 +1960,7 @@ EventTriggerCollectAlterTSConfig(AlterTSConfigurationStmt *stmt, Oid cfgId, command->d.atscfg.dictIds = palloc(sizeof(Oid) * ndicts); memcpy(command->d.atscfg.dictIds, dictIds, sizeof(Oid) * ndicts); command->d.atscfg.ndicts = ndicts; - command->parsetree = copyObject(stmt); + command->parsetree = (Node *) copyObject(stmt); currentEventTriggerState->commandList = lappend(currentEventTriggerState->commandList, command); @@ -1968,7 +1990,7 @@ EventTriggerCollectAlterDefPrivs(AlterDefaultPrivilegesStmt *stmt) command->type = SCT_AlterDefaultPrivileges; command->d.defprivs.objtype = stmt->action->objtype; command->in_extension = creating_extension; - command->parsetree = copyObject(stmt); + command->parsetree = (Node *) copyObject(stmt); currentEventTriggerState->commandList = lappend(currentEventTriggerState->commandList, command); @@ -2230,35 +2252,50 @@ stringify_grantobjtype(GrantObjectType objtype) return "TABLESPACE"; case ACL_OBJECT_TYPE: return "TYPE"; - default: - elog(ERROR, "unrecognized type %d", objtype); - return "???"; /* keep compiler quiet */ } + + elog(ERROR, "unrecognized grant object type: %d", (int) objtype); + return "???"; /* keep compiler quiet */ } /* * Return the GrantObjectType as a string; as above, but use the spelling - * in ALTER DEFAULT PRIVILEGES commands instead. + * in ALTER DEFAULT PRIVILEGES commands instead. Generally this is just + * the plural. 
*/ static const char * stringify_adefprivs_objtype(GrantObjectType objtype) { switch (objtype) { + case ACL_OBJECT_COLUMN: + return "COLUMNS"; case ACL_OBJECT_RELATION: return "TABLES"; - break; - case ACL_OBJECT_FUNCTION: - return "FUNCTIONS"; - break; case ACL_OBJECT_SEQUENCE: return "SEQUENCES"; - break; + case ACL_OBJECT_DATABASE: + return "DATABASES"; + case ACL_OBJECT_DOMAIN: + return "DOMAINS"; + case ACL_OBJECT_FDW: + return "FOREIGN DATA WRAPPERS"; + case ACL_OBJECT_FOREIGN_SERVER: + return "FOREIGN SERVERS"; + case ACL_OBJECT_FUNCTION: + return "FUNCTIONS"; + case ACL_OBJECT_LANGUAGE: + return "LANGUAGES"; + case ACL_OBJECT_LARGEOBJECT: + return "LARGE OBJECTS"; + case ACL_OBJECT_NAMESPACE: + return "SCHEMAS"; + case ACL_OBJECT_TABLESPACE: + return "TABLESPACES"; case ACL_OBJECT_TYPE: return "TYPES"; - break; - default: - elog(ERROR, "unrecognized type %d", objtype); - return "???"; /* keep compiler quiet */ } + + elog(ERROR, "unrecognized grant object type: %d", (int) objtype); + return "???"; /* keep compiler quiet */ } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index cdc0fe8f0c..1bb5d7582f 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -4,7 +4,7 @@ * Explain query execution plans * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION @@ -60,8 +60,10 @@ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; #define X_CLOSE_IMMEDIATE 2 #define X_NOWHITESPACE 4 -static void ExplainOneQuery(Query *query, IntoClause *into, ExplainState *es, - const char *queryString, ParamListInfo params); +static void ExplainOneQuery(Query *query, int cursorOptions, + IntoClause *into, ExplainState *es, + const char *queryString, ParamListInfo params, + QueryEnvironment *queryEnv); static void report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es); static double elapsed_time(instr_time *starttime); @@ -154,14 +156,16 @@ static void escape_yaml(StringInfo buf, const char *str); * execute an EXPLAIN command */ void -ExplainQuery(ExplainStmt *stmt, const char *queryString, - ParamListInfo params, DestReceiver *dest) +ExplainQuery(ParseState *pstate, ExplainStmt *stmt, const char *queryString, + ParamListInfo params, QueryEnvironment *queryEnv, + DestReceiver *dest) { ExplainState *es = NewExplainState(); TupOutputState *tstate; List *rewritten; ListCell *lc; bool timing_set = false; + bool summary_set = false; /* Parse options list. 
*/ foreach(lc, stmt->options) @@ -187,6 +191,11 @@ ExplainQuery(ExplainStmt *stmt, const char *queryString, timing_set = true; es->timing = defGetBoolean(opt); } + else if (strcmp(opt->defname, "summary") == 0) + { + summary_set = true; + es->summary = defGetBoolean(opt); + } else if (strcmp(opt->defname, "format") == 0) { char *p = defGetString(opt); @@ -203,13 +212,15 @@ ExplainQuery(ExplainStmt *stmt, const char *queryString, ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized value for EXPLAIN option \"%s\": \"%s\"", - opt->defname, p))); + opt->defname, p), + parser_errposition(pstate, opt->location))); } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("unrecognized EXPLAIN option \"%s\"", - opt->defname))); + opt->defname), + parser_errposition(pstate, opt->location))); } if (es->buffers && !es->analyze) @@ -226,8 +237,8 @@ ExplainQuery(ExplainStmt *stmt, const char *queryString, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("EXPLAIN option TIMING requires ANALYZE"))); - /* currently, summary option is not exposed to users; just set it */ - es->summary = es->analyze; + /* if the summary was not set explicitly, set default value */ + es->summary = (summary_set) ? es->summary : es->analyze; /* * Parse analysis was done already, but we still have to run the rule @@ -241,8 +252,7 @@ ExplainQuery(ExplainStmt *stmt, const char *queryString, * executed repeatedly. (See also the same hack in DECLARE CURSOR and * PREPARE.) XXX FIXME someday. */ - Assert(IsA(stmt->query, Query)); - rewritten = QueryRewrite((Query *) copyObject(stmt->query)); + rewritten = QueryRewrite(castNode(Query, copyObject(stmt->query))); /* emit opening boilerplate */ ExplainBeginOutput(es); @@ -263,8 +273,9 @@ ExplainQuery(ExplainStmt *stmt, const char *queryString, /* Explain every plan */ foreach(l, rewritten) { - ExplainOneQuery((Query *) lfirst(l), NULL, es, - queryString, params); + ExplainOneQuery(lfirst_node(Query, l), + CURSOR_OPT_PARALLEL_OK, NULL, es, + queryString, params, queryEnv); /* Separate plans with an appropriate separator */ if (lnext(l) != NULL) @@ -350,8 +361,10 @@ ExplainResultDesc(ExplainStmt *stmt) * "into" is NULL unless we are explaining the contents of a CreateTableAsStmt. 
*/ static void -ExplainOneQuery(Query *query, IntoClause *into, ExplainState *es, - const char *queryString, ParamListInfo params) +ExplainOneQuery(Query *query, int cursorOptions, + IntoClause *into, ExplainState *es, + const char *queryString, ParamListInfo params, + QueryEnvironment *queryEnv) { /* planner will not cope with utility statements */ if (query->commandType == CMD_UTILITY) @@ -366,18 +379,19 @@ ExplainOneQuery(Query *query, IntoClause *into, ExplainState *es, { List *rewritten = QueryRewriteCTAS(query); Assert(list_length(rewritten) == 1); - ExplainOneQuery((Query *) linitial(rewritten), into, es, - queryString, params); + ExplainOneQuery((Query *) linitial(rewritten), cursorOptions, + into, es, queryString, params, queryEnv); } else ExplainOneUtility(query->utilityStmt, into, es, - queryString, params); + queryString, params, queryEnv); return; } /* if an advisor plugin is present, let it manage things */ if (ExplainOneQuery_hook) - (*ExplainOneQuery_hook) (query, into, es, queryString, params); + (*ExplainOneQuery_hook) (query, cursorOptions, into, es, + queryString, params); else { PlannedStmt *plan; @@ -387,13 +401,14 @@ ExplainOneQuery(Query *query, IntoClause *into, ExplainState *es, INSTR_TIME_SET_CURRENT(planstart); /* plan the query */ - plan = pg_plan_query(query, into ? 0 : CURSOR_OPT_PARALLEL_OK, params); + plan = pg_plan_query(query, cursorOptions, params); INSTR_TIME_SET_CURRENT(planduration); INSTR_TIME_SUBTRACT(planduration, planstart); /* run it (if needed) and produce output */ - ExplainOnePlan(plan, into, es, queryString, params, &planduration); + ExplainOnePlan(plan, into, es, queryString, params, queryEnv, + &planduration); } } @@ -410,7 +425,8 @@ ExplainOneQuery(Query *query, IntoClause *into, ExplainState *es, */ void ExplainOneUtility(Node *utilityStmt, IntoClause *into, ExplainState *es, - const char *queryString, ParamListInfo params) + const char *queryString, ParamListInfo params, + QueryEnvironment *queryEnv) { if (utilityStmt == NULL) return; @@ -421,19 +437,40 @@ ExplainOneUtility(Node *utilityStmt, IntoClause *into, ExplainState *es, * We have to rewrite the contained SELECT and then pass it back to * ExplainOneQuery. It's probably not really necessary to copy the * contained parsetree another time, but let's be safe. + * + * Like ExecCreateTableAs, disallow parallelism in the plan. */ CreateTableAsStmt *ctas = (CreateTableAsStmt *) utilityStmt; List *rewritten; - Assert(IsA(ctas->query, Query)); - rewritten = QueryRewrite((Query *) copyObject(ctas->query)); + rewritten = QueryRewrite(castNode(Query, copyObject(ctas->query))); + Assert(list_length(rewritten) == 1); + ExplainOneQuery(linitial_node(Query, rewritten), + 0, ctas->into, es, + queryString, params, queryEnv); + } + else if (IsA(utilityStmt, DeclareCursorStmt)) + { + /* + * Likewise for DECLARE CURSOR. + * + * Notice that if you say EXPLAIN ANALYZE DECLARE CURSOR then we'll + * actually run the query. This is different from pre-8.3 behavior + * but seems more useful than not running the query. No cursor will + * be created, however. 
+ */ + DeclareCursorStmt *dcs = (DeclareCursorStmt *) utilityStmt; + List *rewritten; + + rewritten = QueryRewrite(castNode(Query, copyObject(dcs->query))); Assert(list_length(rewritten) == 1); - ExplainOneQuery((Query *) linitial(rewritten), ctas->into, es, - queryString, params); + ExplainOneQuery(linitial_node(Query, rewritten), + dcs->options, NULL, es, + queryString, params, queryEnv); } else if (IsA(utilityStmt, ExecuteStmt)) ExplainExecuteQuery((ExecuteStmt *) utilityStmt, into, es, - queryString, params); + queryString, params, queryEnv); else if (IsA(utilityStmt, NotifyStmt)) { if (es->format == EXPLAIN_FORMAT_TEXT) @@ -459,11 +496,6 @@ ExplainOneUtility(Node *utilityStmt, IntoClause *into, ExplainState *es, * "into" is NULL unless we are explaining the contents of a CreateTableAsStmt, * in which case executing the query should result in creating that table. * - * Since we ignore any DeclareCursorStmt that might be attached to the query, - * if you say EXPLAIN ANALYZE DECLARE CURSOR then we'll actually run the - * query. This is different from pre-8.3 behavior but seems more useful than - * not running the query. No cursor will be created, however. - * * This is exported because it's called back from prepare.c in the * EXPLAIN EXECUTE case, and because an index advisor plugin would need * to call it. @@ -471,7 +503,7 @@ ExplainOneUtility(Node *utilityStmt, IntoClause *into, ExplainState *es, void ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, const char *queryString, ParamListInfo params, - const instr_time *planduration) + QueryEnvironment *queryEnv, const instr_time *planduration) { DestReceiver *dest; QueryDesc *queryDesc; @@ -480,6 +512,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, int eflags; int instrument_option = 0; + Assert(plannedstmt->commandType != CMD_UTILITY); + if (es->analyze && es->timing) instrument_option |= INSTRUMENT_TIMER; else if (es->analyze) @@ -514,7 +548,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, /* Create a QueryDesc for the query */ queryDesc = CreateQueryDesc(plannedstmt, queryString, GetActiveSnapshot(), InvalidSnapshot, - dest, params, instrument_option); + dest, params, queryEnv, instrument_option); /* Select execution options */ if (es->analyze) @@ -539,7 +573,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, dir = ForwardScanDirection; /* run the plan */ - ExecutorRun(queryDesc, dir, 0L); + ExecutorRun(queryDesc, dir, 0L, true); /* run cleanup too */ ExecutorFinish(queryDesc); @@ -586,7 +620,13 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, totaltime += elapsed_time(&starttime); - if (es->summary) + /* + * We only report execution time if we actually ran the query (that is, + * the user specified ANALYZE), and if summary reporting is enabled (the + * user can set SUMMARY OFF to not have the timing information included in + * the output). By default, ANALYZE sets SUMMARY to true. 
+ */ + if (es->summary && es->analyze) { if (es->format == EXPLAIN_FORMAT_TEXT) appendStringInfo(es->str, "Execution time: %.3f ms\n", @@ -796,8 +836,10 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) case T_TidScan: case T_SubqueryScan: case T_FunctionScan: + case T_TableFuncScan: case T_ValuesScan: case T_CteScan: + case T_NamedTuplestoreScan: case T_WorkTableScan: *rels_used = bms_add_member(*rels_used, ((Scan *) plan)->scanrelid); @@ -864,6 +906,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Result: pname = sname = "Result"; break; + case T_ProjectSet: + pname = sname = "ProjectSet"; + break; case T_ModifyTable: sname = "ModifyTable"; switch (((ModifyTable *) plan)->operation) @@ -917,6 +962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Gather: pname = sname = "Gather"; break; + case T_GatherMerge: + pname = sname = "Gather Merge"; + break; case T_IndexScan: pname = sname = "Index Scan"; break; @@ -938,12 +986,18 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_FunctionScan: pname = sname = "Function Scan"; break; + case T_TableFuncScan: + pname = sname = "Table Function Scan"; + break; case T_ValuesScan: pname = sname = "Values Scan"; break; case T_CteScan: pname = sname = "CTE Scan"; break; + case T_NamedTuplestoreScan: + pname = sname = "Named Tuplestore Scan"; + break; case T_WorkTableScan: pname = sname = "WorkTable Scan"; break; @@ -1018,6 +1072,10 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = "HashAggregate"; strategy = "Hashed"; break; + case AGG_MIXED: + pname = "MixedAggregate"; + strategy = "Mixed"; + break; default: pname = "Aggregate ???"; strategy = "???"; @@ -1125,6 +1183,7 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_TidScan: case T_SubqueryScan: case T_FunctionScan: + case T_TableFuncScan: case T_ValuesScan: case T_CteScan: case T_WorkTableScan: @@ -1398,6 +1457,23 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->verbose) show_plan_tlist(planstate, ancestors, es); + /* unique join */ + switch (nodeTag(plan)) + { + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + /* try not to be too chatty about this in text mode */ + if (es->format != EXPLAIN_FORMAT_TEXT || + (es->verbose && ((Join *) plan)->inner_unique)) + ExplainPropertyBool("Inner Unique", + ((Join *) plan)->inner_unique, + es); + break; + default: + break; + } + /* quals, sort keys, etc */ switch (nodeTag(plan)) { @@ -1501,6 +1577,7 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_SeqScan: case T_ValuesScan: case T_CteScan: + case T_NamedTuplestoreScan: case T_WorkTableScan: case T_SubqueryScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); @@ -1530,6 +1607,26 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainPropertyBool("Single Copy", gather->single_copy, es); } break; + case T_GatherMerge: + { + GatherMerge *gm = (GatherMerge *) plan; + + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + ExplainPropertyInteger("Workers Planned", + gm->num_workers, es); + if (es->analyze) + { + int nworkers; + + nworkers = ((GatherMergeState *) planstate)->nworkers_launched; + ExplainPropertyInteger("Workers Launched", + nworkers, es); + } + } + break; case T_FunctionScan: if (es->verbose) { @@ -1552,6 +1649,20 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); break; + case 
T_TableFuncScan: + if (es->verbose) + { + TableFunc *tablefunc = ((TableFuncScan *) plan)->tablefunc; + + show_expression((Node *) tablefunc, + "Table Function Call", planstate, ancestors, + es->verbose, es); + } + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; case T_TidScan: { /* @@ -1626,25 +1737,25 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); break; case T_Agg: - show_agg_keys((AggState *) planstate, ancestors, es); + show_agg_keys(castNode(AggState, planstate), ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); break; case T_Group: - show_group_keys((GroupState *) planstate, ancestors, es); + show_group_keys(castNode(GroupState, planstate), ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); break; case T_Sort: - show_sort_keys((SortState *) planstate, ancestors, es); - show_sort_info((SortState *) planstate, es); + show_sort_keys(castNode(SortState, planstate), ancestors, es); + show_sort_info(castNode(SortState, planstate), es); break; case T_MergeAppend: - show_merge_append_keys((MergeAppendState *) planstate, + show_merge_append_keys(castNode(MergeAppendState, planstate), ancestors, es); break; case T_Result: @@ -1656,11 +1767,11 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); break; case T_ModifyTable: - show_modifytable_info((ModifyTableState *) planstate, ancestors, + show_modifytable_info(castNode(ModifyTableState, planstate), ancestors, es); break; case T_Hash: - show_hash_info((HashState *) planstate, es); + show_hash_info(castNode(HashState, planstate), es); break; default: break; @@ -2082,6 +2193,19 @@ show_grouping_set_keys(PlanState *planstate, ListCell *lc; List *gsets = aggnode->groupingSets; AttrNumber *keycols = aggnode->grpColIdx; + const char *keyname; + const char *keysetname; + + if (aggnode->aggstrategy == AGG_HASHED || aggnode->aggstrategy == AGG_MIXED) + { + keyname = "Hash Key"; + keysetname = "Hash Keys"; + } + else + { + keyname = "Group Key"; + keysetname = "Group Keys"; + } ExplainOpenGroup("Grouping Set", NULL, true, es); @@ -2096,7 +2220,7 @@ show_grouping_set_keys(PlanState *planstate, es->indent++; } - ExplainOpenGroup("Group Keys", "Group Keys", false, es); + ExplainOpenGroup(keysetname, keysetname, false, es); foreach(lc, gsets) { @@ -2120,12 +2244,12 @@ show_grouping_set_keys(PlanState *planstate, } if (!result && es->format == EXPLAIN_FORMAT_TEXT) - ExplainPropertyText("Group Key", "()", es); + ExplainPropertyText(keyname, "()", es); else - ExplainPropertyListNested("Group Key", result, es); + ExplainPropertyListNested(keyname, result, es); } - ExplainCloseGroup("Group Keys", "Group Keys", false, es); + ExplainCloseGroup(keysetname, keysetname, false, es); if (sortnode && es->format == EXPLAIN_FORMAT_TEXT) es->indent--; @@ -2338,7 +2462,6 @@ show_tablesample(TableSampleClause *tsc, PlanState *planstate, static void show_sort_info(SortState *sortstate, ExplainState *es) { - Assert(IsA(sortstate, SortState)); if (es->analyze && sortstate->sort_Done && sortstate->tuplesortstate != NULL) { @@ -2372,7 +2495,6 @@ show_hash_info(HashState *hashstate, ExplainState *es) { HashJoinTable hashtable; - Assert(IsA(hashstate, HashState)); hashtable = 
hashstate->hashtable; if (hashtable) @@ -2753,6 +2875,11 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) objecttag = "Function Name"; } break; + case T_TableFuncScan: + Assert(rte->rtekind == RTE_TABLEFUNC); + objectname = "xmltable"; + objecttag = "Table Function Name"; + break; case T_ValuesScan: Assert(rte->rtekind == RTE_VALUES); break; @@ -2763,6 +2890,11 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) objectname = rte->ctename; objecttag = "CTE Name"; break; + case T_NamedTuplestoreScan: + Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); + objectname = rte->enrname; + objecttag = "Tuplestore Name"; + break; case T_WorkTableScan: /* Assert it's on a self-reference CTE */ Assert(rte->rtekind == RTE_CTE); @@ -2997,7 +3129,7 @@ ExplainSubPlans(List *plans, List *ancestors, foreach(lst, plans) { SubPlanState *sps = (SubPlanState *) lfirst(lst); - SubPlan *sp = (SubPlan *) sps->xprstate.expr; + SubPlan *sp = sps->subplan; /* * There can be multiple SubPlan nodes referencing the same physical @@ -3669,13 +3801,15 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp * Optionally, OR in X_NOWHITESPACE to suppress the whitespace we'd normally * add. * - * XML tag names can't contain white space, so we replace any spaces in - * "tagname" with dashes. + * XML restricts tag names more than our other output formats, eg they can't + * contain white space or slashes. Replace invalid characters with dashes, + * so that for example "I/O Read Time" becomes "I-O-Read-Time". */ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es) { const char *s; + const char *valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_."; if ((flags & X_NOWHITESPACE) == 0) appendStringInfoSpaces(es->str, 2 * es->indent); @@ -3683,7 +3817,7 @@ ExplainXMLTag(const char *tagname, int flags, ExplainState *es) if ((flags & X_CLOSING) != 0) appendStringInfoCharMacro(es->str, '/'); for (s = tagname; *s; s++) - appendStringInfoCharMacro(es->str, (*s == ' ') ? '-' : *s); + appendStringInfoChar(es->str, strchr(valid, *s) ? *s : '-'); if ((flags & X_CLOSE_IMMEDIATE) != 0) appendStringInfoString(es->str, " /"); appendStringInfoCharMacro(es->str, '>'); @@ -3734,7 +3868,7 @@ ExplainYAMLLineStarting(ExplainState *es) } /* - * YAML is a superset of JSON; unfortuantely, the YAML quoting rules are + * YAML is a superset of JSON; unfortunately, the YAML quoting rules are * ridiculously complicated -- as documented in sections 5.3 and 7.3.3 of * http://yaml.org/spec/1.2/spec.html -- so we chose to just quote everything. * Empty strings, strings with leading or trailing whitespace, and strings diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index be8641b573..fa79e71955 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -12,7 +12,7 @@ * postgresql.conf and recovery.conf. An extension also has an installation * script file, containing SQL commands to create the extension's objects.
* - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -52,6 +52,7 @@ #include "nodes/makefuncs.h" #include "storage/fd.h" #include "tcop/utility.h" +#include "utils/acl.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" @@ -59,6 +60,7 @@ #include "utils/rel.h" #include "utils/snapmgr.h" #include "utils/tqual.h" +#include "utils/varlena.h" /* Globally visible state variables */ @@ -100,14 +102,25 @@ typedef struct ExtensionVersionInfo static List *find_update_path(List *evi_list, ExtensionVersionInfo *evi_start, ExtensionVersionInfo *evi_target, + bool reject_indirect, bool reinitialize); +static Oid get_required_extension(char *reqExtensionName, + char *extensionName, + char *origSchemaName, + bool cascade, + List *parents, + bool is_create); static void get_available_versions_for_extension(ExtensionControlFile *pcontrol, Tuplestorestate *tupstore, TupleDesc tupdesc); +static Datum convert_requires_to_datum(List *requires); static void ApplyExtensionUpdates(Oid extensionOid, ExtensionControlFile *pcontrol, const char *initialVersion, - List *updateVersions); + List *updateVersions, + char *origSchemaName, + bool cascade, + bool is_create); static char *read_whole_file(const char *filename, int *length); @@ -702,42 +715,40 @@ execute_sql_string(const char *sql, const char *filename) */ forboth(lc1, raw_parsetree_list, lc3, querysource_list) { - Node *parsetree = (Node *) lfirst(lc1); + RawStmt *parsetree = lfirst_node(RawStmt, lc1); char *querysource = (char *) lfirst(lc3); List *stmt_list; ListCell *lc2; + /* Be sure parser can see any DDL done so far */ + CommandCounterIncrement(); + stmt_list = pg_analyze_and_rewrite(parsetree, querysource, NULL, - 0); + 0, + NULL); stmt_list = pg_plan_queries(stmt_list, CURSOR_OPT_PARALLEL_OK, NULL); foreach(lc2, stmt_list) { - Node *stmt = (Node *) lfirst(lc2); - - if (IsA(stmt, TransactionStmt)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("transaction control statements are not allowed within an extension script"))); + PlannedStmt *stmt = lfirst_node(PlannedStmt, lc2); CommandCounterIncrement(); PushActiveSnapshot(GetTransactionSnapshot()); - if (IsA(stmt, PlannedStmt) && - ((PlannedStmt *) stmt)->utilityStmt == NULL) + if (stmt->utilityStmt == NULL) { QueryDesc *qdesc; - qdesc = CreateQueryDesc((PlannedStmt *) stmt, + qdesc = CreateQueryDesc(stmt, querysource, GetActiveSnapshot(), NULL, - dest, NULL, 0); + dest, NULL, NULL, 0); ExecutorStart(qdesc, 0); - ExecutorRun(qdesc, ForwardScanDirection, 0); + ExecutorRun(qdesc, ForwardScanDirection, 0, true); ExecutorFinish(qdesc); ExecutorEnd(qdesc); @@ -745,10 +756,16 @@ execute_sql_string(const char *sql, const char *filename) } else { + if (IsA(stmt->utilityStmt, TransactionStmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("transaction control statements are not allowed within an extension script"))); + ProcessUtility(stmt, querysource, PROCESS_UTILITY_QUERY, NULL, + NULL, dest, #ifdef PGXC true, /* this is created at remote node level */ @@ -1076,7 +1093,7 @@ identify_update_path(ExtensionControlFile *control, evi_target = get_ext_ver_info(newVersion, &evi_list); /* Find shortest path */ - result = find_update_path(evi_list, evi_start, evi_target, false); + result = find_update_path(evi_list, evi_start, evi_target, false, false); if (result == 
NIL) ereport(ERROR, @@ -1091,9 +1108,13 @@ identify_update_path(ExtensionControlFile *control, * Apply Dijkstra's algorithm to find the shortest path from evi_start to * evi_target. * + * If reject_indirect is true, ignore paths that go through installable + * versions. This saves work when the caller will consider starting from + * all installable versions anyway. + * * If reinitialize is false, assume the ExtensionVersionInfo list has not * been used for this before, and the initialization done by get_ext_ver_info - * is still good. + * is still good. Otherwise, reinitialize all transient fields used here. * * Result is a List of names of versions to transition through (the initial * version is *not* included). Returns NIL if no such path. @@ -1102,6 +1123,7 @@ static List * find_update_path(List *evi_list, ExtensionVersionInfo *evi_start, ExtensionVersionInfo *evi_target, + bool reject_indirect, bool reinitialize) { List *result; @@ -1110,6 +1132,8 @@ find_update_path(List *evi_list, /* Caller error if start == target */ Assert(evi_start != evi_target); + /* Caller error if reject_indirect and target is installable */ + Assert(!(reject_indirect && evi_target->installable)); if (reinitialize) { @@ -1136,6 +1160,9 @@ find_update_path(List *evi_list, ExtensionVersionInfo *evi2 = (ExtensionVersionInfo *) lfirst(lc); int newdist; + /* if reject_indirect, treat installable versions as unreachable */ + if (reject_indirect && evi2->installable) + continue; newdist = evi->distance + 1; if (newdist < evi2->distance) { @@ -1172,25 +1199,85 @@ find_update_path(List *evi_list, } /* + * Given a target version that is not directly installable, find the + * best installation sequence starting from a directly-installable version. + * + * evi_list: previously-collected version update graph + * evi_target: member of that list that we want to reach + * + * Returns the best starting-point version, or NULL if there is none. + * On success, *best_path is set to the path from the start point. + * + * If there's more than one possible start point, prefer shorter update paths, + * and break any ties arbitrarily on the basis of strcmp'ing the starting + * versions' names. + */ +static ExtensionVersionInfo * +find_install_path(List *evi_list, ExtensionVersionInfo *evi_target, + List **best_path) +{ + ExtensionVersionInfo *evi_start = NULL; + ListCell *lc; + + *best_path = NIL; + + /* + * We don't expect to be called for an installable target, but if we are, + * the answer is easy: just start from there, with an empty update path. + */ + if (evi_target->installable) + return evi_target; + + /* Consider all installable versions as start points */ + foreach(lc, evi_list) + { + ExtensionVersionInfo *evi1 = (ExtensionVersionInfo *) lfirst(lc); + List *path; + + if (!evi1->installable) + continue; + + /* + * Find shortest path from evi1 to evi_target; but no need to consider + * paths going through other installable versions. + */ + path = find_update_path(evi_list, evi1, evi_target, true, true); + if (path == NIL) + continue; + + /* Remember best path */ + if (evi_start == NULL || + list_length(path) < list_length(*best_path) || + (list_length(path) == list_length(*best_path) && + strcmp(evi_start->name, evi1->name) < 0)) + { + evi_start = evi1; + *best_path = path; + } + } + + return evi_start; +} + +/* * CREATE EXTENSION worker * - * When CASCADE is specified CreateExtensionInternal() recurses if required - * extensions need to be installed. 
To sanely handle cyclic dependencies - * cascade_parent contains the dependency chain leading to the current - * invocation; thus allowing to error out if there's a cyclic dependency. + * When CASCADE is specified, CreateExtensionInternal() recurses if required + * extensions need to be installed. To sanely handle cyclic dependencies, + * the "parents" list contains a list of names of extensions already being + * installed, allowing us to error out if we recurse to one of those. */ static ObjectAddress -CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) +CreateExtensionInternal(char *extensionName, + char *schemaName, + char *versionName, + char *oldVersionName, + bool cascade, + List *parents, + bool is_create) { - DefElem *d_schema = NULL; - DefElem *d_new_version = NULL; - DefElem *d_old_version = NULL; - DefElem *d_cascade = NULL; - char *schemaName = NULL; + char *origSchemaName = schemaName; Oid schemaOid = InvalidOid; - char *versionName; - char *oldVersionName; - bool cascade = false; Oid extowner = GetUserId(); ExtensionControlFile *pcontrol; ExtensionControlFile *control; @@ -1198,83 +1285,43 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) List *requiredExtensions; List *requiredSchemas; Oid extensionOid; - ListCell *lc; ObjectAddress address; + ListCell *lc; /* * Read the primary control file. Note we assume that it does not contain * any non-ASCII data, so there is no need to worry about encoding at this * point. */ - pcontrol = read_extension_control_file(stmt->extname); - - /* - * Read the statement option list - */ - foreach(lc, stmt->options) - { - DefElem *defel = (DefElem *) lfirst(lc); - - if (strcmp(defel->defname, "schema") == 0) - { - if (d_schema) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); - d_schema = defel; - } - else if (strcmp(defel->defname, "new_version") == 0) - { - if (d_new_version) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); - d_new_version = defel; - } - else if (strcmp(defel->defname, "old_version") == 0) - { - if (d_old_version) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); - d_old_version = defel; - } - else if (strcmp(defel->defname, "cascade") == 0) - { - if (d_cascade) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); - d_cascade = defel; - cascade = defGetBoolean(d_cascade); - } - else - elog(ERROR, "unrecognized option: %s", defel->defname); - } + pcontrol = read_extension_control_file(extensionName); /* * Determine the version to install */ - if (d_new_version && d_new_version->arg) - versionName = strVal(d_new_version->arg); - else if (pcontrol->default_version) - versionName = pcontrol->default_version; - else + if (versionName == NULL) { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("version to install must be specified"))); - versionName = NULL; /* keep compiler quiet */ + if (pcontrol->default_version) + versionName = pcontrol->default_version; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("version to install must be specified"))); } check_valid_version_name(versionName); /* - * Determine the (unpackaged) version to update from, if any, and then - * figure out what sequence of update scripts we need to apply. + * Figure out which script(s) we need to run to install the desired + * version of the extension. 
If we do not have a script that directly + * does what is needed, we try to find a sequence of update scripts that + * will get us there. */ - if (d_old_version && d_old_version->arg) + if (oldVersionName) { - oldVersionName = strVal(d_old_version->arg); + /* + * "FROM old_version" was specified, indicating that we're trying to + * update from some unpackaged version of the extension. Locate a + * series of update scripts that will do it. + */ check_valid_version_name(oldVersionName); if (strcmp(oldVersionName, versionName) == 0) @@ -1309,8 +1356,48 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) } else { + /* + * No FROM, so we're installing from scratch. If there is an install + * script for the desired version, we only need to run that one. + */ + char *filename; + struct stat fst; + oldVersionName = NULL; - updateVersions = NIL; + + filename = get_extension_script_filename(pcontrol, NULL, versionName); + if (stat(filename, &fst) == 0) + { + /* Easy, no extra scripts */ + updateVersions = NIL; + } + else + { + /* Look for best way to install this version */ + List *evi_list; + ExtensionVersionInfo *evi_start; + ExtensionVersionInfo *evi_target; + + /* Extract the version update graph from the script directory */ + evi_list = get_ext_ver_list(pcontrol); + + /* Identify the target version */ + evi_target = get_ext_ver_info(versionName, &evi_list); + + /* Identify best path to reach target */ + evi_start = find_install_path(evi_list, evi_target, + &updateVersions); + + /* Fail if no path ... */ + if (evi_start == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("extension \"%s\" has no installation script nor update path for version \"%s\"", + pcontrol->name, versionName))); + + /* Otherwise, install best starting point and then upgrade */ + versionName = evi_start->name; + } } /* @@ -1321,13 +1408,8 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) /* * Determine the target schema to install the extension into */ - if (d_schema && d_schema->arg) + if (schemaName) { - /* - * User given schema, CREATE EXTENSION ... WITH SCHEMA ... - */ - schemaName = strVal(d_schema->arg); - /* If the user is giving us the schema name, it must exist already. */ schemaOid = get_namespace_oid(schemaName, false); } @@ -1363,11 +1445,8 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) csstmt->authrole = NULL; /* will be created by current user */ csstmt->schemaElts = NIL; csstmt->if_not_exists = false; -#ifdef PGXC - CreateSchemaCommand(csstmt, NULL, true); -#else - CreateSchemaCommand(csstmt, NULL); -#endif + CreateSchemaCommand(csstmt, "(generated CREATE SCHEMA command)", + true, -1, -1); /* * CreateSchemaCommand includes CommandCounterIncrement, so new @@ -1379,7 +1458,7 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) else if (!OidIsValid(schemaOid)) { /* - * Neither user nor author of the extension specified schema, use the + * Neither user nor author of the extension specified schema; use the * current default creation namespace, which is the first explicit * entry in the search_path. */ @@ -1409,8 +1488,8 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) */ /* - * Look up the prerequisite extensions, and build lists of their OIDs and - * the OIDs of their target schemas. + * Look up the prerequisite extensions, install them if necessary, and + * build lists of their OIDs and the OIDs of their target schemas. 
*/ requiredExtensions = NIL; requiredSchemas = NIL; @@ -1420,65 +1499,12 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) Oid reqext; Oid reqschema; - reqext = get_extension_oid(curreq, true); - if (!OidIsValid(reqext)) - { - if (cascade) - { - CreateExtensionStmt *ces; - ListCell *lc; - ObjectAddress addr; - List *cascade_parents; - - /* Check extension name validity before trying to cascade */ - check_valid_extension_name(curreq); - - /* Check for cyclic dependency between extensions. */ - foreach(lc, parents) - { - char *pname = (char *) lfirst(lc); - - if (strcmp(pname, curreq) == 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_RECURSION), - errmsg("cyclic dependency detected between extensions \"%s\" and \"%s\"", - curreq, stmt->extname))); - } - - ereport(NOTICE, - (errmsg("installing required extension \"%s\"", - curreq))); - - /* Create and execute new CREATE EXTENSION statement. */ - ces = makeNode(CreateExtensionStmt); - ces->extname = curreq; - - /* Propagate the CASCADE option */ - ces->options = list_make1(d_cascade); - - /* Propagate the SCHEMA option if given. */ - if (d_schema && d_schema->arg) - ces->options = lappend(ces->options, d_schema); - - /* - * Pass the current list of parents + the current extension to - * the "child" CreateExtensionInternal(). - */ - cascade_parents = - lappend(list_copy(parents), stmt->extname); - - /* Create the required extension. */ - addr = CreateExtensionInternal(ces, cascade_parents); - reqext = addr.objectId; - } - else - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("required extension \"%s\" is not installed", - curreq), - errhint("Use CREATE EXTENSION ... CASCADE to install required extensions too."))); - } - + reqext = get_required_extension(curreq, + extensionName, + origSchemaName, + cascade, + parents, + is_create); reqschema = get_extension_schema(reqext); requiredExtensions = lappend_oid(requiredExtensions, reqext); requiredSchemas = lappend_oid(requiredSchemas, reqschema); @@ -1514,17 +1540,100 @@ CreateExtensionInternal(CreateExtensionStmt *stmt, List *parents) * though a series of ALTER EXTENSION UPDATE commands were given */ ApplyExtensionUpdates(extensionOid, pcontrol, - versionName, updateVersions); + versionName, updateVersions, + origSchemaName, cascade, is_create); return address; } /* + * Get the OID of an extension listed in "requires", possibly creating it. + */ +static Oid +get_required_extension(char *reqExtensionName, + char *extensionName, + char *origSchemaName, + bool cascade, + List *parents, + bool is_create) +{ + Oid reqExtensionOid; + + reqExtensionOid = get_extension_oid(reqExtensionName, true); + if (!OidIsValid(reqExtensionOid)) + { + if (cascade) + { + /* Must install it. */ + ObjectAddress addr; + List *cascade_parents; + ListCell *lc; + + /* Check extension name validity before trying to cascade. */ + check_valid_extension_name(reqExtensionName); + + /* Check for cyclic dependency between extensions. */ + foreach(lc, parents) + { + char *pname = (char *) lfirst(lc); + + if (strcmp(pname, reqExtensionName) == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_RECURSION), + errmsg("cyclic dependency detected between extensions \"%s\" and \"%s\"", + reqExtensionName, extensionName))); + } + + ereport(NOTICE, + (errmsg("installing required extension \"%s\"", + reqExtensionName))); + + /* Add current extension to list of parents to pass down. */ + cascade_parents = lappend(list_copy(parents), extensionName); + + /* + * Create the required extension. 
We propagate the SCHEMA option + * if any, and CASCADE, but no other options. + */ + addr = CreateExtensionInternal(reqExtensionName, + origSchemaName, + NULL, + NULL, + cascade, + cascade_parents, + is_create); + + /* Get its newly-assigned OID. */ + reqExtensionOid = addr.objectId; + } + else + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("required extension \"%s\" is not installed", + reqExtensionName), + is_create ? + errhint("Use CREATE EXTENSION ... CASCADE to install required extensions too.") : 0)); + } + + return reqExtensionOid; +} + +/* * CREATE EXTENSION */ ObjectAddress -CreateExtension(CreateExtensionStmt *stmt) +CreateExtension(ParseState *pstate, CreateExtensionStmt *stmt) { + DefElem *d_schema = NULL; + DefElem *d_new_version = NULL; + DefElem *d_old_version = NULL; + DefElem *d_cascade = NULL; + char *schemaName = NULL; + char *versionName = NULL; + char *oldVersionName = NULL; + bool cascade = false; + ListCell *lc; + /* Check extension name validity before any filesystem access */ check_valid_extension_name(stmt->extname); @@ -1560,9 +1669,63 @@ CreateExtension(CreateExtensionStmt *stmt) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("nested CREATE EXTENSION is not supported"))); + /* Deconstruct the statement option list */ + foreach(lc, stmt->options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + if (strcmp(defel->defname, "schema") == 0) + { + if (d_schema) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_schema = defel; + schemaName = defGetString(d_schema); + } + else if (strcmp(defel->defname, "new_version") == 0) + { + if (d_new_version) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_new_version = defel; + versionName = defGetString(d_new_version); + } + else if (strcmp(defel->defname, "old_version") == 0) + { + if (d_old_version) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_old_version = defel; + oldVersionName = defGetString(d_old_version); + } + else if (strcmp(defel->defname, "cascade") == 0) + { + if (d_cascade) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_cascade = defel; + cascade = defGetBoolean(d_cascade); + } + else + elog(ERROR, "unrecognized option: %s", defel->defname); + } - /* Finally create the extension. */ - return CreateExtensionInternal(stmt, NIL); + /* Call CreateExtensionInternal to do the real work. 
*/ + return CreateExtensionInternal(stmt->extname, + schemaName, + versionName, + oldVersionName, + cascade, + NIL, + true); } /* @@ -1620,8 +1783,7 @@ InsertExtensionTuple(const char *extName, Oid extOwner, tuple = heap_form_tuple(rel->rd_att, values, nulls); - extensionOid = simple_heap_insert(rel, tuple); - CatalogUpdateIndexes(rel, tuple); + extensionOid = CatalogTupleInsert(rel, tuple); heap_freetuple(tuple); heap_close(rel, RowExclusiveLock); @@ -1702,7 +1864,7 @@ RemoveExtensionById(Oid extId) /* We assume that there can be at most one matching tuple */ if (HeapTupleIsValid(tuple)) - simple_heap_delete(rel, &tuple->t_self); + CatalogTupleDelete(rel, &tuple->t_self); systable_endscan(scandesc); @@ -1919,43 +2081,28 @@ get_available_versions_for_extension(ExtensionControlFile *pcontrol, Tuplestorestate *tupstore, TupleDesc tupdesc) { - int extnamelen = strlen(pcontrol->name); - char *location; - DIR *dir; - struct dirent *de; + List *evi_list; + ListCell *lc; - location = get_extension_script_directory(pcontrol); - dir = AllocateDir(location); - /* Note this will fail if script directory doesn't exist */ - while ((de = ReadDir(dir, location)) != NULL) + /* Extract the version update graph from the script directory */ + evi_list = get_ext_ver_list(pcontrol); + + /* For each installable version ... */ + foreach(lc, evi_list) { + ExtensionVersionInfo *evi = (ExtensionVersionInfo *) lfirst(lc); ExtensionControlFile *control; - char *vername; Datum values[7]; bool nulls[7]; + ListCell *lc2; - /* must be a .sql file ... */ - if (!is_extension_script_filename(de->d_name)) - continue; - - /* ... matching extension name followed by separator */ - if (strncmp(de->d_name, pcontrol->name, extnamelen) != 0 || - de->d_name[extnamelen] != '-' || - de->d_name[extnamelen + 1] != '-') - continue; - - /* extract version name from 'extname--something.sql' filename */ - vername = pstrdup(de->d_name + extnamelen + 2); - *strrchr(vername, '.') = '\0'; - - /* ignore it if it's an update script */ - if (strstr(vername, "--")) + if (!evi->installable) continue; /* * Fetch parameters for specific version (pcontrol is not changed) */ - control = read_extension_aux_control_file(pcontrol, vername); + control = read_extension_aux_control_file(pcontrol, evi->name); memset(values, 0, sizeof(values)); memset(nulls, 0, sizeof(nulls)); @@ -1964,7 +2111,7 @@ get_available_versions_for_extension(ExtensionControlFile *pcontrol, values[0] = DirectFunctionCall1(namein, CStringGetDatum(control->name)); /* version */ - values[1] = CStringGetTextDatum(vername); + values[1] = CStringGetTextDatum(evi->name); /* superuser */ values[2] = BoolGetDatum(control->superuser); /* relocatable */ @@ -1979,27 +2126,7 @@ get_available_versions_for_extension(ExtensionControlFile *pcontrol, if (control->requires == NIL) nulls[5] = true; else - { - Datum *datums; - int ndatums; - ArrayType *a; - ListCell *lc; - - ndatums = list_length(control->requires); - datums = (Datum *) palloc(ndatums * sizeof(Datum)); - ndatums = 0; - foreach(lc, control->requires) - { - char *curreq = (char *) lfirst(lc); - - datums[ndatums++] = - DirectFunctionCall1(namein, CStringGetDatum(curreq)); - } - a = construct_array(datums, ndatums, - NAMEOID, - NAMEDATALEN, false, 'c'); - values[5] = PointerGetDatum(a); - } + values[5] = convert_requires_to_datum(control->requires); /* comment */ if (control->comment == NULL) nulls[6] = true; @@ -2007,9 +2134,75 @@ get_available_versions_for_extension(ExtensionControlFile *pcontrol, values[6] = 
CStringGetTextDatum(control->comment); tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + /* + * Find all non-directly-installable versions that would be installed + * starting from this version, and report them, inheriting the + * parameters that aren't changed in updates from this version. + */ + foreach(lc2, evi_list) + { + ExtensionVersionInfo *evi2 = (ExtensionVersionInfo *) lfirst(lc2); + List *best_path; + + if (evi2->installable) + continue; + if (find_install_path(evi_list, evi2, &best_path) == evi) + { + /* + * Fetch parameters for this version (pcontrol is not changed) + */ + control = read_extension_aux_control_file(pcontrol, evi2->name); + + /* name stays the same */ + /* version */ + values[1] = CStringGetTextDatum(evi2->name); + /* superuser */ + values[2] = BoolGetDatum(control->superuser); + /* relocatable */ + values[3] = BoolGetDatum(control->relocatable); + /* schema stays the same */ + /* requires */ + if (control->requires == NIL) + nulls[5] = true; + else + { + values[5] = convert_requires_to_datum(control->requires); + nulls[5] = false; + } + /* comment stays the same */ + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + } } +} - FreeDir(dir); +/* + * Convert a list of extension names to a name[] Datum + */ +static Datum +convert_requires_to_datum(List *requires) +{ + Datum *datums; + int ndatums; + ArrayType *a; + ListCell *lc; + + ndatums = list_length(requires); + datums = (Datum *) palloc(ndatums * sizeof(Datum)); + ndatums = 0; + foreach(lc, requires) + { + char *curreq = (char *) lfirst(lc); + + datums[ndatums++] = + DirectFunctionCall1(namein, CStringGetDatum(curreq)); + } + a = construct_array(datums, ndatums, + NAMEOID, + NAMEDATALEN, false, 'c'); + return PointerGetDatum(a); } /* @@ -2081,7 +2274,7 @@ pg_extension_update_paths(PG_FUNCTION_ARGS) continue; /* Find shortest path from evi1 to evi2 */ - path = find_update_path(evi_list, evi1, evi2, true); + path = find_update_path(evi_list, evi1, evi2, false, true); /* Emit result row */ memset(values, 0, sizeof(values)); @@ -2134,7 +2327,7 @@ Datum pg_extension_config_dump(PG_FUNCTION_ARGS) { Oid tableoid = PG_GETARG_OID(0); - text *wherecond = PG_GETARG_TEXT_P(1); + text *wherecond = PG_GETARG_TEXT_PP(1); char *tablename; Relation extRel; ScanKeyData key[1]; @@ -2301,8 +2494,7 @@ pg_extension_config_dump(PG_FUNCTION_ARGS) extTup = heap_modify_tuple(extTup, RelationGetDescr(extRel), repl_val, repl_null, repl_repl); - simple_heap_update(extRel, &extTup->t_self, extTup); - CatalogUpdateIndexes(extRel, extTup); + CatalogTupleUpdate(extRel, &extTup->t_self, extTup); systable_endscan(extScan); @@ -2479,8 +2671,7 @@ extension_config_remove(Oid extensionoid, Oid tableoid) extTup = heap_modify_tuple(extTup, RelationGetDescr(extRel), repl_val, repl_null, repl_repl); - simple_heap_update(extRel, &extTup->t_self, extTup); - CatalogUpdateIndexes(extRel, extTup); + CatalogTupleUpdate(extRel, &extTup->t_self, extTup); systable_endscan(extScan); @@ -2491,9 +2682,8 @@ extension_config_remove(Oid extensionoid, Oid tableoid) * Execute ALTER EXTENSION SET SCHEMA */ ObjectAddress -AlterExtensionNamespace(List *names, const char *newschema, Oid *oldschema) +AlterExtensionNamespace(const char *extensionName, const char *newschema, Oid *oldschema) { - char *extensionName; Oid extensionOid; Oid nspOid; Oid oldNspOid = InvalidOid; @@ -2509,12 +2699,6 @@ AlterExtensionNamespace(List *names, const char *newschema, Oid *oldschema) ObjectAddresses *objsMoved; ObjectAddress extAddr; - if (list_length(names) != 1) - 
ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("extension name cannot be qualified"))); - extensionName = strVal(linitial(names)); - extensionOid = get_extension_oid(extensionName, false); nspOid = LookupCreationNamespace(newschema); @@ -2660,8 +2844,7 @@ AlterExtensionNamespace(List *names, const char *newschema, Oid *oldschema) /* Now adjust pg_extension.extnamespace */ extForm->extnamespace = nspOid; - simple_heap_update(extRel, &extTup->t_self, extTup); - CatalogUpdateIndexes(extRel, extTup); + CatalogTupleUpdate(extRel, &extTup->t_self, extTup); heap_close(extRel, RowExclusiveLock); @@ -2680,7 +2863,7 @@ AlterExtensionNamespace(List *names, const char *newschema, Oid *oldschema) * Execute ALTER EXTENSION UPDATE */ ObjectAddress -ExecAlterExtensionStmt(AlterExtensionStmt *stmt) +ExecAlterExtensionStmt(ParseState *pstate, AlterExtensionStmt *stmt) { DefElem *d_new_version = NULL; char *versionName; @@ -2766,7 +2949,8 @@ ExecAlterExtensionStmt(AlterExtensionStmt *stmt) if (d_new_version) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); d_new_version = defel; } else @@ -2812,7 +2996,8 @@ ExecAlterExtensionStmt(AlterExtensionStmt *stmt) * time */ ApplyExtensionUpdates(extensionOid, control, - oldVersionName, updateVersions); + oldVersionName, updateVersions, + NULL, false, false); ObjectAddressSet(address, ExtensionRelationId, extensionOid); @@ -2831,7 +3016,10 @@ static void ApplyExtensionUpdates(Oid extensionOid, ExtensionControlFile *pcontrol, const char *initialVersion, - List *updateVersions) + List *updateVersions, + char *origSchemaName, + bool cascade, + bool is_create) { const char *oldVersionName = initialVersion; ListCell *lcv; @@ -2902,16 +3090,16 @@ ApplyExtensionUpdates(Oid extensionOid, extTup = heap_modify_tuple(extTup, RelationGetDescr(extRel), values, nulls, repl); - simple_heap_update(extRel, &extTup->t_self, extTup); - CatalogUpdateIndexes(extRel, extTup); + CatalogTupleUpdate(extRel, &extTup->t_self, extTup); systable_endscan(extScan); heap_close(extRel, RowExclusiveLock); /* - * Look up the prerequisite extensions for this version, and build - * lists of their OIDs and the OIDs of their target schemas. + * Look up the prerequisite extensions for this version, install them + * if necessary, and build lists of their OIDs and the OIDs of their + * target schemas. */ requiredExtensions = NIL; requiredSchemas = NIL; @@ -2921,16 +3109,12 @@ ApplyExtensionUpdates(Oid extensionOid, Oid reqext; Oid reqschema; - /* - * We intentionally don't use get_extension_oid's default error - * message here, because it would be confusing in this context. - */ - reqext = get_extension_oid(curreq, true); - if (!OidIsValid(reqext)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("required extension \"%s\" is not installed", - curreq))); + reqext = get_required_extension(curreq, + control->name, + origSchemaName, + cascade, + NIL, + is_create); reqschema = get_extension_schema(reqext); requiredExtensions = lappend_oid(requiredExtensions, reqext); requiredSchemas = lappend_oid(requiredSchemas, reqschema); @@ -3010,7 +3194,7 @@ ExecAlterExtensionContentsStmt(AlterExtensionContentsStmt *stmt, * does not exist, and will also acquire a lock on the object to guard * against concurrent DROP and ALTER EXTENSION ADD/DROP operations. 
*/ - object = get_object_address(stmt->objtype, stmt->objname, stmt->objargs, + object = get_object_address(stmt->objtype, stmt->object, &relation, ShareUpdateExclusiveLock, false); Assert(object.objectSubId == 0); @@ -3019,7 +3203,7 @@ ExecAlterExtensionContentsStmt(AlterExtensionContentsStmt *stmt, /* Permission check: must own target object, too */ check_object_ownership(GetUserId(), stmt->objtype, object, - stmt->objname, stmt->objargs, relation); + stmt->object, relation); /* * Check existing extension membership. @@ -3055,6 +3239,16 @@ ExecAlterExtensionContentsStmt(AlterExtensionContentsStmt *stmt, * OK, add the dependency. */ recordDependencyOn(&object, &extension, DEPENDENCY_EXTENSION); + + /* + * Also record the initial ACL on the object, if any. + * + * Note that this will handle the object's ACLs, as well as any ACLs + * on object subIds. (In other words, when the object is a table, + * this will record the table's ACL and the ACLs for the columns on + * the table, if any). + */ + recordExtObjInitPriv(object.objectId, object.classId); } else { @@ -3082,6 +3276,16 @@ ExecAlterExtensionContentsStmt(AlterExtensionContentsStmt *stmt, */ if (object.classId == RelationRelationId) extension_config_remove(extension.objectId, object.objectId); + + /* + * Remove all the initial ACLs, if any. + * + * Note that this will remove the object's ACLs, as well as any ACLs + * on object subIds. (In other words, when the object is a table, + * this will remove the table's ACL and the ACLs for the columns on + * the table, if any). + */ + removeExtObjInitPriv(object.objectId, object.classId); } InvokeObjectPostAlterHook(ExtensionRelationId, extension.objectId, 0); diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 6963855373..554656b6ec 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -3,7 +3,7 @@ * foreigncmds.c * foreign-data wrapper/server creation/manipulation commands * - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * * * IDENTIFICATION @@ -256,8 +256,7 @@ AlterForeignDataWrapperOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerI tup = heap_modify_tuple(tup, RelationGetDescr(rel), repl_val, repl_null, repl_repl); - simple_heap_update(rel, &tup->t_self, tup); - CatalogUpdateIndexes(rel, tup); + CatalogTupleUpdate(rel, &tup->t_self, tup); /* Update owner dependency reference */ changeDependencyOnOwner(ForeignDataWrapperRelationId, @@ -397,8 +396,7 @@ AlterForeignServerOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerId) tup = heap_modify_tuple(tup, RelationGetDescr(rel), repl_val, repl_null, repl_repl); - simple_heap_update(rel, &tup->t_self, tup); - CatalogUpdateIndexes(rel, tup); + CatalogTupleUpdate(rel, &tup->t_self, tup); /* Update owner dependency reference */ changeDependencyOnOwner(ForeignServerRelationId, HeapTupleGetOid(tup), @@ -629,8 +627,7 @@ CreateForeignDataWrapper(CreateFdwStmt *stmt) tuple = heap_form_tuple(rel->rd_att, values, nulls); - fdwId = simple_heap_insert(rel, tuple); - CatalogUpdateIndexes(rel, tuple); + fdwId = CatalogTupleInsert(rel, tuple); heap_freetuple(tuple); @@ -786,8 +783,7 @@ AlterForeignDataWrapper(AlterFdwStmt *stmt) tp = heap_modify_tuple(tp, RelationGetDescr(rel), repl_val, repl_null, repl_repl); - simple_heap_update(rel, &tp->t_self, tp); - CatalogUpdateIndexes(rel, tp); + CatalogTupleUpdate(rel, &tp->t_self, tp); heap_freetuple(tp); @@ -850,7 +846,7 @@ 
RemoveForeignDataWrapperById(Oid fdwId) if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for foreign-data wrapper %u", fdwId); - simple_heap_delete(rel, &tp->t_self); + CatalogTupleDelete(rel, &tp->t_self); ReleaseSysCache(tp); @@ -882,13 +878,26 @@ CreateForeignServer(CreateForeignServerStmt *stmt) ownerId = GetUserId(); /* - * Check that there is no other foreign server by this name. + * Check that there is no other foreign server by this name. Do nothing if + * IF NOT EXISTS was enforced. */ if (GetForeignServerByName(stmt->servername, true) != NULL) - ereport(ERROR, - (errcode(E |
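The final hunk begins reworking CreateForeignServer() around the new comment's IF NOT EXISTS behavior. As an illustration of the pattern that comment describes (the exact message wording and the stmt->if_not_exists field are assumptions here, not shown in this diff), a duplicate server name is reported as a NOTICE and creation is skipped, instead of raising an error:

    /* Illustrative sketch of IF NOT EXISTS handling inside
     * CreateForeignServer(); fields and messages are assumptions. */
    if (GetForeignServerByName(stmt->servername, true) != NULL)
    {
        if (stmt->if_not_exists)
        {
            ereport(NOTICE,
                    (errcode(ERRCODE_DUPLICATE_OBJECT),
                     errmsg("server \"%s\" already exists, skipping",
                            stmt->servername)));
            return InvalidObjectAddress;    /* do nothing, per the comment */
        }

        ereport(ERROR,
                (errcode(ERRCODE_DUPLICATE_OBJECT),
                 errmsg("server \"%s\" already exists",
                        stmt->servername)));
    }

Returning InvalidObjectAddress signals the caller that no object was created, which keeps event triggers and command collection from firing for the skipped statement.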