diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000..5ad68edc33
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,11 @@
+# Don't send unneeded content to the Docker daemon when building
+img
+travis
+.git
+.travis.yml
+
+*.gcno
+*.gcda
+*.gcov
+*.so
+*.o
diff --git a/.gitignore b/.gitignore
index 22aedb5513..a64cea1abf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,11 +4,13 @@
 results
 __pycache__
 *.pyc
+rum--*.sql
 tmp_install
+log

 # virtualenv
 bin
 include
 lib
 pip-selfcheck.json
-
+regression*
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000..0c21a422c2
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,37 @@
+os: linux
+
+dist: jammy
+
+language: c
+
+services:
+  - docker
+
+before_install:
+  - cp travis/* .
+
+install:
+  - ./mk_dockerfile.sh
+  - docker-compose build
+
+script:
+  - docker-compose run $(bash <(curl -s https://codecov.io/env)) tests
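+
+# A rough local equivalent of a single CI job, assuming the helpers copied
+# from travis/ above (mk_dockerfile.sh, docker-compose.yml) pick up the same
+# PG_VERSION/LEVEL variables as the env matrix below:
+#
+#   PG_VERSION=16 LEVEL=hardcore ./mk_dockerfile.sh
+#   docker-compose build
+#   docker-compose run tests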
+
+notifications:
+  email:
+    on_success: change
+    on_failure: always
+
+env:
+  - PG_VERSION=17
+  - PG_VERSION=17 LEVEL=hardcore
+  - PG_VERSION=16
+  - PG_VERSION=16 LEVEL=hardcore
+  - PG_VERSION=15
+  - PG_VERSION=15 LEVEL=hardcore
+  - PG_VERSION=14
+  - PG_VERSION=14 LEVEL=hardcore
+  - PG_VERSION=13
+  - PG_VERSION=13 LEVEL=hardcore
+  - PG_VERSION=12
+  - PG_VERSION=12 LEVEL=hardcore
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000..a51596793f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,11 @@
+RUM is released under the PostgreSQL License, a liberal Open Source license, similar to the BSD or MIT licenses.
+
+Portions Copyright (c) 2015-2024, Postgres Professional
+Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+Portions Copyright (c) 1994, The Regents of the University of California
+
+Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL POSTGRES PROFESSIONAL BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRES PROFESSIONAL HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+POSTGRES PROFESSIONAL SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND POSTGRES PROFESSIONAL HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
diff --git a/META.json b/META.json
new file mode 100644
index 0000000000..eb6b80e21a
--- /dev/null
+++ b/META.json
@@ -0,0 +1,56 @@
+{
+   "name": "RUM",
+   "abstract": "RUM index access method",
+   "description": "RUM is an extended version of the GIN (generalized inverted index) access method. Unlike GIN, RUM stores additional information in posting lists/trees besides item pointers. For example, this additional information might be lexeme positions or timestamps. Thanks to that, RUM accelerates certain types of queries by orders of magnitude.",
+   "version": "1.1.0",
+   "maintainer": [
+      "Alexander Korotkov",
+      "Oleg Bartunov",
+      "Teodor Sigaev",
+      "Arthur Zakirov"
+   ],
+   "license": {
+      "PostgreSQL": "http://www.postgresql.org/about/licence"
+   },
+   "prereqs": {
+      "runtime": {
+         "requires": {
+            "PostgreSQL": "9.6.0"
+         },
+         "recommends": {
+            "PostgreSQL": "9.6.5"
+         }
+      }
+   },
+   "provides": {
+      "rum": {
+         "file": "rum--1.1.sql",
+         "docfile": "README.md",
+         "version": "1.1.0",
+         "abstract": "RUM index access method"
+      }
+   },
+   "resources": {
+      "homepage": "https://github.com/postgrespro/rum",
+      "bugtracker": {
+         "web": "https://github.com/postgrespro/rum/issues"
+      },
+      "repository": {
+         "url": "https://github.com/postgrespro/rum.git",
+         "web": "https://github.com/postgrespro/rum",
+         "type": "git"
+      }
+   },
+   "generated_by": "Alexander Korotkov",
+   "meta-spec": {
+      "version": "1.0.0",
+      "url": "http://pgxn.org/meta/spec.txt"
+   },
+   "tags": [
+      "index",
+      "search",
+      "GIN",
+      "full text",
+      "additional information"
+   ]
+}
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 8c73a0b439..a8d510019d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,49 @@
 # contrib/rum/Makefile

 MODULE_big = rum
+EXTENSION = rum
+EXTVERSION = 1.3
+PGFILEDESC = "RUM index access method"
+
 OBJS = src/rumsort.o src/rum_ts_utils.o src/rumtsquery.o \
 	src/rumbtree.o src/rumbulk.o src/rumdatapage.o \
 	src/rumentrypage.o src/rumget.o src/ruminsert.o \
 	src/rumscan.o src/rumutil.o src/rumvacuum.o src/rumvalidate.o \
-	src/rum_timestamp.o $(WIN32RES)
+	src/btree_rum.o src/rum_arr_utils.o $(WIN32RES)

-EXTENSION = rum
-DATA = rum--1.0.sql
-PGFILEDESC = "RUM index access method"
-INCLUDES = src/rum.h src/rumsort.h
+DATA_updates = rum--1.0--1.1.sql rum--1.1--1.2.sql \
+	rum--1.2--1.3.sql
+
+DATA_built = $(EXTENSION)--$(EXTVERSION).sql
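+
+# Note: DATA_built ($(EXTENSION)--$(EXTVERSION).sql) is generated from
+# rum_init.sql by a rule below, while the rum--X--Y.sql scripts listed in
+# DATA_updates upgrade an already-installed extension, e.g.:
+#
+#   ALTER EXTENSION rum UPDATE TO '1.3';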

-REGRESS = rum rum_hash ruminv timestamp orderby orderby_hash altorder \
-	altorder_hash limits
+INCLUDES = rum.h rumsort.h
+RELATIVE_INCLUDES = $(addprefix src/, $(INCLUDES))

 LDFLAGS_SL += $(filter -lm, $(LIBS))

+REGRESS = security rum rum_validate rum_hash ruminv timestamp orderby orderby_hash \
+	altorder altorder_hash limits \
+	int2 int4 int8 float4 float8 money oid \
+	time timetz date interval \
+	macaddr inet cidr text varchar char bytea bit varbit \
+	numeric rum_weight expr
+
+TAP_TESTS = 1
+
+ISOLATION = predicate-rum predicate-rum-2
+ISOLATION_OPTS = --load-extension=rum
+
+EXTRA_CLEAN = pglist_tmp
+
 ifdef USE_PGXS
+
+# We cannot run isolation test for versions 12,13 in PGXS case
+# because 'pg_isolation_regress' is not copied to install
+# directory, see src/test/isolation/Makefile
+ifeq ($(MAJORVERSION),$(filter 12% 13%,$(MAJORVERSION)))
+undefine ISOLATION
+undefine ISOLATION_OPTS
+endif
+
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
@@ -28,10 +54,49 @@ include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif

+$(EXTENSION)--$(EXTVERSION).sql: rum_init.sql
+	cat $^ > $@
+
+ifeq ($(MAJORVERSION), 9.6)
+# arrays are not supported on 9.6
+else
+REGRESS += array
+endif
+
+# For 9.6-11 we have to make specific target with tap tests
+ifeq ($(MAJORVERSION), $(filter 9.6% 10% 11%, $(MAJORVERSION)))
 wal-check: temp-install
 	$(prove_check)

+check: wal-check
+endif
+
+#
+# Make conditional targets to save backward compatibility with PG11, PG10 and PG9.6.
+#
+ifeq ($(MAJORVERSION), $(filter 9.6% 10% 11%, $(MAJORVERSION)))
+install: installincludes
+
 installincludes:
-	$(INSTALL_DATA) $(addprefix $(srcdir)/, $(INCLUDES)) '$(DESTDIR)$(includedir_server)/'
+	$(INSTALL) -d '$(DESTDIR)$(includedir_server)/'
+	$(INSTALL_DATA) $(addprefix $(srcdir)/, $(RELATIVE_INCLUDES)) '$(DESTDIR)$(includedir_server)/'
+
+uninstall: uninstallincludes
+
+uninstallincludes:
+	rm -f $(addprefix '$(DESTDIR)$(includedir_server)/', $(INCLUDES))
+
+ISOLATIONCHECKS= predicate-rum predicate-rum-2
+
+submake-isolation:
+	$(MAKE) -C $(top_builddir)/src/test/isolation all
+
+submake-rum:
+	$(MAKE) -C $(top_builddir)/contrib/rum
+
+isolationcheck: | submake-isolation submake-rum temp-install
+	$(pg_isolation_regress_check) \
+		--temp-config $(top_srcdir)/contrib/rum/logical.conf \
+		$(ISOLATIONCHECKS)
+endif
\ No newline at end of file
diff --git a/README.md b/README.md
index 6971b465e6..b6fb08420c 100644
--- a/README.md
+++ b/README.md
@@ -1,56 +1,88 @@
+[![Build Status](https://api.travis-ci.com/postgrespro/rum.svg?branch=master)](https://travis-ci.com/postgrespro/rum)
+[![PGXN version](https://badge.fury.io/pg/rum.svg)](https://badge.fury.io/pg/rum)
+[![GitHub license](https://img.shields.io/badge/license-PostgreSQL-blue.svg)](https://raw.githubusercontent.com/postgrespro/rum/master/LICENSE)
+
 [![Postgres Professional](img/PGpro-logo.png)](https://postgrespro.com/)

 # RUM - RUM access method

 ## Introduction

-The **rum** module provides access method to work with `RUM` index. It is based
-on the `GIN` access methods code.
+The **rum** module provides an access method to work with a `RUM` index. It is based
+on the `GIN` access method's code.

-`GIN` index allows to perform fast full text search using `tsvector` and
-`tsquery` types. But full text search with GIN index has several problems:
+A `GIN` index allows performing fast full-text search using `tsvector` and
+`tsquery` types. But full-text search with a GIN index has several problems:

-- Slow ranking. It is need position information about lexems to ranking. `GIN`
-index doesn't store positions of lexems. So after index scan we need additional
-heap scan to retreive lexems positions.
-- Slow phrase search with `GIN` index. This problem relates with previous
-problem. It is need position information to perform phrase search.
-- Slow ordering by timestamp. `GIN` index can't store some related information
-in index with lexemes. So it is necessary to perform additional heap scan.
+- Slow ranking. Ranking needs positional information about lexemes, and a `GIN`
+index doesn't store positions of lexemes. So after the index scan we need an
+additional heap scan to retrieve lexeme positions.
+- Slow phrase search with a `GIN` index. This problem is related to the previous
+one: phrase search also needs positional information.
+- Slow ordering by timestamp. A `GIN` index can't store related information
+in the index together with lexemes. So it is necessary to perform an additional heap scan.

-`RUM` solves this problems by storing additional information in posting tree.
+`RUM` solves these problems by storing additional information in a posting tree.
 For example, positional information of lexemes or timestamps. You can get an
-idea of `RUM` by the following picture:
+idea of `RUM` with the following diagram:

 ![How RUM stores additional information](img/gin_rum.png)

-Drawback of `RUM` is that it has slower build and insert time than `GIN`.
-It is because we need to store additional information besides keys and because
-`RUM` uses generic WAL.
+A drawback of `RUM` is that it has slower build and insert times than `GIN`.
+This is because we need to store additional information besides keys and because
+`RUM` uses generic Write-Ahead Log (WAL) records.

 ## License

-This module available under the same license as
+This module is available under a [license](LICENSE) similar to that of
 [PostgreSQL](http://www.postgresql.org/about/licence/).

 ## Installation

-Before build and install **rum** you should ensure following:
+Before building and installing **rum**, you should ensure the following:

 * PostgreSQL version is 9.6+.

 Typical installation procedure may look like this:

+### Using GitHub repository
+
     $ git clone https://github.com/postgrespro/rum
     $ cd rum
     $ make USE_PGXS=1
-    $ sudo make USE_PGXS=1 install
+    $ make USE_PGXS=1 install
     $ make USE_PGXS=1 installcheck
     $ psql DB -c "CREATE EXTENSION rum;"

+### Using PGXN
+
+    $ USE_PGXS=1 pgxn install rum
+
+> **Important:** Don't forget to set the `PG_CONFIG` variable in case you want to test `RUM` on a custom build of PostgreSQL. Read more [here](https://wiki.postgresql.org/wiki/Building_and_Installing_PostgreSQL_Extension_Modules).
+
+## Tests
+
+    $ make check
+
+This command runs:
+- regression tests;
+- isolation tests;
+- TAP tests.
+
+One of the TAP tests downloads a 1GB archive and then unpacks it
+into a file weighing almost 3GB. It is disabled by default.
+
+To run this test, you need to set an environment variable:
+
+    $ export PG_TEST_EXTRA=big_values
+
+The way to turn it off again:
+
+    $ export -n PG_TEST_EXTRA
+
 ## Common operators and functions

-**rum** module provides next operators.
+The **rum** module provides the following operators.

 | Operator | Returns | Description
 | -------------------- | ------- | ----------------------------------------------
@@ -59,16 +91,19 @@ Typical installation procedure may look like this:
 | timestamp <=| timestamp | float8 | Returns distance only for left timestamps.
 | timestamp |=> timestamp | float8 | Returns distance only for right timestamps.

+The last three operators also work for the types timestamptz, int2, int4, int8, float4, float8,
+money and oid.
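+
+For illustration, here is a hand-run sketch of the distance operators (the
+literals are invented; distances are in seconds, as in the examples below):
+
+```sql
+-- <=> returns the absolute distance between its arguments:
+SELECT '2016-05-16 14:21:25'::timestamp <=> '2016-05-16 14:21:22'::timestamp;
+-- 3
+
+-- <=| returns the distance only when the left argument does not exceed the
+-- right one (|=> is the mirror case); for arguments on the wrong side the
+-- result is effectively infinite, so such rows sort last in ORDER BY.
+SELECT '2016-05-16 14:21:22'::timestamp <=| '2016-05-16 14:21:25'::timestamp;
+-- 3
+```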
+
 ## Operator classes

-**rum** provides next operator classes.
+**rum** provides the following operator classes.

 ### rum_tsvector_ops

 For type: `tsvector`

-This operator class stores `tsvector` lexemes with positional information. Supports
-ordering by `<=>` operator and prefix search. There is the example.
+This operator class stores `tsvector` lexemes with positional information. It supports
+ordering by the `<=>` operator and prefix search. See the example below.

 Let us assume we have the table:
@@ -103,21 +138,21 @@ SELECT t, a <=> to_tsquery('english', 'beautiful | place') AS rank
 FROM test_rum
 WHERE a @@ to_tsquery('english', 'beautiful | place')
 ORDER BY a <=> to_tsquery('english', 'beautiful | place');
-                t                 |   rank
-----------------------------------+-----------
- The situation is most beautiful  | 0.0303964
- It is a beautiful                | 0.0303964
- It looks like a beautiful place  | 0.0607927
+                t                |  rank
+---------------------------------+---------
+ It looks like a beautiful place | 8.22467
+ The situation is most beautiful | 16.4493
+ It is a beautiful               | 16.4493
 (3 rows)

 SELECT t, a <=> to_tsquery('english', 'place | situation') AS rank
 FROM test_rum
 WHERE a @@ to_tsquery('english', 'place | situation')
 ORDER BY a <=> to_tsquery('english', 'place | situation');
-                t                 |   rank
-----------------------------------+-----------
- The situation is most beautiful  | 0.0303964
- It looks like a beautiful place  | 0.0303964
+                t                |  rank
+---------------------------------+---------
+ The situation is most beautiful | 16.4493
+ It looks like a beautiful place | 16.4493
 (2 rows)
 ```

@@ -125,38 +160,37 @@ For type: `tsvector`

-This operator class stores hash of `tsvector` lexemes with positional information.
-Supports ordering by `<=>` operator. But **doesn't** support prefix search.
-
-### rum_timestamp_ops
-
-For type: `timestamp`
+This operator class stores a hash of `tsvector` lexemes with positional information.
+It supports ordering by the `<=>` operator. It **doesn't** support prefix search.
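+
+For illustration, such an index over the `test_rum` table from the previous
+example could be created like this (the index name is invented):
+
+```sql
+CREATE INDEX test_rum_hash_idx ON test_rum USING rum (a rum_tsvector_hash_ops);
+```
+
+Storing hashes keeps index entries small for long lexemes; the trade-off is
+the missing prefix search noted above.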

-Operator class provides fast search and ordering by timestamp fields. Supports
-ordering by `<=>`, `<=|` and `|=>` operators. Can be used with
-`rum_tsvector_timestamp_ops` operator class.
+### rum_TYPE_ops

-### rum_timestamptz_ops
+For types: int2, int4, int8, float4, float8, money, oid, time, timetz, date,
+interval, macaddr, inet, cidr, text, varchar, char, bytea, bit, varbit,
+numeric, timestamp, timestamptz

-For type: `timestamptz`
+Supported operations: `<`, `<=`, `=`, `>=`, `>` for all types, and
+`<=>`, `<=|` and `|=>` for the int2, int4, int8, float4, float8, money, oid,
+timestamp and timestamptz types.

-Operator class provides fast search and ordering by timestamptz fields. Supports
-ordering by `<=>`, `<=|` and `|=>` operators. Can be used with
-`rum_tsvector_timestamptz_ops` operator class.
+This operator class supports ordering by the `<=>`, `<=|` and `|=>` operators. It can be used
+with the `rum_tsvector_addon_ops`, `rum_tsvector_hash_addon_ops` and `rum_anyarray_addon_ops`
+operator classes.
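+
+A minimal sketch (table, index and query are invented for this example; the
+class name follows the `rum_TYPE_ops` pattern, here `rum_timestamp_ops`):
+
+```sql
+CREATE TABLE events (id int4, ts timestamp);
+
+CREATE INDEX events_ts_idx ON events USING rum (ts rum_timestamp_ops);
+
+-- Both the range condition and the distance ordering can use the index:
+SELECT * FROM events
+	WHERE ts >= '2016-05-16 00:00:00'
+	ORDER BY ts <=> '2016-05-16 14:21:25' LIMIT 5;
+```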

-### rum_tsvector_timestamp_ops
+### rum_tsvector_addon_ops

 For type: `tsvector`

-This operator class stores `tsvector` lexems with timestamp field. There is the example.
+This operator class stores `tsvector` lexemes together with any field of a type
+supported by the module. See the example below.

 Let us assume we have the table:
+
 ```sql
 CREATE TABLE tsts (id int, t tsvector, d timestamp);

 \copy tsts from 'rum/data/tsts.data'

-CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_timestamp_ops, d)
+CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d)
 	WITH (attach = 'd', to = 't');
 ```

@@ -164,7 +198,7 @@ Now we can execute the following queries:
 ```sql
 EXPLAIN (costs off)
 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
-                                   QUERY PLAN
+                                    QUERY PLAN
 -----------------------------------------------------------------------------------
  Limit
    ->  Index Scan using tsts_idx on tsts
@@ -173,7 +207,7 @@ EXPLAIN (costs off)
 (4 rows)

 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
- id  |                d                |   ?column?
+ id  |                d                |   ?column?
 -----+---------------------------------+---------------
  355 | Mon May 16 14:21:22.326724 2016 |      2.673276
  354 | Mon May 16 13:21:22.326724 2016 |   3602.673276
@@ -183,31 +217,22 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY
 (5 rows)
 ```

-### rum_tsvector_timestamptz_ops
-
-For type: `tsvector`
+> **Warning:** Currently RUM has incorrect behaviour when an index is created using ordering over pass-by-reference additional information, because posting trees have a fixed-length right bound and fixed-length non-leaf posting items. Creating such indexes is not allowed.

-See comments for `rum_tsvector_timestamp_ops` operator class.
-
-### rum_tsvector_hash_timestamp_ops
+### rum_tsvector_hash_addon_ops

 For type: `tsvector`

-This operator class stores hash of `tsvector` lexems with timestamp field.
-**Doesn't** support prefix search.
-
-### rum_tsvector_hash_timestamptz_ops
+This operator class stores a hash of `tsvector` lexemes together with any field
+of a type supported by the module.

-For type: `tsvector`
-
-This operator class stores hash of `tsvector` lexems with timestamptz field.
-**Doesn't** support prefix search.
+It **doesn't** support prefix search.

 ### rum_tsquery_ops

 For type: `tsquery`

-Stores branches of query tree in additional information. For example we have the table:
+It stores branches of the query tree in additional information. For example, we have the table:

 ```sql
 CREATE TABLE query (q tsquery, tag text);

@@ -224,17 +249,62 @@ Now we can execute the following fast query:
 ```sql
 SELECT * FROM query
 	WHERE to_tsvector('black holes never exists before we think about them') @@ q;
-        q         |  tag
+        q         |  tag
 ------------------+-------
  'black'          | color
  'black' & 'hole' | color
 (2 rows)
 ```

+### rum_anyarray_ops
+
+For type: `anyarray`
+
+This operator class stores `anyarray` elements together with the length of the array.
+It supports the `&&`, `@>`, `<@`, `=` and `%` operators, and also ordering by the `<=>` operator.
+
+For example, we have the table:
+
+```sql
+CREATE TABLE test_array (i int2[]);
+
+INSERT INTO test_array VALUES ('{}'), ('{0}'), ('{1,2,3,4}'), ('{1,2,3}'), ('{1,2}'), ('{1}');
+
+CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops);
+```
+
+Now we can execute the query using an index scan:
+
+```sql
+SET enable_seqscan TO off;
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC;
+                QUERY PLAN
+------------------------------------------
+ Index Scan using idx_array on test_array
+   Index Cond: (i && '{1}'::smallint[])
+   Order By: (i <=> '{1}'::smallint[])
+(3 rows)
+
+SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC;
+     i
+-----------
+ {1}
+ {1,2}
+ {1,2,3}
+ {1,2,3,4}
+(4 rows)
+```
+
+### rum_anyarray_addon_ops
+
+For type: `anyarray`
+
+This operator class stores `anyarray` elements together with any field of a type
+supported by the module.
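+
+By analogy with `rum_tsvector_addon_ops`, an index might be declared like this
+(hypothetical table mirroring the `tsts` example above):
+
+```sql
+CREATE TABLE arr_ts (a int2[], d timestamp);
+
+CREATE INDEX arr_ts_idx ON arr_ts USING rum (a rum_anyarray_addon_ops, d)
+	WITH (attach = 'd', to = 'a');
+```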
+
 ## Todo

 - Allow multiple additional information (lexemes positions + timestamp).
-- Add support for arrays.
 - Improve ranking function to support TF/IDF.
 - Improve insert time.
 - Improve GENERIC WAL to support shift (PostgreSQL core changes).

@@ -243,6 +313,12 @@ SELECT * FROM query

 Alexander Korotkov Postgres Professional Ltd., Russia

-Oleg Bartunov Postgres Professional Ltd., Russia
+Oleg Bartunov Postgres Professional Ltd., Russia
+
+Teodor Sigaev Postgres Professional Ltd., Russia
+
+Arthur Zakirov Postgres Professional Ltd., Russia
+
+Pavel Borisov Postgres Professional Ltd., Russia

-Teodor Sigaev Postgres Professional Ltd., Russia
+Maxim Orlov Postgres Professional Ltd., Russia
diff --git a/TODO b/TODO
index 848f55d0b2..f128bd54c4 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,10 @@
-1. with naturalOrder=true make scan the rest to be consistent with seqscan
-2. add leftlink to data page to privide backward scan on index (<=| op)
+1. with naturalOrder=true make scan the rest to be consistent with seqscan [done]
+2. add leftlink to data page to provide backward scan on index (<=| op) [done]
 3. Compression of ItemPointer for use_alternative_order
 4. Compression addInfo
-5. Remove FROM_STRATEGY ugly magick
+5. Remove FROM_STRATEGY ugly magic [done]
+
+BTREE:
+1 fix changes in rum--1.0.sql [done]
+2 adding using as addinfo
diff --git a/data/rum_array.data b/data/rum_array.data
new file mode 100644
index 0000000000..b3903d0f33
--- /dev/null
+++ b/data/rum_array.data
@@ -0,0 +1,7000 @@
+{18,31,54,95} +{23,50,13,9,39} +{99,54,77} +{79,83,16,63,32} +{52,41,61,79,94,87} +{76,59,39,36,21} +{} +{41,79,76,96,3} +{25,59,5,96,32} +{92,58,12,57} +{24,48,41,88} +{39,5,17} +{10,41,78,25,35} +{31,89,4} +{68,74,94} +{97,78,44,68,81,16} +{87,76} +{30,81} +{72,20,99,26} +{87,90,98,40,44} +{24,99,66,61} +{79,8,48,16} +{62,99,48,80,75,39} +{10,60,35,15} +{45,71,10,97,56} +{64,79,19,31} +{30,57,42,31,45} +{61,42,14,26} +{12,38,65,36,56,36} +{17,62,18,56} +{84,85,90,60,55,17} +{27,11,82,20,43} +{14,27,18,48,39,51} +{53,13,52} +{56,35,81,60,27} +{79,89,89,7} +{65,17,31,17,29,85} +{21,3} +{53,55,16,83,4} +{62,3,63} +{73,40,99} +{23,80} +{2,74,42,37,21} +{12,16} +{80,60} +{19,62,34} +{38,19,31,6,15,2} +{63,96,64,4,36,15} +{9,3} +{91,87,15,18,7,66} +{17,10} +{77,96} +{11,43,31,2,89} +{17,77,89,50} +{24,6,61,88,51} +{61,50,59,90,5,89} +{58,1,39,48} +{78,36,70,92} +{43,3,22,95,51} +{} +{88,64,25,64,86} +{34,6,49,90,25} +{86,35,13,22} +{21,44,83} +{42,88,72,65,59,96} +{36,33,1,98} +{16,54} +{35,16,44} +{73,23,20} +{84,25,1,52,35} +{27,36,54,87,31} +{38,47,83,3} +{64,13} +{65,84,85,16,22} +{57,9,39,73} +{89,11,67,55,73} +{78,39,84,63,62,45} +{50,63,8} +{} +{96,36,58,65,96} +{59,86,41,30} +{90,60,39,47,19} +{70,100,73,99} +{} +{85,14,39} +{76,53} +{96,38,52,13,87,85} +{97,51,15,30,53,87} +{30,59,9,40,13} +{31,91,68,79} +{37,56,39,78,75} +{82,2,47} +{33,25,45,40} +{51,21,92,20,18,76} +{84,93,36,95,34,69} +{66,25,5,40} +{77,6,57,42} +{} +{88,81,85,37,12} +{56,73,38} +{70,70,6,19} +{82,54,91} +{75,8} +{45,33,64,90,95} +{8,71,66,12} +{56,26,68,94} +{70,77,4,96,62,83} +{23,87} +{34,34,4,33} +{28,84} +{78,75,77} +{88,53} +{27,38} +{2,2,82} +{30,52,88,61,33} +{29,72,94,68} +{85,72} +{88,4} +{63,90,43,66,24,33} +{88,48,47} +{3,11,98,37,61} +{45,65,63,15,38} +{79,45,56,94} +{56,74,78,19,76} +{24,81,64,13,100} +{93,27,63,71,27,3} +{74,13,85,86,32,60} +{98,40,63,13} +{41,95,19,93,17,84} +{90,28,100,100,19,2} +{35,15,54} +{29,81,77} +{54,64,63,12,18} +{38,43,85,21,35} +{84,28,27,4,80,27} +{80,77,55,98} +{13,71,48,55,89,38} +{58,43,27,5,57} +{5,33,96,6} +{73,93,87,69,100,24} +{58,96,38,85,55,51} +{37,30,88,4,8,59} +{24,68,43,48,18,84} +{23,100,82,30,42} +{23,36,16,99,27} +{41,75} +{66,41,10,37,16,6} +{54,49,60} +{4,56,44,72,40} +{71,96,67,100,59} +{7,41} +{8,3,27} +{38,69,47,68,5,24} +{43,100,59,62} +{92,14,34,5,71,48} +{72,5,91,29,99,36} +{62,71,37,80,62,50} +{32,45,17} +{89,68} +{52,17,55} +{21,47,15,92} +{36,100,5} +{14,76,59,11,15} +{59,72} +{37,55,89,49} +{87,79,96,20,93} +{6,44} +{32,46,25} +{27,47,76,4,54} +{2,16} +{90,36} +{11,19,27,79} +{54,4} +{72,88} +{14,85,71,69,5,22} +{31,48} +{28,35,18} +{77,55,100,73,57,62} +{} +{14,59,53} +{98,3} +{13,56} +{26,61,88,54,88,33} +{70,12} +{55,16,15,42,76} +{13,75} +{97,38,82,51,86,53} +{41,76,39,84,32} +{94,66,47} +{55,28} +{} +{94,65,59,20} +{55,50,56,14,58} +{14,94,52,25,69,95} +{20,96} +{37,38} +{26,35,9,98,74} +{11,9,41,79} +{36,57,87,69,92,89} +{11,39,60,4,47,3} +{97,5} +{16,58,38,98,42} +{46,69} +{35,54} +{36,79,54} +{} +{63,78} +{12,86,52,29,60,30} +{29,27,58,86,42,62} +{42,12,60} +{90,93,85,29} +{16,8,45} +{29,33,85} +{32,14,6,47,74} +{14,85,14,26,3} +{46,71,10,16} +{30,63} +{} +{91,30,56} +{46,36,68,91,36,88} +{24,61} +{66,21,80,14} +{43,63,50,21,11} +{38,46,18,51} +{38,28,70}
+{17,41,76,1,30} +{47,63} +{56,80,85,1,7,97} +{75,5,79,32} +{5,17,66,51,68} +{6,83,2} +{25,40,79,84} +{58,38,12,68} +{55,86,20,67,27} +{58,64} +{14,51} +{12,86,57,68} +{61,91,65,3,83,68} +{40,31,82,21} +\N +{24,64,35,32} +{32,83,18,27,43,32} +{50,83} +{94,84,58,3,25,79} +{66,2,27,36,24} +{71,34} +{17,57} +{22,40,49,50,10} +{79,62,94,78} +{92,79,24,72} +{23,41} +{69,60,77,70,18,48} +{39,45,91,85} +{27,43,22,21,85} +{84,51,96,7,18} +{100,38,69,93,66,39} +{73,42,35,15,69,98} +{100,17,37,15,40} +{1,91,2,17,90,48} +{18,12,52,24} +{39,43,89} +{16,13,88} +{69,8,75} +{34,91,54,81} +{37,68,89,1,56} +{81,83,39,36,14} +{12,15,2} +{14,16,88,43} +{59,12} +{1,62,21,94} +{29,43,70,52,93} +{29,36,56,78} +{91,56,86,89,53} +{14,83,39,94} +{29,58,72,4,45} +{76,56,84,28,58} +{4,52,6,88,43,17} +{21,1,35,62,77,6} +{78,74} +{1,20,93,43} +\N +{30,100,35,94,74,64} +{81,3,21,4} +{9,19,33} +{28,62,40,64,26} +{69,72,26,30,90} +{52,70,78,43} +{91,58,33,22,92,26} +{98,36,96,94,66} +{86,43,82} +{93,52,4,58,51} +\N +{49,61,80,79,90} +{50,81,72} +{57,29} +{54,31,36} +{52,31,6,48,2} +{4,51,37,83,17} +{60,20,94,82,18} +{52,64,26,81,69,61} +{39,8,22,2,8} +{31,25,95,99} +{11,72,30,95,20,28} +{78,87} +{21,40,98,41,73,33} +{67,88,42,62,11,47} +{85,1} +{4,68,100,72,24} +{82,43} +{97,55,47,52} +{51,52} +{20,21} +{69,46,34,59,54,61} +{9,31,43} +{68,20} +{73,63} +{71,12,93,8,48,10} +{44,46,42,91,21} +{98,52} +{45,60} +{95,38,30,3} +{27,77,2,46,53,18} +{99,5} +{79,33,34,48,82} +{3,29,82,72,35} +{73,75,83} +{25,43,37,26} +\N +{51,95,40} +{18,23,10,90,15,20} +{85,66} +{25,76,22,87,88,18} +{92,4} +{27,51} +{25,77,12,37} +{44,52,69,39,21,63} +{94,30,74,36} +{60,18} +{62,88,94,93,26} +{5,72,96,25} +{99,1,85,98,85,70} +{33,21,37,19} +{44,78} +{47,2,73,32,3} +{91,35,10,81} +{80,64,7,45,84} +{86,16,96,8,88} +{32,29,84,81,30,8} +{51,28,6,16} +{88,51,50,54,56,96} +{79,19,41,40} +{40,26,10,26,2} +{60,34,3,29} +{68,80,70,56} +{60,23,39} +{50,69,6,71,70,25} +{98,53,94,14,45,11} +{98,39,64,89,98,32} +\N +{45,5,15,23,41,63} +{54,31,55,58,32} +{36,56} +{38,78,65,4,75,38} +\N +{40,6,93,40,13,59} +{42,50,10,65,96} +{6,94,49} +{63,44,36,55} +{40,79} +{39,75,27} +{8,31} +{81,75} +{99,82,85,34,24,89} +{86,82,20} +{63,96} +{47,83,29} +{70,46,48} +{44,11} +{94,19,84,79,77,22} +{68,47,100,48,65,77} +\N +{76,12,86,58} +{13,14,79,61,12} +{68,65,16,93,89} +{95,18,29,89,92,43} +{19,12,50,47} +{82,93,85} +{71,40,85} +{95,96,100,86} +{2,40,71,36,25} +{11,95,25} +{79,46,41,35,39} +\N +\N +{88,29} +{54,14,94,88} +{59,67,81,41} +{46,68,78,56,47,30} +{5,76,87} +{23,89,47,46} +{47,98,14,31,1,60} +{32,14,96,61,37} +{79,66,93} +{98,1,77,44} +{21,68,2,31,17} +{94,23,15} +{48,47,57,94,49,71} +{54,3} +{99,40,81,86,81} +{85,12,98,81,5} +{60,41,21} +{38,82,55,41,96} +{11,98,12,69,93} +{11,70,66,44} +{23,92,80} +{10,8,43,97} +{17,30} +{78,56,58} +{84,87,84} +{12,32,7,58,47,48} +{29,46} +{87,34} +{59,30,72,85,71} +{67,48,83,98} +{35,10,73,71,1,77} +{21,51,16,60,64,12} +{36,61} +{54,98} +{44,74,84} +{83,14} +{71,52,48,48,15,92} +{79,78,98,35} +{52,29,47,86,96} +{10,37} +{21,25} +{57,22,28,30,97,47} +{15,28} +{88,24,98,45,65} +{78,42} +{36,70} +{40,48} +{72,3,78,69,57,33} +\N +{21,96,16,21,75,23} +{55,5,72,45} +{99,2,72,29} +{48,17} +{84,84,40,1,59} +{34,11} +{34,80,45,31} +{56,82,25,65,22,64} +{10,4,55} +{74,67,42,74,80} +{84,22,42,6,87,30} +{6,51,89,2,84,78} +{19,95,93,87,8} +{45,84,25} +{7,12,16,92} +{89,82,16} +{22,64} +{16,31,49,48,45,14} +{69,64,19,14,39,8} +{40,96,26,48,65} +{17,45,4,57} +{73,8} +{85,89,1,15,74,51} +\N +{57,89} +{25,12,55} +{39,62,35} +{85,88,71,98,83} 
+{64,63,75,72} +{100,40,38,1} +{2,44} +{13,46,59,43} +{87,9,93,50} +{77,7,11,30} +{61,11,18} +{19,25,68,83} +{67,25} +{54,18,85} +{96,81,38,11,3} +{87,32,47,79,62,56} +{42,49} +{41,65,24,13,79,75} +{85,32,96} +\N +{3,63,47,84,67,13} +{53,57,59,61} +{95,27,8,89,35} +{76,78,76,76,14,37} +{31,62,65} +{97,57,60,80} +{18,81,93,67} +{8,10} +{65,25} +{68,1,62,63,64,88} +{27,56,74} +{29,61,78,40} +{54,72} +{96,30,71,21,99} +{67,11,67} +{26,65,31} +{89,90,89,68} +{56,39,63,39} +{50,67} +{72,100,24,84,9} +{29,57,65,37,3} +{72,75,79,30} +{78,44,87,67} +{100,19} +{35,60,82} +{16,83,69,38} +{29,98,13,60} +{42,60,87} +{18,67,60} +{31,77,50} +{3,22,40,59,7} +{82,80} +\N +{32,92,70,30,18,35} +{48,38,92,82} +{10,92,66,59} +{4,67,42,21,71} +{27,88,20,21,9} +{46,22,27,85,36} +{42,55,36} +{24,2,96} +{96,48,40,48,52} +{15,5,90,10,68,20} +{30,2,67,92,96,63} +{16,82,87,26} +{88,98,76,29} +{29,11,94,23} +{58,20} +{52,18,55,73} +{20,81,52,19,37} +{93,21,97} +{2,77} +{46,91,80,48,71,20} +{87,7,93} +{68,77,61} +{59,33,52} +{67,62,89,2,62,90} +{30,82,72,44} +{72,18,60,38} +{11,14,59} +{74,65,54,58,67,66} +{74,56,40,73,50,66} +{42,17,56,59,53,19} +{75,25,76,9,72,50} +{14,57} +{61,47} +{90,11,72,13} +{52,27} +{80,84,53,55,98} +{16,26,55,17,79,96} +{42,73,77} +{6,84,67,54,96} +{99,48,99,63,73,77} +{5,41,72,5,88,81} +{19,20,20} +{21,89,55,44} +{82,67,11,64,61,5} +{44,34,8,62,53} +{75,53,66,36,100} +{46,65,6,70,4} +{84,10,56,35,18} +{65,60} +{88,56,27,11} +{10,9,97} +{97,49,100,100,76,32} +{2,98,57} +{47,57,84,74,79} +{80,9,24} +{96,33,86,28,19} +{43,76} +{46,14,55,92} +{60,69,66,62,22} +{45,85} +{45,9,36,13,45,1} +{24,49,8,37,66,64} +{98,53,96,47,2} +{36,44,32,4} +{77,36,78,51,63} +{82,36} +\N +{54,55,33,45,69,18} +{82,93} +{65,59,2,62,10,25} +{75,70,76,69,7,23} +{10,34,67,85} +{94,66,28,40,64,41} +{35,73,64,28,45,68} +{75,2} +{58,49,4,87,19} +{91,99,11,66,75,70} +{26,64} +\N +{13,51,18} +{39,33,21,18} +{27,50,82,2,3,71} +{51,89,44,53} +{88,91,34} +{45,96,27,12,51,52} +{31,96} +{2,9,54,89} +\N +{57,99} +{87,84,70,7,98,42} +{32,80} +{57,64,28} +{24,39,76,4,30} +{59,38,15,45,47,28} +{71,20,37,1} +{72,59} +{7,44} +{50,37,18,1,58,40} +{13,18,21,56} +{72,3,26,74,91} +{60,22,71,49} +{55,82,61,8,48,66} +{28,22,75,41,52} +{51,63,27,41,16} +{59,89,40,85,86} +{12,1} +{52,11,6} +{37,10,43,88,15,7} +{14,94,81} +{34,56,57,4} +{81,43,11,88,74,76} +\N +{67,10,50,79,70,35} +{14,51} +{49,50,23,84} +{51,41,57,100,19,14} +{31,55,40,96} +{8,42,33} +{83,34,1} +{56,80,22,93} +\N +{8,77,91} +{58,39} +{55,30,74} +{50,22,63,73} +{80,19,67,70,18} +{7,99,45,23,59,78} +{36,97,10,33,22,45} +{43,78,90} +\N +{1,68} +{63,95,54} +{5,67,61,37,89} +{32,97,2,56} +{83,31,6,80,63} +\N +{34,15,30,40,16} +{13,43,6} +{35,86,31} +{45,59,4,95,26} +{63,48,25} +{56,97,89,45,87,21} +{42,81,69} +{49,99,87} +{81,21,15,36,70,2} +{93,41,53} +{54,71,82} +{88,90,51} +{100,35,18} +{88,81} +{76,16,87,14} +{16,83,81,44} +{16,53,100,91} +{55,75,92} +{27,97,76,88,66} +{14,100,95,95} +{95,84,93,29,67} +{32,10} +{82,12,51} +{40,6,18,14,29,70} +{3,100,81} +{83,69} +{35,63,31,15} +{5,100,81,54,37,78} +{99,76,33} +{88,85,16} +{46,20,15,10,6,90} +{53,15,75,76,94} +{5,76} +{16,7,21,70} +{3,84,15} +{29,58,73,91} +{82,39,64} +{49,66,83,76} +{79,49,19,67,18,76} +{9,56,41} +{12,22,19} +{62,54} +{20,73,40} +{34,53,58,68,96} +{97,14,61,63} +{38,55,90,63} +{83,78,81,29,12,46} +{96,97,40,89,10} +{67,33,19,19,74,47} +{78,31} +{92,74,93} +{59,54,90,52,29,87} +{92,39,55,89,81,21} +{20,85,64} +{13,97} +{88,18,85,24,54,90} +{67,51,47} +{27,29,90} +{48,27,7,92} +{100,37,24,41,68,66} 
+{45,7,100,83,51} +{34,10} +{60,36,44} +{55,46,4} +{86,64} +{61,77,98,64} +{14,82,14,50,1} +\N +{53,31} +{64,43,35,44,98,75} +{98,15,52,58,76} +{55,94,92,40,80} +{1,14,100,42,45,74} +{13,90,84,97,18,92} +\N +{13,91} +{67,33,15} +{18,96,38} +{95,70,34,100} +{17,29,64,32} +{19,14,83,69,60,99} +{69,29,64,61,45,17} +{78,48,24} +{40,60,61,93,17} +{19,89,22,71} +{48,8,13,11,56} +{75,18,77,100} +{29,78} +{51,92,97,31} +{83,5,2,97,68,69} +{39,86,86,94,41} +{66,21,27} +{30,84,11,60} +{50,61,28,46,38,45} +{12,59,66,80,15,64} +{69,22} +{30,54,58,99} +{14,28,80,22} +{44,31,14,61,83,72} +{55,53,78,91,76,55} +{43,3,90,22,7} +{51,34,24} +{3,99,5,72,82} +{95,38,61} +{22,8} +{78,40,93,65,18,26} +{21,17,19,8,89} +\N +\N +{94,88,27} +{49,45} +{67,24,64,86,18,1} +{5,33,18,84,51} +{15,71,89,48,94,81} +{71,69} +{98,63,73,64} +{14,75,12} +{47,42,88,13} +{35,51,60} +{63,41} +{73,11,66,99,8} +\N +{2,17,6,44,97} +{95,24} +{2,13,35,21} +{76,29} +{81,37,21} +{23,63,27,53} +{70,66,58,27,4} +{69,62,22} +{62,96,44} +{68,87,99} +{51,40,81,52,93} +{81,11,45,92,22,21} +{5,39,46} +{44,7} +{14,63,62,9,12} +{9,19,90,72,51} +{70,61,24,36} +\N +{29,19,3,30} +{76,86,28,58,38} +{59,27} +{9,65,65,10,37,6} +{89,51,50,23} +{65,2} +{33,51} +{25,55,69,55,1,78} +{76,71,93,46,23} +{70,30,50,11,2,89} +{74,39} +{4,29,22,80,15,23} +{16,30,69,76,61,67} +{43,34,4,70,36} +{59,32,25,93,32,98} +{64,4} +{52,33,47} +{31,49,7,62,7,95} +{44,69,12,45,34,8} +{81,37,83,35,3} +{24,74,16,89,94,27} +{79,71,72,49,88,35} +{17,96,72,87,48} +{81,18,50} +{11,19,70} +{42,95,42,58,90} +{27,65,83,86,33} +{55,7} +{43,55,92,79} +{97,55} +{85,25} +{93,42,69,44,26,78} +{2,40,46,19,84} +{8,42,16,26,87} +{36,8,42} +{31,47,61,44,13} +{85,97,47} +{27,30,71,88,15,100} +{69,27,4,19} +{3,52,31,62,98} +{64,86} +{91,6} +{76,40} +{57,77,7,40} +{71,34,48,53,37} +{36,72} +{61,99,53,2,31,6} +{86,15} +{52,93,59,51} +{57,27,52} +{48,67,31,69} +{34,90,37,73,60,83} +{71,24,49,59} +{93,71,90} +{77,31,77} +{47,40,32,20} +{97,40,63,80,44} +{88,55,10,40} +{86,36,40,72,38,9} +{31,97} +{56,19,55,62,60} +{53,95} +{33,36} +{50,12,55,42,96,100} +{41,17,100,76} +{65,1,61,69,64,21} +{90,92} +\N +{74,42,86} +{2,4} +{99,78,5,92,1,61} +{1,69} +{80,73,60,31} +\N +{10,25,13} +{50,34,75} +{12,90,6,36,42} +{23,54,46} +{67,28,66,87} +{8,88,88,51,55,32} +{15,19,24,91,75} +{80,16,70} +{41,7,90,37} +{97,57,32,21} +{54,74,29,12,55,78} +{60,76,37,92,44,73} +{1,56,14} +{40,79} +{97,1,30,78,56} +{36,25,61} +{33,3,51,30,38} +{2,94,19,15} +{7,38,72} +{96,18,3} +{18,95,15,62,74,53} +{59,61} +{18,66,66,65,4,33} +{49,83,10} +{17,52,90,39,61,87} +{38,92,55,26} +{8,43} +{77,51,68,23,47} +{27,65,24,43,88,73} +{54,34,30,2,19,62} +{12,36,81,24,66,8} +{38,91,90,99,84} +{51,55,94,97,91,15} +{50,42,20,22} +{70,4,22} +{64,26} +{56,86,16,21,31,61} +{7,19,86,49,10,53} +{81,16,74} +{95,9,11,46,47} +{34,23,16} +{94,38,19,4,4} +{39,79} +{41,3,62} +{84,67,53,90,46} +{17,46,23} +{62,1,5,58,52,1} +{23,83,80,62,19} +{99,61,77} +{51,95,48,96,75} +{39,2,6,95,43,36} +{69,9,59} +{62,97,31} +{75,96} +{33,29,35,13,94,78} +{28,71,16,99} +{72,86,25} +{5,28,15,33} +\N +{13,13,52,20} +{58,98,83,85,81} +{13,75,42} +{7,91,3,83,82,37} +{72,91} +{10,67,61} +\N +{43,86,76} +{36,62} +{64,56} +{63,13,22,24} +{76,49,38,23,9,8} +\N +{92,58,24,19,96,90} +{24,37,76,37,1,95} +{91,9} +{46,35,48,37,91,76} +{72,21,6} +{30,80,39,90,89,18} +{83,30,67,17} +{43,36,46,43} +{4,31,34,36,33,48} +\N +{16,49} +{75,56,8,91,4} +{92,80} +{74,72,68,89,7,82} +{79,17,26} +{14,15,28,72,58} +{42,21,9} +{71,39,98,98,61} +{68,63,23,74,74,48} +{91,80,22,100,57,30} +{63,60} 
+{90,9,10,67,89,14} +{53,93} +{75,49,34,30,38} +{2,43} +{32,4,24,48,23,31} +{45,24,31,15,51} +{65,62,21} +{83,50} +{10,90,98,86,87,1} +{63,2,9,17,30} +\N +{77,46,60} +{49,39} +{37,86,4,63} +{33,28,37,33} +{4,88,80,14,47,45} +{90,64,17,65} +{60,90,12} +{7,2,38,33} +\N +{39,90,7} +{89,32} +{27,47,63,31} +{54,10,10,73,84,87} +{55,58,25,87} +{41,24} +{71,26,8,31} +{74,19,33,81,74} +{47,58} +{44,16,22,59} +{2,10,97,16,25} +{1,98,3,41,6,80} +{12,13} +{3,50,61,85} +{54,5,44,97,71,86} +{54,72,94} +{59,13,28,79} +{73,68,7,13} +{90,49,63,45} +{95,47,84} +{31,79,98,22} +\N +{13,15,83,89,87,20} +{1,58,87} +{15,21,39} +{93,27} +{40,81,13,31} +{29,52} +{28,48,36,41} +\N +{71,23,89} +{29,59,31,45,35} +{49,83,24,19,44,26} +{41,61,36,34,38,88} +{66,17,18,9} +{55,38,93,33} +{84,42,71,15,12} +{11,38,78,80,90,92} +{1,6,28,68,58} +{96,63,73,22,74,29} +{65,97,68} +{92,29,92,36} +{47,25,30} +{25,44,67,95,16} +{7,26,41} +{79,12,44,69} +{17,27,4,60} +{45,30,57} +{68,24,63} +{39,64,94,92} +{27,68,39,68,75,8} +{88,48,48} +{86,86,8,54,7,45} +{93,60,14,90,14} +{97,42,54,67,38} +{13,38} +{84,34,30} +{34,71,77,71,13} +{82,18} +{53,7,79,79} +{28,65,38,20,93,100} +{96,10} +{94,12,93,48,51,20} +{12,4,41,11,25,59} +{95,69,23,25,1,19} +\N +{44,38} +{12,4,96,7,48} +{18,24,52,81,58,77} +{15,36,1,50,81,23} +{39,66,74} +{52,22,99} +{51,11,77,44,22} +{51,19,18,91,75} +{20,17,5,96,63,30} +{31,56,9,21} +{45,70,31,62,9} +{84,22} +{99,62,97,44,87,90} +{95,94,95,24,25,47} +{79,72,57,18,3,26} +{54,67,37,57} +{3,90,9,3} +{95,90,40,7} +{36,70,76,68,14,71} +{15,59,7,1,48} +{91,29,79,62,94} +{76,36,92,82} +{50,79,68} +{55,63,88,87} +{86,89,49,17} +{19,74,14,52,8,59} +{8,58} +\N +{77,74,20,39,26,29} +{38,89} +{58,21,44,81,17,16} +{40,72,12,32,90} +{93,34,92,17,39,85} +{39,2} +{43,21,83} +{81,3,59,28} +{34,97,52} +\N +{84,90,6,74,43,70} +{41,6,10,98,86,41} +{13,72,78,11,37,5} +{100,40,54,75,33} +{66,31} +{58,58,75,83} +{81,90,8,73,87,41} +{9,63,22} +{19,66,19,93,52} +{39,88,13,25,66} +{80,85,66} +{66,76,11,71,97,77} +{70,35,87} +{36,17,69,2,41} +{30,85,65,39,38} +{39,35} +{64,100} +{83,53} +{25,29,29,72} +{19,63} +{32,2,82,15} +{31,31,46,11,2} +{41,1} +\N +{55,41,15} +{18,61,43,22,100} +{47,60,16} +{80,5} +{52,2,76} +{40,26} +{81,12,16,25} +{31,93,89,20,95,75} +{26,75,86,1} +{36,69,70,73,79} +{38,39} +{45,49,52} +{88,53,45,10,49,31} +{21,14,1,83} +{7,71} +{59,38,83,64,44} +{6,52} +{99,99,26,54,47,8} +{13,46,72,5,23} +{7,86,40,73,55} +{28,47,50,62,44} +{32,89} +{39,48,50,100,62,95} +{66,56,11,21,58,59} +{7,44,95,53,95,36} +{83,33,79} +{34,65,51,52} +{67,95,46,45,61} +{69,84,71,38,46} +\N +{24,57,48,27,97} +{83,91,97,94,37,44} +{22,31,38,77,21} +{72,32,53} +{30,45} +{93,94,27,95} +{95,4,79,3} +{33,90,92,54} +{55,8,76,39,85,64} +{82,54,93} +{31,42,5} +{38,14,73,12,14} +{64,13,64,28,32,89} +{5,28,4,22,72} +{37,78,94} +{58,73} +{24,57,33} +{48,28} +{69,42} +{97,91,75,84} +{95,69} +{64,95} +{1,3} +{76,38,81,11,90} +{21,30,54} +{92,100,97,21} +{10,76,64} +{85,79,100,79,76,63} +{13,96} +{91,47,84} +{100,19,45,49} +{99,71,21,10,69} +{19,41,7,63,56,85} +{16,32,6,92} +\N +{62,7,22,65} +{1,86,67,47,83} +{26,2,100,51,1} +{20,22,86} +{74,95,79} +{8,53} +{85,59,61,45,83,8} +{2,76,63,26} +{40,42,84,55,56,23} +{37,7,25,14,2,47} +{86,16,98,41,33} +{76,30} +\N +{16,88,61,4,41,42} +{59,92,94,76} +{96,76,57,62,99,61} +{14,30,23,13,9,32} +{47,49,86} +{48,19} +{73,25,40} +{29,75,31} +{53,26} +{28,95,78,84} +\N +{22,77,13,64,68} +{15,69,82,26} +{42,37} +{64,59,95} +{37,72,86,95} +{9,59,92,57} +{65,37,13} +{93,67,81,54,89} +{21,52,78,59,30} +{98,90} +{17,35,57,4} 
+{44,56} +\N +\N +{25,26,13} +{62,41,60} +{28,92,16,74,4} +{92,19,85,77,11} +{20,67,85,22} +{75,69,34,29,64,73} +{70,40,2,29} +{87,27,70,54,6} +{10,8,9,62} +{71,41,14,22,23} +{83,79,46,37,99} +{79,42,3,54,20} +{12,60,42,100,39,33} +{13,79} +{95,28,54,52,77,3} +{55,50,25,41,42,16} +{96,67,23,54} +{65,54,32,52,16} +{100,11,69,96,95} +{1,18,93} +{53,78} +{24,40,47,30,40,11} +{87,7,12,10,52,90} +{3,72,95,15,32} +{60,69,19,8,43,72} +{88,10,11,55,37} +{67,48,31,48} +{98,70,38,97,14} +\N +{52,12,94} +{41,26} +{81,65} +{66,74,9,66,12,3} +{47,6,33,92} +{95,2,12,90,73,97} +{23,76,34,23,2,20} +{7,22,37,83,15} +{44,61,21,8,36} +{88,52,8} +{66,3,67,43,87} +{16,51,10} +{66,43,28,69,70} +{47,2,94} +{57,67,55} +{40,59,6} +{63,19} +{51,71,68,28} +{73,97,48,56,70} +{3,4,28,48,18} +{31,94,27,70,44} +{85,18,40,6} +{78,91,79,88,33} +{11,90,78,49,97} +{74,91,27,79,75,53} +{1,70,3,40,43,99} +{97,35} +{58,27,40,6,47,33} +{43,42,60,94} +{41,34,23,53} +{57,44,50} +{8,10} +{49,53,22} +{91,2,90,13} +{46,80,27,82,42,99} +{12,96,72,23,83,56} +{48,82,71,8,35,16} +{38,69,38,49,47} +{80,28,13,9} +\N +{84,13,12,33} +{31,57} +{68,86} +{4,96,64,19,48,29} +{66,8} +{33,86} +{32,38,86,86,41,84} +{38,51,31} +{59,17,76,36} +{52,87,60,54} +{7,58} +{34,52,58,90} +\N +{30,67,97,2,1} +{93,10} +{47,16,46,8,39,84} +{90,77,37} +{92,58} +{38,94,49,53,11} +{70,49,35,67,18,28} +{58,81} +{79,100,9} +\N +{97,13,56} +{99,40,87,67,58} +{24,47,19,16} +{12,27,47,48,3,59} +{1,58,15} +{97,28,6} +{94,50,31} +{71,34,94,53} +{26,5} +{46,66,56,27,37} +{76,4,1} +{80,63,40} +{89,82} +{39,100,71,82,95,8} +{81,86,27,83,57,47} +{30,30,92,8,33} +{95,20} +{4,19,8,74} +{20,32,83,62,19,18} +{75,29} +{100,13,6,41,23} +{63,5,93,72,43} +{64,13,73} +{35,91,61,26,41,96} +{49,56} +{2,28,80,84} +{15,48} +{32,49,96} +{72,73,57,69,16} +{95,1,88,64} +{70,55,88,66} +{76,66,30,92,1} +{88,21,74,65,93} +{72,75,75,3,26} +{55,32,85,68,84} +{45,40,93,33,72,20} +{83,89,6} +{4,60} +{72,56} +{73,7,69,25,96,74} +{100,72,41,48,63,37} +{21,72,70,94,67,54} +{6,9,58,77,35} +{70,59,35,25} +{86,96,87,62,13,5} +{93,52,74,57,58} +{93,23,88,50,56} +\N +{95,72,68} +{63,52,58,41,54,90} +{52,23,53,32} +{93,87,39} +{23,73,6,46,79,72} +{44,17,12} +{79,59} +{31,62,14,26,75,23} +{64,72,18,48,63,50} +{71,40,59,87} +\N +{82,17,10} +{44,29} +{6,4,39,16,21} +{94,17} +{91,61,37,36,9} +{53,38,7,28,92} +{95,93,35,18,48} +{35,77,53,87,97,92} +{56,28,68,19,28,86} +\N +{23,91,56} +{97,5,89,24} +{18,81,17,78,63} +{83,19,46,10,22,66} +{100,17,45} +{25,87,61,79} +{17,57,99,1,39,1} +\N +{2,51,26} +{93,69,84,85,87} +{40,58,70} +{86,84,96,41} +{28,36} +{39,85} +{16,84,75,68,87,17} +{14,84,57} +{25,85,35,82,56} +\N +\N +{7,30,17,2,66,91} +{45,17,57,27,98,65} +{57,86,15,40,68,23} +{82,32,28,89,41,79} +{28,3,35,61} +{76,95,19,81,48,50} +{34,6,85,47,65,2} +{70,23,91,33,15} +{30,24,47,96,61,47} +{78,88,64,60} +{87,40,86,97} +{47,14,54,37,100} +{48,95,32,77,69} +{58,12} +{63,20,49} +{78,85,41,72,6} +{39,20,89,21,62,76} +{71,6,10} +{63,4,71} +{51,21,37,63,54} +{66,6,63,12,58} +{89,97} +{64,70} +{53,1,65} +{57,73,30,26} +{15,99,47,89,95,99} +{12,86,7} +{50,68,1,31,67} +{47,86,54,44} +{78,7,86,76,22} +{46,71,98,62,67} +\N +{64,91,80,63} +{82,61,17,58} +{85,64,90} +{37,26,64,97} +{68,25,26,61,68} +{11,21} +{63,53} +\N +{87,88,75,65,10,48} +{32,7,38,72,44} +{99,81,59,10} +{31,58,60,66,41,28} +{23,27,57,74,4} +{20,94,28,29} +{91,5,15,61,50,29} +{34,58,15,85,65,29} +{52,50,2,95,87} +{3,94,54} +{7,61,96,49} +{51,70,23} +{87,49,27,6,7} +{83,61} +{36,92,48,57,20,83} +{53,12,60} +{60,11} +{68,43,74,23,66,55} +{66,8,54,24} 
+{48,72,41,74} +{81,99,50,33,20,13} +{27,80,60,83,26,74} +{80,1,59,50,15,99} +{11,70,20,29} +{23,84,63} +{63,24,91,19,28} +{25,17,95} +{94,13,81,69,26,89} +{31,48} +{45,20,74,51,62,33} +{77,55,17,63,4,18} +{89,14} +{85,85} +{23,11,85,74} +{29,76} +{62,40,96} +{1,29,25} +{56,26,12} +{5,22,6} +{61,9,6,85} +\N +{31,34,49,11,19} +\N +{14,20,64,73} +{63,1,85} +{2,58,61,100,9} +{89,92} +{37,13,81,77} +{36,26,16,76} +{78,10,10,92,63} +{68,6,35,71,92,27} +{2,88,33,14,85,27} +{80,95,71,98} +{8,33,33,55,90} +{62,74,15,10,64} +{60,18} +{6,77} +{27,38,4,49,27,89} +{94,84,94,8,98} +{15,73,47,47,26} +{73,38,69,90,9,13} +{17,33,32} +{51,57,25,40,41,37} +{77,70} +{66,10} +{50,90} +{96,88,30,65} +{30,49,100} +{34,46,19,89,52,24} +{83,85,62,72,10,64} +{98,56,23,77,79} +{97,90,83,85} +{19,66,70} +{70,89,59,12,71} +{24,96,22,4} +{43,32} +\N +{92,85,41} +{96,90} +\N +{4,5,82} +{58,32,34,86,30} +{51,8,44} +{31,96,37,47} +{51,15,41,97} +{86,41} +{41,26,61} +{62,79,68,73,5} +{32,9,88,30} +{89,34,64} +{70,18} +{64,31} +{14,73,1,50,75} +{57,1} +{53,92,38,13,56} +{41,1,87,40,60} +{83,75,19} +{69,98,25,64} +{69,75} +{84,13,25,8,81} +{41,52} +{90,80,17} +{19,53,72,62,94} +{29,30,99,32} +{32,85,73,26,47} +{6,48,89,93,23} +{73,47,93,10,48} +{60,21,26,60,63} +{85,41} +{75,61,61,45} +{51,7,5} +{9,46} +{83,36,7,84,96} +{71,78,55} +{43,53,88} +{8,1,80,69} +{88,86,51,12,37} +{45,69,40,85} +\N +{36,53,60,15,7,7} +\N +{92,5} +\N +{51,13,34} +{39,23} +{16,26,93} +{91,96,19} +{89,64,2} +{8,74,29,24,66} +{26,19,30,27} +{81,59,52} +{99,28} +{5,12,63,79} +{14,80,90,83,47,79} +{67,64,32,58,44,19} +{27,32,52,79,55} +{68,87} +{14,31,20,12} +{38,99,65,32,15} +{27,57,35,17,53} +{63,64,6,60} +{70,38,47,65} +{24,87,20,4} +{86,27,19,56} +{62,44,1} +{46,10,26,48} +{40,57} +{61,9,59,80,51,20} +{83,44} +{77,1} +{78,63,42} +{75,93,95,76,9,52} +{20,58,10,37} +{72,75,41,73} +{63,93,5} +{57,65,47} +{34,6,51,38,21} +{54,7,19,9} +{61,6,47,64,100,86} +{39,45,55,17} +{81,53,67,33,70} +{11,94} +{57,98} +{78,81} +{75,71,20,8,13} +{3,2,58,95} +{37,58,5,46,54} +{40,50,36,27,69} +{73,42,86} +{97,73,87,80,38} +{27,56,94,73} +{80,81,74} +{53,79,86} +{79,4,55,21,34,74} +{84,63,21,97,92,38} +{72,38,76,63,97,79} +\N +{64,91,100,98} +{34,10} +{97,73,7} +{49,31} +{87,39,65,96} +{54,88,60,55,18,4} +{20,72,96,26} +{40,51} +{37,46,89} +{88,53,3,52,39} +{10,34,77,95} +{20,66,84,12} +{51,19,61} +{67,35} +{73,56,89,43,35} +{94,54,27,63} +{63,53,21,79,76,49} +{79,23,28,63,49} +{47,94,75,11} +{67,95,56} +{80,86} +\N +{62,73} +{98,69,11,57,24,90} +{87,41,77,21,94} +{21,87} +{3,40,75} +{67,53,78,29,16} +{18,46,70,76,98} +{14,67,50,63,22} +{4,2,92,4,8} +\N +{41,76,79,95,96,61} +{35,30,18,57} +{34,91,89,27} +{22,25,9,34,85} +{4,53} +{23,6,65,86,56,93} +{54,81,8,59,36,47} +{90,10,4,25,31,46} +{91,82,82,80} +\N +{64,12,59,21,10} +{49,93,76,26} +{22,10,21,15,57} +{14,29,93,31} +{68,21} +{62,95,12} +{34,74,55,4} +{26,39,93,31} +{67,31,63} +{23,89,98,88} +{48,93,22,79,28} +{1,88} +{95,74,84,18,38} +\N +{82,29,22,45,15,81} +{15,48} +\N +{17,36,97,77} +{93,59,71,15,51,35} +{67,33,57,11} +{35,80,72,43} +{69,89,69,48} +{52,29,16,52,100,22} +{60,30,45,19,25} +{28,3,39,86,13} +{81,40,25,20,39,5} +{77,14,93,47,23,6} +{42,19} +{52,52,98} +{9,29} +{78,77,6,72} +{2,59,73} +{13,85,77,26,29} +{64,63,94} +{54,76,3} +{7,1,5,91,100} +{24,94,57,94,79,55} +{4,22,1,75} +{34,53,19,87} +{69,75} +{71,47,47,61,42,89} +{3,32} +{84,61,4,13,73} +{74,61} +{47,65,85} +{50,84,83,18} +{51,97,11,3} +{59,92,4} +{49,42,65,27,97,52} +{19,33,40,44,71,100} +{82,68,99,60,47,59} +{47,33,82} +{3,45} +{47,28,60} 
+{3,98,60,30,50} +\N +{11,40} +{33,67,72,43,74} +{9,49} +{42,47,48} +{53,88} +{17,87,28} +{20,4,72,62} +{65,25,22,76,64} +{9,62,57} +{59,93,52,93,60} +{85,85,1,55,50} +{69,22,57} +{8,50,81,32,4} +{80,47} +{60,88} +{16,54,80,66} +{99,87,66,65} +{60,19,58,18} +{14,77,66,48,59,41} +{75,96,82} +{42,72,93,79} +{14,23,78,82,40} +\N +{29,47,16,41} +{13,11,45,67,23,92} +\N +{8,3,52,41,56} +{57,41,63} +{5,50,59,87,50,58} +{58,99,9} +{60,99,15,63} +{59,14,9} +{68,81,34} +{83,18,3,94,39} +{27,52,100,66,48,82} +{10,23,50,96} +{72,14,12,68,62} +\N +{45,30,55,86,89,48} +{5,80,97} +{52,67,86,81} +{99,4,38,79} +{21,98,78,71,73} +{10,23,38,61} +{12,17,19,70} +{79,23} +{55,66,65,60,19} +{7,34,68,88} +{37,70,5} +{41,57,86,31,10,6} +{70,59,96,78} +{88,18,32,22,56,21} +{93,72,81,47,89,72} +{100,14,49} +{83,80} +{73,11,97,14} +{60,47,32,34,13,29} +{39,6,88,24,6} +{54,66,55,52,47} +{56,89,88,98,94,48} +{2,37} +{13,54} +{68,39,68} +{60,81,10,85} +{74,54,14} +{30,52} +{41,74,47} +{77,28,8} +{90,3,43,89,4} +{29,46,84,63,79,83} +{26,15,80,19} +{76,28,77,6,47,91} +{51,15} +{93,15,51} +{8,68,34,58,15} +{5,56,57,81} +{27,87,61,54,41} +{31,37} +{68,80,3,98,49,6} +{96,10,39} +{25,19,21,72,79} +{69,1} +{5,51,61,80} +{76,25} +{36,92} +{54,46,31,87,13,8} +{25,13,83,57} +{29,53,73} +{83,60,26,19} +{27,89,34,13,20,38} +{29,74,26,67,65} +{90,36} +\N +{32,15,43,50} +\N +{55,86,68,51} +{91,14,53,70,49,38} +{75,29,79} +{19,59,38,44,18,79} +{43,31,24,20,16} +{43,83,59,37} +{61,17,95,61} +{67,89,1} +{65,20,46,58,49} +{72,54,38,52,49} +{75,12} +{63,95} +{99,17,79,11,35} +{62,60} +\N +{69,83,89,73,20} +{30,60,39,73} +{78,99,29,45,61,21} +{38,61} +{51,15,47,11,4} +{34,75} +{57,26,42,42} +{8,90,4,68} +{63,70,99,3} +{74,70,33,50,59} +{27,18,73,83} +{36,90} +{82,77,2,83} +{90,99} +{15,25} +{65,30,39,82,89,34} +{12,24,64,54,49,83} +{54,59} +{63,49,81,36,75,52} +{6,59,90,55,87} +\N +{97,52,54,97,3} +{8,53,89,42,30} +{68,42,64} +{97,42,99,74} +{19,31,32,52,7} +{69,83} +{61,17,35,39} +{81,47,70,7,63} +{78,10,63,97,31,48} +{84,92} +{64,82,40,39,57,44} +{39,25,92,33,5} +{27,74,85} +{90,67,21,28,84} +{36,33,62} +{77,87,98,82,11,88} +\N +{11,41,17,91,56} +{1,1} +{84,100,8,22,20} +{57,39,85,5} +{55,47} +{13,2,36,59,45} +{95,66,53,32,29} +{21,92} +{35,32,9,58,77} +{19,71,99,82} +{19,37,87,43} +{100,18} +{67,86,29,40} +\N +{66,54,64,55} +{67,25,18,31} +{60,26,59,86,26,67} +{26,21} +{70,67,30} +{93,82} +{89,58,39,91,95} +{15,86,25,8,12} +{59,20,41,33,78,87} +{10,72,89} +\N +{52,17,99} +{77,29,7,7,1} +{49,96,57,24,66,67} +{10,26,83,84} +{82,7,25} +{66,77,57,25} +{92,77} +{24,48} +{44,26,37,75,11} +{73,80} +{51,47,93,21,25,78} +{76,49,15,98} +{12,85,63,59,6} +{25,51,47,58} +{16,10} +{17,30} +{67,5} +\N +{54,96,21} +{12,47} +{29,90,69,22,89,82} +{78,93,86,65,66} +{83,84,58,67,13} +{85,35,81,27,1,2} +{76,29} +{64,82,91} +{35,89,38,89,10} +{19,40,96} +{83,70,85} +{72,85,70,99} +{34,1,39,16} +{84,53,22,86,73} +{32,23,70,49} +{15,67,91,11} +{73,95} +{71,57,64} +{88,91,56} +{12,16} +\N +{62,82,26,84} +{70,51,52,63,96} +{34,93,49,57} +{16,5,47} +{18,59,12,82,83,51} +{61,93,87,9} +{46,9,45,38} +{15,85,28,73} +{31,99,26,3} +{66,91,48,73} +{98,80,9} +{31,55,42,69,13,58} +{43,8,70,29,83} +{39,57,53,70,74} +{89,13,60,38,89,3} +{37,28,15} +{67,77} +{30,100,89,36,53,75} +{36,19,48} +{7,8} +{12,76,26} +{14,56,52,47,39,67} +{87,83,51,2,97,25} +{51,1} +{59,69,37} +{95,93,21} +{100,92,37} +{37,23,66,95,7,63} +{52,56,77,86,46} +{31,62,17} +{57,48,79} +{26,96,40,5,43,54} +{40,92} +{75,83,1,73,71} +{75,61} +{6,38} +{35,23,76} +{52,3,38,25,100,99} +{45,15,44} 
+{96,9,11,35,16,58} +{9,80,76} +{22,43,34,43,46} +{34,68,21} +{95,70,83} +{60,7} +{34,22,68,2} +{78,30} +{46,70,90,96} +{5,24,69,61,32} +{41,17,79,27} +{59,88,64} +{12,48,41,68,15,98} +{43,84,59,62,36,14} +{84,8,71,88,4,23} +{45,67,67,17} +{14,96,72,66} +{91,23,4,11,28} +{18,5} +{65,51} +{31,87,33} +{17,97,76,81,69} +{56,71} +{95,23} +{33,58,66,47} +{46,99,69} +{43,87,40} +{49,1,26} +{18,36,89,87,25,100} +{76,37,19} +{57,91,9,100,23,59} +{80,60} +{55,23,32,49} +{15,73} +{87,50} +{43,62,50,54} +{65,3,89,49,77} +\N +{73,12,25,78} +{79,89,38,59} +\N +{44,62,25} +{96,13,57} +{35,14,3} +{90,71} +{34,8,59,81,63,90} +{15,90,89,32,69} +{90,61,54,10,29} +{22,3,85,41,66} +{17,4,99,91,45,57} +{89,32,43,39,61,9} +{45,40,6} +{47,100,75,8,85} +{88,43,89} +{45,41} +{54,48,87,66,100,5} +{58,65,39} +{17,82} +{95,14,31,51} +{30,3,46} +{8,66,22,52,51,24} +{61,62,38} +{4,50,83,32,76} +{96,36} +{87,27} +{82,100,44} +{30,91,44} +{29,48,8,38,43,96} +{56,65} +{34,36,99,11} +{11,1,25,65,12,89} +{17,100,62,53,24} +{86,81,63} +{17,63,30,82,87,91} +{12,63,76,78,85} +{52,19} +{21,91,53,86,49,83} +{67,65,78} +{8,77} +{89,1,56,100,72,96} +{20,51,41,21,30,20} +{41,73,37,92,9,5} +{95,34,21,12} +{28,14,2,62} +{14,74,33,32} +{37,82,67} +{65,99,56,11,21,83} +{99,51} +{56,42} +{59,30,74,40} +{18,27,63,44,86} +{48,25,41} +{5,26,63,88} +\N +{24,66,64,1,26} +{72,74,11,61,70} +{28,27,90,30} +{96,35,21} +{64,100,75,94,88,3} +{93,79,42} +\N +{37,51,4,41} +{31,68} +{93,42} +{76,96,47} +{8,6,16,57,51,72} +{67,72} +{50,36,40} +{69,28} +{17,92,40} +{72,74} +{76,87,93,22,95,30} +{14,88} +{39,56,74,36,25,87} +{55,68} +{32,9} +{35,2,17,86} +{92,73,82} +{40,13,95} +{15,28,95} +{65,40} +{47,56} +{63,72,78,20,22} +{71,49,4,80} +{68,16,50,44,29,38} +{81,96,23} +{44,73} +{4,68} +{30,54,41,66,89} +{92,33} +{10,92,49,46,59,42} +{14,91,18,96,27,37} +{40,32,12} +{14,97,15,96,44} +{75,96,52} +{50,20,9} +{39,84,83} +\N +{14,48,3} +{47,85,76,27} +{5,3,25} +{55,36,29,76,41,44} +{34,56} +{62,29,83,6,58} +{67,32,85} +{75,62,4,66,100} +{47,31,27,43,9,57} +{92,44,36} +{31,22} +{14,88} +{18,25} +{82,63} +{54,67,6,59} +{90,42,19,91,37,75} +{70,39,87,52,32} +{51,20,34} +{85,62} +\N +{95,6,55,93} +{44,67,15} +{93,58,20,12} +{42,6,22,29,36} +{46,81} +{57,95,56,52} +{3,79,69,45,8,74} +{75,44} +{4,17,78,96,66,41} +{27,100} +{85,76,22,17,45,58} +{9,12,70,29,96} +{5,68} +{54,79,5,19,17,24} +{99,13,9,52,86} +{94,6,99,57} +{71,62} +{63,50,9} +{42,42,80} +{25,96} +{93,20,10} +{83,73} +{14,76,36} +{57,31,29} +{17,25,18,18,54,95} +{34,27,86,37,92,83} +{57,57,28,32} +{98,53,60} +{8,59,41,88,49,46} +{95,42,30} +{12,51,98,74,76} +{6,49,26} +{21,35,27,32,83,93} +{16,56,89} +{85,34,73,74} +{52,95,22,4,71} +{96,42,63,88,80,91} +{78,34,41,99} +{11,68,27} +{50,14} +{78,52,66,15} +{100,82,1} +{35,2,93,71,45} +{4,56,8} +{83,19,5} +{82,39,63} +{50,64,83,87,76} +{47,59,93,88,22,67} +{16,6} +{86,98,88} +{32,4,52,34,30,71} +{68,25,97} +\N +{19,17,91,84} +{97,88,89,98,33} +{37,56,70} +{27,17} +{56,58,51} +{69,80,47,84} +{89,22,89,88,16,1} +{95,14} +{14,95,97} +{47,15} +\N +{19,20,65,74,83,38} +{57,56} +{78,67,68,89,1,95} +{61,84,93} +{10,56,96,31,56} +{3,51,90} +{15,85,42,25,15,41} +\N +{50,7,89,89,96} +{90,10,44} +{11,43,15,27,30} +{55,68,48,30,44} +{38,69,3,95,39,6} +{57,51,88,94,82,23} +{69,37,2,67,49} +{93,94,5,84,39,47} +{45,47} +{58,55,79,63,64} +{63,65,59} +{42,36,76,75,89,86} +{41,83,98} +{13,90,13,46,11,37} +{76,33,52,65} +{52,29} +{20,60,45,23,29} +{89,6,14,8} +{91,69,64,72,41} +{46,91,31,66,83,33} +{6,58,61,65} +\N +\N +{90,65,16,5} +{24,46,33,36,47,45} 
+{11,62,40,98,21,88} +{28,95,58,33,27} +{45,63,99,31,38,90} +{11,49,41} +{23,24,82,25,28} +{42,3,34} +{52,10,58,88,97,37} +{20,41,11} +{86,30} +{36,92,93,10} +{5,36,85,50,71} +{51,75,100,46} +{55,81,31,45,87,8} +{83,10,45,81,33} +{16,94,91,23,76,44} +{62,73,14,39} +{16,14,83,100,82,7} +{25,69,86,12,71} +{29,86,45} +{76,62,100,47,57,52} +{41,21} +{33,56,58} +{23,96,44,16,91,86} +{65,15} +{3,92,56,4,21} +{32,39,95} +{95,87} +{65,96} +{16,96,93,100,35,78} +{64,33,55} +{96,75,41,40,62} +{50,50,86,11} +{93,34,83} +{19,30,62,67,93,19} +{53,67} +{55,46,99} +{70,32,38,4,84,91} +{50,36,40} +{21,93} +{29,6,10} +{4,73,45} +{72,33} +{36,73,18,55,27,100} +{65,73,98,90} +{20,1} +{59,36,60,87} +{20,79,63,93,34,31} +{60,18,92,6} +{48,34} +{63,70,78,1,2} +{15,32} +{5,15,84,73} +{32,35,90,11,40,23} +{91,41,7,52} +{84,90,88,30} +{12,10} +{84,86,36,79} +{76,45,84,66} +{41,25,61,96,97} +{18,100} +{63,39,17,34,32} +{22,45,74} +{83,24,45,48,69,84} +{43,41,12,44,75,91} +{69,75,95} +{100,28,14,66,1,14} +{94,91,60,36} +{88,28,54,63} +{68,78} +{29,68,6,100} +{12,84,35,44,59,55} +{30,59} +{64,18,40,57} +{97,97} +{85,64,73,82,49,88} +{99,31,24,6,90} +{23,89,38,20,40,95} +{84,64} +{21,3,91,7,7,87} +{91,74,32,76,43} +{13,22,96,8,75} +{59,71} +\N +{34,94,45} +{14,5} +{95,10,37,74} +{69,82} +{6,58,45,49,81} +{72,72} +{17,58,10} +{62,77,9,6,44,62} +{37,53,49,41} +{24,11,11} +{10,57} +{26,72} +{18,15,83,60,54,80} +{88,49,73,92,67} +{26,88,64,2,59} +{49,50,3,90,44,49} +{58,54,43} +\N +{86,78,40} +{42,17,65} +{1,86,17,6} +{79,27,37,60,8} +{46,62,46,22} +{9,75,17,68,54,35} +{99,86,64,10,20} +{3,21,35,6,24,64} +{25,62,9,50} +{63,2,79,42,81} +{44,41,2} +{99,93,98,78} +{2,92,9,96} +{79,82,25,64} +{47,84,52} +{97,77} +\N +{47,94,38} +{22,33,76} +{35,52,11} +{17,48} +{1,100,27} +{87,93,19} +{72,3,32,78,81} +{47,28,4,23,79} +{27,88,7,85} +{49,40,47} +\N +{91,89} +{80,2} +{86,78,42,6,81} +{7,50,25,4,8,22} +{23,3,64,59,53} +{1,42,63} +{95,81,86,31} +\N +{81,83,52,47,25,43} +{17,57,100,49,59,63} +{44,91,95,72,29,100} +{80,78,55,41} +{14,52,20,64,9,87} +{48,14,82} +{31,5} +{64,50,66,38,97} +{61,2,90,2,64} +{64,69,26} +\N +{64,62,68,89,12} +{12,10,88,71} +{41,66} +\N +{67,77,25,6} +{14,75,15,66,19} +\N +{88,52} +{78,56,61} +{93,88,47,38,52} +{72,100,54,34,18} +{77,99,89,53,25} +{38,51} +{3,25} +{83,39,85} +{60,15,77,59,69} +{38,64,91,97} +{65,35,30,8} +{46,6,48} +{63,91,29,91,85} +{43,100,56,60,74,53} +{95,30} +{86,63,28,62,37,79} +{2,48,29} +{1,44,20,47,56} +{43,34,86,86,64,14} +{11,82,99,71,63,41} +{77,45,74,17,56} +{18,25} +{51,82} +{27,35} +{1,20,84} +\N +{89,37,16,90} +{58,83,34,88,50,21} +{61,25,1} +{41,6} +{9,100,32,54,38,66} +{40,53} +{29,76,16,13,55,31} +{71,67,54,83,3,82} +{19,62,18,94,73,38} +{17,83,8,45,52} +{80,25,50,59,53} +{4,2} +{52,48,6,72} +{50,32,70} +{36,97} +{17,82,36,97,20} +{22,87} +{46,29,96,98,14,90} +{14,92,5} +{69,9,68} +{20,86,29,61,54} +{62,67,87} +{86,18,31,80,82,45} +{65,89,67,34,41} +{44,8,48,38,91} +{47,32} +{85,25,56,39} +{15,54} +{84,57,44,46} +{65,61,29,86,77,53} +\N +{26,58} +{76,1,57,93} +{57,91} +{13,15,66,11} +{84,12} +{43,32} +{83,24,31} +{82,9,65,84,27,94} +{62,93,55,7,39,46} +{90,100,33,22,61,46} +{9,51} +{87,93,82,94} +{49,45,95,95,66,39} +{100,56} +{11,5,78,42,45,37} +{3,57,80,46,13,34} +{1,74,53,31,33} +{11,84,8} +{27,99,21,31,96,58} +{99,81,90,17} +\N +{66,49,47,55} +{88,30} +{76,62,17,88,83} +{40,7,42,61} +{17,57,9,64,54,1} +{9,54,84} +{50,61} +{72,15,25,30,6} +{64,95,69,89,11} +{64,18,86,25} +{81,59,70,6,92} +{78,76} +{33,40,29} +{15,63,1,12,14,57} +{33,81,8,65,26} +{58,15,56,37,67} 
+{2,50,35,92,11,27} +{17,13} +{91,100,15,27,39,24} +{58,48,46} +{5,95,28} +{7,21,99} +{5,15,6,10} +{82,99} +{66,22,86,83,76} +{99,68,39} +{43,90,22} +{31,94} +{21,64,56,26,95,40} +{7,81,3,53,83} +{29,42,90,60} +{53,49} +\N +{26,31,14,73,88,51} +{69,2,100,9,34,16} +{78,35,97} +{68,16} +{34,45,42,73} +{7,19,55,70,69,11} +{11,62,61} +{32,17,51,33,87,6} +\N +{54,97,36,13,45,12} +{46,2,26} +{14,6,17} +{99,20,31,61,6,4} +{60,72,53,31,34,25} +{88,46,68,78} +{56,94,49} +\N +{33,65} +{70,51,84} +{55,91,27,33} +{22,19} +{34,78,11,94,3} +{16,67,91} +\N +\N +{64,5} +{76,18,83,5} +{57,13,30,56} +{60,92,25,31,43} +{38,17,54,5,2} +{56,58,39} +{42,43,5,69,56,89} +\N +{50,23,97,85,70,39} +{97,56,33,90,64,2} +{9,54,51,26,24,99} +{18,7,59} +{44,5,40,69,18} +{77,96} +{44,58,47,17,26,45} +{90,71} +{88,32,11,96,17,13} +{42,3} +{97,28,56,10} +{38,36} +{50,52,47,31} +{64,5,99,77,83} +{11,56,1} +{91,92} +{7,53,35,52} +{93,65,47,97,44,82} +\N +{64,66} +\N +{62,4,57,23,34,91} +{52,55,75,99,27} +{29,54,44,87,61,96} +{21,3,66,35,25,80} +{96,68} +{3,41,66,81,78} +{49,98,79,65} +{71,38} +{88,79,70,37,3,82} +{49,74} +{19,29} +{57,68,9,8,99} +{81,88,14} +{99,29,24,99} +{55,96,29,89,49} +\N +{56,2,84,79,74} +{30,52,64,74,62,5} +{88,32,19,25,9} +{40,11,49} +{98,52,27} +{11,86,29,86,6} +{91,53,63,53,44,28} +{88,10,30,48} +{75,64,75} +{14,92} +{98,62,35,67,66,35} +{40,65,11,80,73} +{1,1,63} +{85,32,53} +{91,27,68,50,66,63} +{66,54,38} +\N +{45,43,14,94} +{62,84} +{54,24,83,33,46} +{93,72,2} +{43,4,14} +{18,11,5,99,79,94} +{26,59,9,2} +{58,69,70,45,14,54} +{84,5,42,97} +{7,82,41} +{69,53,8,55,20} +{4,13,6,45,83} +{41,92,41,98,51,85} +{72,85,74} +{19,50,79} +{79,47,47} +{25,25} +{17,56,46,30,73,78} +\N +{92,42,83,34,92,29} +{8,52,76,80,9,55} +{80,100,2,52,24,4} +{55,15,92,27,86,50} +{83,79,41,88,86,53} +\N +{44,16,90,54} +{99,20,64} +{44,30,26,26} +{35,35,24,74,72} +{97,24,94,55} +{78,42,32,76,100,98} +{31,86,12,87,72,86} +{87,35,33,88,33} +{31,83,23} +{46,51,5,6,71,31} +{39,97,91,53,39} +{19,18,25} +{16,4} +{65,77,13} +{61,30,13,26,75} +{67,9} +\N +{31,3} +{15,19} +{97,39,71,30} +{12,96} +{36,96,82,62,5,74} +{81,22,46,11,19} +{97,55} +{58,67} +{10,68,79,74,23} +{29,71} +{50,59,8,1} +{12,51,32,7} +{62,16} +{48,82} +{84,21,24,13} +{46,86} +{100,96,32,54,13} +{72,41,3,67} +{61,9,7,75} +{39,44,50,30,38,6} +{63,63,6} +{69,35,6} +\N +{7,91,82,48,55} +{57,22,31,57} +{55,72,91} +\N +{76,98,43,71,10} +{100,34} +{78,53,14,73,23} +{42,90,28,44,44} +{90,34,22,81} +{60,32,56} +{98,53,58,58,61} +{61,70,59,78} +{2,96,27} +{83,99,25,47,13} +{17,54,11,47,70} +{70,43,11,89} +{93,70,82} +{72,57} +{35,95,49,36,19} +{82,25,16,49,43,93} +{2,51,96,48,88} +{20,81} +{74,4} +{66,83} +{90,75,98} +{25,87,59,92,55,96} +\N +{20,80,92,93} +{59,63,39,3,7,38} +{64,10,85,22} +{63,32,18,38,83} +{49,38,83,54,1} +{27,97} +{18,34,84,58,7,86} +{93,4,67} +{43,49,32} +\N +{29,14,5,50,30} +{59,15} +\N +{76,31,31,47,17,35} +{95,41,71,27} +{47,43} +{75,80,56,78} +{56,75,43} +{99,10,100,76,44,1} +{5,31,72,3,25} +{21,90,59} +{59,45,75,93,78,88} +{76,55,4} +{20,87,44,94,56,78} +{38,87,71,13,23} +{33,6,79,91,92,27} +{13,15,31,15,11} +{57,18,57,71,11} +{67,60,64} +{66,15} +{57,45,74} +{93,91,97,30,12,94} +{37,83,62,18,28} +{94,88} +{12,11,85,10} +{42,96,89} +{15,65,5,65} +{52,58,36,27,10} +{72,88,76,50,96} +{40,70,55,93} +{80,33,24} +{53,35,50} +{11,37,55} +{25,80,32,91,68} +{11,2,52,39,37} +{17,51,45,44,85,84} +{81,21,77,15} +{67,93,27,70,72,94} +{86,99,95,98,83,28} +{9,65} +{1,26,5,23,5,17} +{57,82,42,60} +{46,67,65,98,69,79} +{41,50,94} +{77,81} +{87,82,18,57} +{88,27} 
+\N +{32,58,81,88,94,90} +{23,37,65,38,29} +{61,11,65,77,25} +{50,53} +{38,2,11,9,27,94} +{64,9} +{1,45,97} +{61,41,67,46} +{13,41,90,15,80,82} +{83,6,9,22,25,37} +{95,74,22,64} +{16,17,4,80,66,33} +{25,42,43,84,96,85} +{25,93,50,87,6} +{35,67,90} +{82,37,59} +{4,44,83,2,81} +{78,46} +{64,79} +{18,41,3} +{56,20,51,83} +{26,77,52,70,93,13} +{54,53,12,47,57,63} +{94,48} +{39,12,41,5,3} +{28,33,93} +{20,29,9} +{75,38,10} +{96,54,96} +{47,87} +{19,35,11,3,80,72} +{75,56,84,24,55,48} +{58,5,13,6} +{10,53,32,6} +{23,8,59} +{71,2,35} +{41,16,99} +{77,6,16} +{30,27,56,85,11} +{47,21,93,82} +{50,68,85,34,19,57} +{14,76,58} +{78,81} +{68,99} +{19,79,67} +{91,73,82,88,44,36} +{49,18,75,32} +{54,18,99,74,9} +{51,58,60,30} +{99,86,83,22,88} +{24,42,76,30} +{86,16,54,69} +{37,39,72,45} +{92,62,3,36} +{31,80} +{43,22,11,15} +{38,88,95,25,49} +{92,21,10,28,47,55} +{99,18} +{26,64,72} +{29,12,17} +{54,69,49,84} +{57,42,4,61,10} +{60,85,74} +{24,29} +{91,71} +{96,49} +{47,51} +{88,67,59,18,86} +{32,18,64,54,41,27} +{78,100} +{77,30,85,93,2,20} +{80,90,68} +{49,37,5,42} +{88,12,94,51} +{85,65,2,41} +{60,38} +{87,37,20} +{27,81,94,37,54,84} +\N +\N +{38,74,78,78,89} +{3,100} +{42,80,68} +{34,17,96,91} +{7,29,83,71,87,26} +{28,81,76,8,43,48} +{74,11} +{28,85,84,78,59,69} +{30,22} +{3,83,75,60,78,11} +{20,62,18} +{74,69} +{91,44,50,62} +{57,18,9} +{14,48,21,33} +{91,1,53,58,92,51} +{64,41,90,52} +{81,95,38,78,7,44} +{65,25,15,90,40,51} +{66,41,31} +{5,92} +{17,98,7,57} +{97,36,99,77,50,88} +{96,56} +{40,62,88,8,53,62} +{18,91,63,93,94} +{88,79,43} +{31,87,98,85} +{12,88,58,53,77,38} +{83,10,37,69,1,7} +{13,47,66} +{85,33,39,48,6,39} +{74,87} +\N +{26,50,26} +{48,78,10,39,17} +{27,43} +{58,17,18,80} +{86,43,58,77,67} +{53,12} +{9,79,9} +{85,79,89,88} +{35,77,22,52} +{93,64} +{47,52,90,17} +{75,15,25,68,28} +{35,6,68,37,18,53} +{80,14,2,89,4} +{52,49,5,66,59,44} +{5,26,96,1,84} +{71,8,61,19,72} +{17,94,84,72,55,83} +{72,10,16,40,17,75} +{6,70,15} +{22,99,7,19} +{55,19,4} +{6,47,69,42} +{17,9,63,44,15} +{23,20,72} +{10,80,20,87} +{99,3,23} +{11,76,8,77,58,38} +{45,14} +{22,89,73,63,54,9} +{16,19} +{1,26,29} +{92,20,65,33,16,40} +{27,64} +{22,19,95} +{36,16,9,30,31,14} +{40,50,68,52,77,58} +{35,15,11} +{67,2} +\N +{63,48,76,25} +{14,65,42,60} +{61,58,31,51,70,4} +{35,41,72,29,46,62} +{98,48} +{90,23} +{1,79,80} +{10,5,54,59,16} +{15,1,9,75} +{34,100,90} +{73,76,25,77} +{98,82,77,67} +{79,5,20} +{9,69,9,52,2} +{23,22,77,80,79} +{32,51} +{23,52,5,33} +{95,20,96,78,48} +{100,37,6,15} +{98,1,53,20,97} +{5,28,26,78,33} +{19,75} +{49,42,30,72} +{50,98,56,26,87} +{76,59} +{51,16,18,48,46,97} +{80,60,51,43,58,28} +{23,12,70} +{40,16,14,18,46,21} +{72,79,47,57,23} +{7,17} +{49,95,6} +{14,24,29,13,90} +{82,28,34,55,15,87} +{31,24,3,50,45} +{86,95} +{97,22,17} +{27,14,27} +{61,63,31,74} +{55,81,87,67,90} +{81,9,79} +{100,29,43} +{41,88,37,29} +{62,57,16,91,60,65} +{94,90,34,94,27,48} +{15,36,80,30,23,90} +{47,91,6,42,93} +{53,74,5} +{84,14,56} +{30,56} +{10,12,92} +{33,7,75} +{96,39,50,77} +{89,85} +{20,39,63,22,44,18} +\N +{90,23,79,91,85,8} +\N +{73,70,52} +{75,100} +{27,4,29,96,25} +{56,31,80,59} +{1,91} +{16,67} +\N +{17,88,59,41} +{13,49,29,76,71,9} +{41,38,16,29,66,40} +{68,67} +{39,74,47,71,63,80} +{4,74,33,92} +{17,60,82,7,52} +{62,88,39,19,22} +{77,21,1,95,42,2} +{98,62} +{55,17,81,31,11,88} +{73,52,18,94} +{16,64,90,33} +{87,41,81,95,85} +{20,55,96,75} +{71,72,11,11,83} +{75,94,89,47,41,7} +{56,48} +{76,29,74,31,67} +{47,70,68,36,70} +{5,69,10,94,54,32} +{29,96,71} +{64,28,86,58} +{82,57} +{42,2} +{64,48,59,8,45} 
+{61,69,43,40,1} +{69,84} +{68,51} +{32,20} +{21,7,5,60,35} +{100,40,18,98,37} +{50,96} +{87,10,12,27} +{47,3,46,43} +{60,87,10,31} +{92,87,50,37,72,73} +{99,61,77,87,29} +{23,95,31} +{96,100,43} +{17,64,84} +{13,19,57} +{65,86,4,75,46,69} +{49,60,99,37,50,11} +{77,82,88,12} +{12,95,66,98,63} +{83,78,68} +{76,14,87,25,29,14} +{20,9,99,73,67} +{42,51} +{36,22,33,6,63} +{53,46,22} +{40,89} +{37,7,89,17} +{32,89,16} +{65,87,4} +{16,16,57,35} +{34,90} +{80,54,1} +{11,93,34} +{5,19,31,50,99,33} +{98,1,33,54,7} +{45,39,23,78} +{37,47,98,83,10,5} +{55,88} +{42,76,18,99} +{86,31,25,5,45} +{67,87,47,1} +{23,15} +{78,88,66,96} +{58,55,41,67,86,41} +{21,53} +{90,14,28,38,95,71} +{20,5,13,52,1,88} +{29,98,50,75} +{91,3,24,75,82} +{62,33} +{56,69,31,95,66} +{46,85,40} +{17,22,67,57,39,16} +{58,25,92} +{31,53,82,64,69,40} +{40,12,30,1,39} +{78,10,42,40,25} +{58,27,1,12} +{28,11,80} +{36,89,69} +{50,95} +{61,63,59,62} +{51,77} +{90,24,88,84} +{61,27,57} +{51,81,33,75,48} +{47,30,33,23,44} +\N +{79,51} +{62,44,5} +{98,83,80} +{31,33,89,69,19,40} +{22,38,61} +\N +{90,70,10} +{37,90,49,65,95,52} +{95,42,4,47} +{92,100,43,31,27,1} +{39,17,88,20,2,80} +{82,64,36,84} +{31,18,21,18,52,34} +{3,35,36,33} +{26,39,69} +{67,63,66} +{54,85} +{65,71} +{26,14,84} +{63,82,73,30} +{66,79,21} +{71,13} +{57,25,40,14,22} +{63,9,91,31} +{70,63,36,32,93} +\N +{11,85,47,31,54} +{34,47} +{42,78,92,85} +{75,64,12,83} +{33,98,52,91} +{22,25,91,79,33} +{33,51,20,77,23,65} +{83,66,46,24} +{54,65,31} +{43,41} +{52,47,66} +\N +{59,85,85,63,83,53} +{16,76} +{44,97,48,52} +{26,36,72} +{26,55,98,7} +{70,88,7,87,62,37} +{11,42} +{98,38,36,72} +{51,90,82,33,92} +{59,80,79} +{76,77,18,71} +{34,56,62} +{85,12,37,66} +{34,64,74} +{77,63,28,76,11} +{2,63,87,50} +{60,98,60,19,15,57} +{93,66,33,71,36,3} +{41,94} +{62,72,87,19} +{57,83,36} +{63,64,21,13,70,32} +{71,36,9,55,34} +{92,52,90,45,88} +{59,54} +{4,51} +{55,25,35,90,93,2} +\N +{75,15} +{25,86,43,18,77} +\N +{31,40} +{55,49} +{67,1,84,20,9} +{15,1,48,18,100} +{62,46} +{4,39,86,55} +{49,17} +{65,20,71,49,55,49} +{40,57,63,14,3} +{48,68} +{67,97,58,55,5,34} +{3,73} +{79,97} +{82,63,87,66,32} +{19,49,96,50,55} +{32,19,41} +{17,53} +{64,81,70} +{66,75,18,92,54,93} +{7,94,38,86} +{16,62,45,19,10,11} +{18,47} +{58,96,69} +{65,25,58,98} +{29,51,37,40,44} +{91,78} +{37,84,85,65} +{70,61,31,22,32,22} +{67,12,99,39,78} +{41,79,46,54,84,22} +{38,26,43,4,45,75} +{29,68,35} +{69,59,69,33} +{4,46,52,49} +{1,25,44,12,71,29} +{38,75,99} +{83,58,86,6} +{93,35,35,34} +{85,26} +{15,46,95,60} +{62,63,65,49,10} +{44,67,19,80,83} +{63,41,30,43,85} +{13,46} +\N +{13,95,1,34,72,37} +{4,32,22,47,6} +{67,65,77,3} +{40,70,22,44} +{74,9} +{44,28,5,32,67,51} +{55,14} +{41,3,72,68} +{64,82,72} +\N +{11,88} +{91,90,92} +{68,66,95,80,58,54} +{30,49,11} +{54,86,59,69,67} +{56,83,36} +{15,67,9,47} +{92,30,78,2,87} +{12,54,2,1,59,36} +{84,25,67,38,19,53} +{28,45} +{54,84,9,75,59,26} +{47,35,54,93} +{36,96,59,75} +{78,78,52,93} +{87,96,67} +{5,61,15,13,27} +{53,58,6,78,86} +{43,70} +{72,38,15,61,58} +{75,27,30,12,35,71} +{18,72,35,62,81} +{45,10} +{36,91,73,25} +{81,85,22,34,29} +\N +{15,97,82,44,19,83} +{51,23,18,6,74} +{53,75} +{62,9,73,95,37} +{58,42,33,41,71} +{5,97} +{30,2,89,81,93,61} +{32,3,18,84,24} +{6,97,20,89,23} +{27,74} +{22,86,81} +{77,19,42} +{92,9} +{58,90,59,91,30,54} +{29,51,92,34} +{85,68,59} +{36,83,75} +{37,50,86,9} +{79,70} +{33,46,93} +{97,17,6,88,10} +{18,42,88,4} +{41,95,71,27,95} +{8,2,81,56} +{54,94,54,28,70} +{34,87,20,10,5} +{36,76,87,5,100} +{97,91,25,89,89,95} +{76,26,73} +{82,23,7,42,58,72} 
+{53,16,99} +{10,34,57,47,2,96} +{81,93,26,19} +{8,1} +{79,55,37,61,62,3} +{34,16,69,58} +\N +{41,7,99,87} +{70,21,86} +{59,2,49,45,91,97} +{37,2,74,2,61,68} +{97,39,15,4,13,1} +{67,71,8} +{51,2,84,38} +{55,8} +\N +{75,27} +{37,36,49,70,82,41} +{70,20,85,89,99,90} +{69,61,100,49,75,35} +{11,4,67,4,91,17} +{77,56,65,78,25,8} +{16,58,6} +\N +{88,38,19,88,27,27} +{12,46} +{36,67} +{62,33,96,94,80,96} +{56,94,12,1,65,54} +{58,73} +{19,80,27,72} +{47,55} +{14,91} +{94,75,92,32,19} +{99,12,91,4,85} +{56,55} +{86,83,77,66,66,87} +{46,68,13,45} +{49,75,62,35,39} +{20,25,33} +\N +{91,47,56,68,14} +{88,43,24,42,4} +{50,24,94,18} +\N +{71,54,91,66,97,22} +{81,16,19,67,6} +{78,46,81} +{63,93,71,75,87} +{90,38,10,85,12} +{11,24,93,42,25,77} +{30,14,32,67,70} +\N +{86,91,77} +{73,74,64,66} +\N +{7,18} +{85,94} +{37,15,55,100,59} +{55,18,44,79,57} +\N +{52,40,97,75} +{60,53} +{38,9} +{27,67,77} +\N +{43,83,82,24,35,64} +{22,75,29} +{9,19} +{67,1} +{15,35,11} +{65,45,95} +{65,9} +{63,84,99,89,6,77} +{20,44,31} +{82,50,88} +{29,12,46,21,98,7} +{98,71,3,73,6,86} +{61,44,74,2,45,33} +{16,56} +{31,87} +{72,30,37,94} +{65,30,82,17,12} +{86,19} +{55,76,96,61} +\N +{44,92,83} +{41,22,79,95,20} +{36,33,86,9,61} +{22,88,8,57,73,30} +{63,97} +{36,53} +{56,52,48} +\N +{35,8,3,93} +\N +{53,52} +{7,48,78,46,70,14} +{33,92,55,17} +{39,57} +{71,43,72,7} +{92,85,55,38,35} +{68,30,67,8,18,92} +{9,85,82,24} +{46,46,19,14} +{96,97,31,59} +{35,99} +{54,7,20,28,29} +{20,21,56,82,19,40} +{2,39} +{33,49,63,49,93} +{35,40,26} +{30,35} +{94,70,2,23,91,74} +{34,37,72,19,15} +{92,21} +{72,63,64,35,40} +{59,11,9} +{24,3} +{93,75} +{22,14} +{63,99} +{39,47,10,14,3,45} +{51,74,5,85,70} +{6,33,15,4,89,20} +{97,82,29,15,66} +{47,47} +{88,79,57,10,68} +{18,22,13,100,100,67} +{75,50,9} +{3,12,34} +{39,51,20} +{56,5,63,18} +{83,44,86,46,37} +\N +\N +{60,16,54,75,62} +{91,95} +{39,55,11} +{37,7} +{29,49} +{38,4,52,85,67,38} +{36,56,2} +{52,14,92,39,77,16} +{42,25,49,55} +{70,10,33} +{53,46} +{83,15,28,59} +{35,69,82,4,58,46} +{73,55,64,9} +\N +\N +{60,25,8,8,39} +{50,71,61,64,64} +\N +{65,67,67,34} +{77,59,18,64,16} +{43,72,32,44,59} +{55,57} +{12,47} +{30,75,89,81} +{23,92,16,31} +{64,45,21,74,19} +{4,47,49,47,96} +{37,14,20,18,87} +{61,45,38,39,1,87} +{4,98,99,52,27} +\N +{23,6,50} +{22,61,46,79} +{90,54,60,9,49,42} +{73,27,51,72} +{73,11,23,60} +{7,31,52,34} +{27,68,39} +{39,8,21,48,64} +{86,64,92,60} +{55,36,40,46,23,46} +{32,79,86,44} +{72,29} +{33,87,57} +{57,87,61,22} +{67,84} +{32,99,26,92} +{22,27,34,82,8} +{99,25,99} +\N +{29,75} +{39,63,25,45,7} +{39,67,18,13,18} +{23,83} +{77,69,22} +{60,13,46} +{2,10,42} +{37,20,27} +{30,21} +{85,15,52} +{6,89,38} +{68,22,26,37,96} +{6,85} +{93,51,63,46,26,64} +{79,77,15,26} +{90,6,39} +\N +{50,58,85,27} +{69,8,72,47} +{7,59} +{55,16,54,95} +{96,5,50} +\N +{77,92,13} +{46,30} +{43,65} +{17,65,32} +{10,6,46,1,47,75} +{48,82,71} +{63,12} +{68,14,10,97,34} +{15,45,58,100,7,74} +{9,23,88,1,95} +{61,60,15,12,58} +{84,51,46,41,71,26} +{58,62,39} +{86,67,31} +{32,31,89,2,30} +\N +{90,74} +{65,79,76} +{22,30,77,47,40,23} +{67,99,56,73} +{11,24,30,93,89} +{70,17,65,78} +{100,6,67,29} +{39,4,22,59} +{84,29,70,9} +{74,43,72,27,55,27} +{12,39} +{1,83,100} +{48,23,9} +{21,88,21,35,16} +{92,34,44} +{91,96,13} +{93,57,40,79,81} +{86,3,94,82,43} +{78,70,19,97,49} +{47,22,98,36} +{20,59,65,54,81,27} +{58,13,73,19,54,96} +{26,20} +{70,75,14,70,82} +{77,67,53,33,83} +{2,43,36} +{84,17,28} +{68,25,95,62,92} +{47,90,15,69,85,23} +{92,92,24,37} +{96,14,14,38,38} +{80,4} +{66,86,28,15} +{18,90,74} +{93,76} 
+{64,96,14} +{76,41,86,67,64} +{58,95,2,86} +{12,60,96,70} +{22,37,58} +{1,67} +{75,23,24,7} +{3,57,66} +{57,30,68,100} +{68,57,33} +{26,32,65,51,75} +{40,14,60,97,83} +{88,96,42} +{66,21,21,78,34} +{15,56} +{86,60,66,66,16} +{94,6,58} +{99,63,70,57,10} +{82,59,62,38,82,51} +{48,61,9,46,28,57} +{29,23,61} +{12,30,42,20} +{99,65,24,7,97} +{20,5} +{6,49,85,56,97,4} +{62,93,88,86,75,29} +{46,2,94} +{57,71,45} +{38,60,21,78} +{95,53,92} +{61,1,88} +{67,80,49} +{59,82,1,48} +{19,94} +{25,64,16} +{96,73,50,85} +{28,17,46} +{81,51,50,18} +{57,99,66,93} +\N +{23,62,57,94,40} +{21,6,83} +{4,11} +{83,16,50} +{46,41,23,1} +{4,15,8} +{86,51,29,80} +{48,34,55,81,89} +{5,2,43,67,66} +{42,59,37,91,1} +{14,98,27,80,33} +{18,58} +{49,93,60,91,94,88} +{32,62,64,63,48} +{51,1,90} +{56,8,68,49} +{16,34,79,18,76} +{66,88,41} +{31,66,93,44,96,40} +{100,99,30} +{37,49,95,91,18,43} +{95,2,94} +{84,15,70,31,30,84} +{31,41,45} +{9,73,2,7,34} +{17,35,43,1,25,72} +{8,70,8} +{1,93,32,16,71,61} +{98,51,27,56,46,65} +{1,11,57,72,33,7} +{48,96,64,55,75} +{83,82} +{7,74,70,29,59,60} +{29,44,5,77,52} +{84,58} +{87,63,62,52,69} +{29,58,32,11,13,17} +{35,99,67,67,93} +{54,31} +{53,24} +{58,59,32,22} +{8,76,23,63,94,54} +{3,88,75,17,64,91} +{29,30} +{3,81,39,9,77,82} +{77,85,59,56,8} +{47,12,63,13,40} +{66,81} +{67,33} +{39,46,28,79,95,67} +{49,13,98,63,10,58} +{14,42} +{80,70,60,92} +{63,54} +{30,70} +{60,89,14,62} +{56,40,94,55} +{70,31,46,20,95} +{18,65,89,7,75} +{60,33,80,43,37,4} +{85,19,98,79,36,84} +{69,1,48} +\N +{30,87,9,22,99,60} +\N +{23,96,9,85} +{22,94,39,58} +{30,38,4,97} +{16,70,62,5} +{35,52} +{32,10,72} +{35,34,40,31,66,80} +{7,77,14,48,97} +{67,64,37,22,69} +{51,53} +{67,71,90} +{87,71,45} +{44,84} +{19,58,11,34,45,85} +{68,19,55} +{27,16} +{7,14,92,22,33,46} +{47,2,49,53,63,32} +{15,39} +{13,47,84} +{29,74,97} +{51,74} +{70,26,46,33,51} +{31,86,14,23,61} +{20,85} +{21,10,57} +{90,94,59,72,97} +{97,30,74,84} +{15,89,69} +{11,40,2} +{68,19,47,28} +{47,65} +{2,7,52,53,44} +{40,74,34,36,78,71} +{22,60} +\N +{37,75,47} +{53,78,2} +{4,32,42} +{35,76,69,88} +{95,13,3,38,3} +{74,74,62,90} +{8,72,42,2} +{11,43,5,43,70,16} +{69,19} +{61,37,26,49} +{16,100,69,32,35} +{58,77,26,76} +{74,87,37,47,84} +{8,82,29,93,15} +{74,88,93,85,97,95} +\N +{29,23,99,98,36,93} +{8,36,87,64} +{71,90,43} +{7,28,78,46,52} +{62,25} +{33,90,7} +{60,72,39,18,86} +{98,59,73,24} +{17,69,2} +{49,16,63,56} +{13,37,62,1,95} +{98,89,69,92} +{50,26,34} +{90,16} +\N +{40,54,3,79,51,19} +{29,24} +{6,12,82,24} +{92,52} +{89,2} +{64,25,68,55,81,2} +{64,77} +{71,46,58,50,56,34} +{94,17,35,30,60,33} +{37,30,2,40} +{98,15,16,92,2,50} +{44,19,82,57} +{37,34,6} +{59,43,1,53,79} +{7,37,14,14,92} +{80,78,49,81,23,17} +\N +{91,51,12,35,79} +{9,14,2,84} +{62,3,77} +{25,5,40,12,40,79} +{65,88,82,94,89,90} +{20,35} +{80,71,83} +{6,9,83} +{94,58} +{2,76,55,61,42,53} +{60,53,45,82,3} +{1,37,75,96} +{82,61,81,10} +{36,46,1,31,90,45} +{22,55,11,25,21} +{69,13,29,20} +{95,54} +{16,79,82,67} +{4,58,84,84} +{52,7} +{25,14,94} +{69,8,67,54} +{30,71,36} +{81,78,23,38,76,58} +{86,59,61} +{11,42,63,74,99} +{66,4,55,34,16} +{39,57} +{10,81,9,8,21,10} +{75,55,64,97,7,45} +{8,46,86} +{39,100,52} +{30,51,7,13,54} +{72,85} +{10,52} +\N +{61,7} +{93,1} +\N +{74,31,3} +{90,96,26,84} +{88,58,74} +{28,45,74,24,74} +{95,88} +{42,70,43,64,22} +{46,83,48,36} +{81,99,100,43,11} +{47,24} +{46,67,63} +{26,15,36,89} +{90,11,78,70,81,87} +{65,90} +{89,99,21,81,47,38} +{37,42} +\N +{94,51} +{12,57,95,63,29} +\N +{68,99} +{27,8} +{16,52,11} +{72,5,85,44,57,51} +{11,6,91,7} +{87,80} 
+{94,61,1,38,77,89} +{93,60,6,98,46} +{52,47,44} +{93,66,61,22} +{7,61} +{15,83,93,91,12,40} +{66,3,5,72,72,36} +{67,72,68} +{42,42} +{38,17} +{75,60,47,39} +{58,28,51} +{61,8,61,81,65} +{46,52,97,84,27,47} +{97,53,47} +{64,93,83,72,27} +{34,79,34,36} +{25,5,92,37} +{12,20,55,94} +{17,43} +{39,37,16,70} +{79,62,15,16,64,28} +{80,87,96,41} +{51,55,1,94,72} +{75,22,56} +{2,55,7,20,39} +{8,91} +{73,8,42,73,31} +{90,90,23} +{82,68} +{63,64,68,12,59,19} +{100,80,23,24,17} +{23,46} +{25,13,31} +{43,95,54,85} +{40,62,21,21,82} +{70,20,16} +{90,11,23,18} +{16,9} +{51,57,30,27,21} +{50,55,75,77,53,33} +{84,92} +{14,66,32} +{44,100,16,30,82} +{41,48,58,60,7,44} +{81,76,13} +{18,26,82} +{84,35,15,98} +{52,84} +{13,80,36,35,28} +{91,16,71,55} +{87,89,6,20,28} +{12,75,92} +{48,41,55} +{59,75,26} +{48,19,48,72} +{91,4,100,25,17} +{46,52,97,78,94} +{7,81,76} +{54,54,49} +{89,37} +{78,22,57} +{75,25,83} +{25,89,10,38,96} +{52,12,1,74,35} +{13,48,88,7} +{6,97,20,19,91} +{53,2,99,76} +{4,58,46} +{30,30,89} +{97,2,87,47,55} +{14,11,72,83,97,74} +{44,69,11,51} +{47,17,86,27} +{15,19,56,96,24,94} +{81,67} +{11,11} +{20,94,49,36,39} +{39,78,40,46} +{33,87} +{76,89,58} +{94,74,25} +{33,77,5,47,55} +{28,67,99,81,93,83} +{31,10,19,65,60} +{53,25,74,24,48} +{73,69,23,45,88} +{70,56,41} +{21,73,72,28,99,5} +{75,69} +{78,99} +{66,49,89,86,2} +{30,53,18,21} +{67,69} +{1,98,38} +{91,25,16,39} +\N +{75,54,93,39,18} +{96,84} +\N +{64,71} +{6,15,78,50} +{8,45,26,15,25} +{8,90,94} +{52,66,13,98,86,69} +{3,25,28,56,88} +{84,72,89} +{10,33,46,6,57,100} +{13,91,99,2,49} +{83,59} +{88,64,42,50,77,16} +{81,12,27,45} +{12,17,31,93,22,53} +\N +{28,84,85,35,3} +\N +{42,12,86,76,37,63} +{46,23,18} +{45,80,76} +{94,18,100} +{17,80,84,80} +{84,88,29,16,10} +{7,42,90,51,33,40} +{79,51,22,2} +{31,30,72,24,23,84} +\N +{55,50} +{69,47,82,29,83} +{94,56,69,18} +{7,81,71} +{95,13,32} +{66,59,68,62} +{52,19,62,51,80,32} +{38,18} +{73,24,81,58,22} +{11,59,41,19,96} +{61,11,56,89,89} +{61,72,9} +{63,12,8} +{76,54,75,84,6,44} +{74,3,11,62,30,74} +{46,60,71,55} +{28,47,52,71,33,33} +{35,51,37} +{38,89,40,29,30} +{18,26,99,89} +{36,44,8,100,72} +{1,23,6,5,23} +\N +{84,17,16,44,81} +{29,70,24,85} +{23,57} +{20,98,30,23,1,66} +{82,3} +{70,7} +{15,49,58} +{19,40,70,88,35} +{45,10} +{62,89,47,71,55} +{34,100,88,44,3,91} +{92,65,16,24,7,9} +{20,12,61,95,7} +\N +{57,49,42,87,88,14} +{89,99,86,31} +{32,55,51,78} +{55,66,78,10,12} +{37,19} +{13,5,36,66} +{89,7,40,45} +{41,58,41,24,11} +{98,8,9,27,40} +{49,83,89} +{91,36,78,44,100,62} +{76,78,9,52,57,27} +{100,59,37} +{51,1} +{92,83} +{45,1,85} +{8,81,45,94,32} +{52,26,9,98,7} +{97,52,4,85,13,11} +{94,38,58,4,72,91} +{5,39,26,14,74,51} +{31,44,37,24,89} +{8,74} +{56,93,36,3} +{23,46,25,90,42} +{4,98} +{31,95,27,26,20} +{3,7,79,9,90} +{29,22} +\N +{35,34} +{80,28,12,21} +\N +\N +\N +{36,49,94,83,25,9} +{6,62,89,93,59} +{67,75,3,93} +\N +{94,62,3} +{97,36} +{43,89,26,94} +{46,56,22} +{50,15} +{45,47,39,61} +{23,32,24,45,43,11} +{97,66,29,8,52,67} +{37,1,48} +{30,84,86,91} +{4,46,59,35} +{76,37,41,90} +{26,28,92,27,88,17} +{76,37,27,41} +{74,51,31} +{16,33} +{66,85,68} +{4,81,72,62} +{65,14} +\N +{11,43,28,14,9,43} +{60,88,95,1} +{52,92,69,48} +{37,81,85} +{57,73,8,79} +{50,26} +{52,41,99,6,33} +{9,34,58,22,9} +{56,37,19,77,50} +{93,21,18,90,41,40} +{28,89,76} +{4,36} +{89,54} +{70,28} +{66,11,3,47,30,43} +{69,54,86} +{45,41,57,34,18} +{91,46,32,68,42,68} +{25,87} +{75,57,12} +{55,15,68} +{6,63} +{22,39,88} +{77,39,10} +{39,49,69,61,66,77} +{78,25,42,73,89} +{17,47,36,27,79} +{33,83,44} +{27,75,12,96,94,87} 
+{50,17,95,42,25} +{67,13,22} +{59,85,95,2} +{81,57,83} +{25,11,72} +{32,84,97,6,65,52} +{62,25,24,27,50} +{80,64,23,74,54,75} +{97,17,15,100} +{50,11,41} +{57,82,40} +{10,90,41,52,39} +{4,11,86} +{79,17,51} +{48,100,92,77,58} +{88,67,19} +{40,96,52,35,16} +{89,63,32,81,28,63} +{44,56,66,50,55} +{28,73,46} +{32,40} +{52,65,85} +\N +{51,34,18,82,83} +{49,49,90,71} +{84,16,74,78,86,10} +\N +{73,9,47} +{51,59,49,90} +{85,13,78} +{98,77,18,15,92,85} +{40,94,66,94} +{89,51,80,12} +{23,26,75,17} +{96,2,51} +{88,62,90,32} +{85,19,87,89,30,15} +{33,38,9,46,19,87} +{27,45,15} +{39,79,82,88} +{31,33} +{41,64,10,1} +{35,61,22,76,74} +{75,11,90,16} +{71,23,43} +{35,3,97} +{88,4,97} +{100,61,28} +\N +{64,74} +{9,44,81,98,55} +\N +\N +{76,89} +{18,34,80} +{77,83,91,50,20,41} +{65,50,26,65} +{79,18,90} +{5,60} +{42,21} +{31,70,80} +{20,98,15,14} +{58,65,45,6,64} +\N +\N +{88,82,98} +{75,81,32,34,59} +{37,14} +{30,36,55,70,65} +{84,55,26} +{56,64,1} +{31,41,89} +{46,43,43,90,34,100} +{78,36,21,14,69} +{100,10,45} +{73,69} +{60,86,5,70,78,99} +{6,89,92,8} +{86,68} +{44,4,71} +{41,36} +{95,80,42,94,34} +{73,29,50,49} +{61,20,57,17,36} +{37,58,67} +{56,83,77,37} +{98,67,40,10,35,76} +{54,84,6} +{7,71} +{65,74,43,6} +{62,98,74} +{81,26,17,22,47} +{49,32,59,35,11,94} +{80,50} +{91,1,50,97} +{71,35,84} +{97,4,46,45,8,36} +\N +{81,62,76} +{69,78} +{89,3,16,64,17,17} +{78,72,26,88,81} +{25,34,9} +{50,27,34} +\N +{55,44} +{61,51,39,53,44,46} +{23,94,32,92,90} +{91,47,67} +{1,13,76,57,63} +{77,19,73,18,95} +{100,82,87,6,83,23} +{69,58,48,97,60,50} +{4,83,85,6} +{3,5,91,37,94} +{91,72,31,32,80} +{57,23,39,46,50,20} +{92,28,53} +{71,27,46} +\N +{59,73,29,21,89,30} +{1,83,96} +{34,41,65,35} +{52,89} +{62,68,80,7} +{82,82} +\N +{11,2,62,46,28,9} +{9,16} +\N +{22,44,73,82,39,86} +{97,52} +{46,36,76,56} +{17,97,26,72} +{16,78,9,70} +{65,55,30,91,16} +{27,45,76} +{17,5,90} +{86,52,93,89,42,27} +{51,70,41,35,1} +{91,57,66} +{53,59,62,57,52,56} +{100,100} +{32,78,87} +{61,57,74} +{86,79} +{55,94,64} +{81,20,26,22,23} +{9,96} +{86,65,35,19,88} +{1,37,90,51} +{79,47} +{93,93} +{32,85,69} +{49,12,6,69} +{6,44,92,37} +{28,42,57,28,2,69} +\N +{63,90,25} +{53,28,74,42} +{83,79,94,37,64,69} +{93,100,57,84,80} +{39,93,80} +{97,92,55} +{27,6} +{20,100} +{19,66,3,66} +{7,76,15} +{7,56,92,11} +{61,76,6,98,52} +{20,46,51} +{12,77,45,67} +{78,79,32,22,21,47} +{62,35,1} +{86,66,57,10,47,43} +{43,24,76,18,87,68} +{39,52,71,35,87} +{81,78,8,10} +{33,70,53,54} +{25,77,27,68,95} +{29,53,89,62,51} +{21,76,33,72,39} +{13,22} +{1,1,51,73,20} +{26,97} +{64,75,23,94,62,68} +{25,20,84,57,27} +{26,7} +{92,80,17,48,72,73} +{73,49,88} +{24,36,70,53} +{7,79} +{80,58,33,25,91} +{19,43,61} +{54,49,73} +{51,88,4} +{9,32,5,83} +{17,68,90,15,30} +{98,50,42} +{29,52} +{32,41,4} +{33,97,69,34} +{94,2,60,5,83} +{23,86,43,74,35} +{63,37,38,58,39,14} +{56,7,82} +{88,81} +{50,75} +{78,49,67,68} +{10,61,58} +{84,35,20,30} +{36,34,48,31,16} +{35,7,47,22} +{98,40,56,43} +{16,4,7,9,44,55} +{86,90,30,80,47,91} +{34,91} +\N +{12,67,77,23,11} +{94,8} +{5,68,31,82} +{26,65} +{51,19,86} +{55,83,39,39,96,51} +{31,22,70} +{20,50,15,93} +{1,55,64} +{8,2,14,3,40} +{2,71,25,41,5,5} +{98,61} +{21,64} +{100,76,99,18,78} +{17,4,69,97,61} +{52,79,97} +{52,26} +\N +{90,54,2,62,11,51} +{33,12,34,45,2} +{91,63,51,42,82} +{100,79,73,70,54,14} +{57,94,81,55} +{13,18,94,17,16,34} +{58,79} +{90,64,68,46,95} +\N +{37,46} +{91,94,10,85,100,24} +{65,86} +{94,89,7} +{72,79,77,53,95} +{65,19,92} +{41,79,53,8,63} +{28,60,50,42,9,32} +\N +{6,23,97,23,10} +{12,28,16,39,70,50} 
+{26,97,61,48,79,23} +{38,98,21,34,65,89} +{29,13,36,19,13,45} +{72,65,58,81} +{43,98,84,5} +{79,41,100} +{35,30,69,42} +{59,13} +{65,90} +{40,38,21,23} +{2,19,26,38,66} +{5,16} +{84,85,97,84} +{34,26} +{87,17,21,32,29,25} +{75,66,87,90,18} +{84,32,29,51,71,68} +{57,25,73,24,53,2} +{74,16,92} +{99,60,19} +{98,14,70,72} +{24,34} +{37,34,81,100} +{67,10,17,60,16,55} +{39,58,5,23,85,95} +{75,93,19,31,47} +{13,27} +{42,14,32,90} +{59,79,70} +{48,96,45,38,58} +{96,87,84} +{23,70} +{25,31,81,36,75,32} +{64,49} +{30,18,38} +{69,27} +{76,82,43,96,73,17} +{84,95,97,12,20} +{57,69,36} +{60,79,19,67,9,12} +{32,39,3,21} +{55,83,51,48} +\N +{37,11,98,53,11} +{2,73,24,18,77,74} +{69,96,17,49} +{53,2} +{1,76,72} +{35,93} +{35,36,36,25} +{59,77,30,13} +{35,69,36,31} +\N +{20,23,51} +{81,83,57} +{87,43,40,56,81,64} +{24,63} +{29,51,45,93} +{73,85} +{59,1} +\N +{13,57,14,11,34,91} +{69,1,4,28,77} +{63,68,41,53,64,43} +{11,1,46,40,6,88} +{51,19,77,10,86,66} +{74,40} +{25,54,46,62} +{94,17,64,15,20,36} +{100,71} +{63,66} +{33,88,5,92} +{92,86} +{91,69,75,13,20} +{57,22,32,33} +{72,87,44,64,46,6} +{50,56} +{36,23,7} +{74,63,3,6,14,29} +{91,42,8,11,49} +{32,64,94,88} +{91,78,55,27,59} +{2,20} +{52,95} +{57,59,35} +{51,15,52,24,14,13} +{64,16,18} +{50,98,71,10} +{92,99,92,80,77,73} +{96,12,70,85,54,73} +{10,44,30,77} +{29,47} +{40,55,62,58,30} +{59,93,7,21,6,20} +{58,91} +{5,70} +{36,23,58,80} +{16,93,54} +{20,8,97} +{78,32} +{10,31} +{24,10} +{56,14,28,10,45} +{1,79,53} +{56,58,86} +{93,83,17,89,93} +{12,4,26,45,97,17} +{42,67,17,13} +{31,90,59,38,4,20} +{86,52,67,10} +\N +{49,59,10,25} +{69,88,31,38,7,36} +{84,21,57} +\N +{60,8,19} +{35,81,66,96} +{13,95,54,38,31} +{27,25,34,11,65,64} +{54,43,20,20,65,95} +\N +{19,27,100,69,43} +{91,8} +{30,65,98,87,84} +{83,85,100,16,20,18} +{80,48,56} +{61,5,92} +{14,94,43,91} +{35,52,60,43} +{73,25,26,61} +{66,41,39,16} +{2,96,90,37,99,92} +{25,31} +{72,57,50,82} +{40,69,5} +{98,34,66} +{90,44} +{34,78,93,15,65,71} +{98,1,28,36} +{16,59,79} +{88,1,14,45} +{41,91,87,20,72} +{46,9,81,90,63,32} +{2,84,29,56} +{2,57,92,69,63,46} +{3,32,76,62,36} +{11,81,3,81,90,16} +{36,1,42,51} +{29,86,53,51,85} +{17,66,16} +{4,21,25,17,65,92} +{13,26,33} +{74,6,46} +{69,19} +{47,78,85,46,41} +{41,62,100,85} +{22,71,66} +{28,15,58,84,22,92} +{68,82,82,85,15,54} +{34,58,72,35,99} +{51,100,40,13,61} +{80,89,94,31,96} +{48,29,33} +{32,85,75} +{76,43,17} +{79,70,3,64} +{76,64,85} +{94,90,3,85} +{86,21,1,8,54} +{87,92,30,36,59} +{20,51,62,17} +{81,61,86,96,47,67} +{5,75,97} +{60,24,79,3} +{85,49,49,48,61} +{66,60,58,92,88,90} +{2,18} +{42,54} +{42,83,68} +{98,76,42,25,90,32} +{64,36,39,45,34,95} +{56,43,78,10,63,18} +{51,40,98} +{85,11,74,41,14,25} +{37,12} +{76,32} +{6,77,16,86,36,25} +{23,93,18} +{75,51,67,29} +{22,9} +{18,58,25,88} +{95,31,12,20,62,54} +{23,97,89,63,73} +{77,41,11,27} +{91,86,97,98} +{84,6} +{74,69,55} +{58,42,92,65,52} +{77,31} +{8,91} +{5,83} +{64,48} +{1,37} +{51,4,49,97,64} +{29,70,3,62,57,1} +{91,8,31} +{86,71} +\N +{61,50,8,6,59,76} +{83,8,54} +{50,45,66,86,52} +{75,48,18,88,82} +{1,52,60,78,45} +{46,65} +{53,2,63} +\N +{89,97} +\N +{75,23} +{30,58,13,50,2} +{59,73,52,8,70,39} +{20,35,77,34,10} +{55,86,14,74,14} +{67,46,48} +{20,9} +{20,92,62,75,34,98} +\N +{72,14,18,46} +{48,71,92,17,31,13} +{47,46,42} +{42,75,14,86} +{97,55,97,78,72} +{8,4,96} +{44,13,13,18,15} +{16,40,87} +{87,66,79} +{14,44} +{35,79,25,57,99,96} +{23,66} +{90,49,24,11,8} +{50,3,24,55} +{60,94,68,36} +{11,20,83} +{66,100,47,4,56,38} +{36,34,69} +{41,57,15,32,84} +{32,25,100,45,44,44} +{70,32} 
+{15,37,67,63,71,34} +\N +{81,62,20,91} +{32,62,1,68,86,54} +{20,91,40} +{79,69,22,98,14} +{45,42,24,2} +{30,53,15,62} +{81,100,42,20,96,42} +{93,19,7,59,100,49} +{25,7,18,64} +{11,27,1} +{89,67,65} +{39,97} +{47,62,30,61,58} +{4,11,83,18} +{38,30,95,58,13,81} +{83,6,33,73,64} +{89,51,77,45,58,16} +{13,11,88} +{96,79,71} +\N +{18,66,83,52,84,76} +{52,17} +{74,95,16,5,16,51} +{21,20,16,39,84,71} +\N +{75,47,36} +{65,45,12,5,100} +{41,74,84,21,73} +\N +{8,90,46,39,30} +{47,84,42,49,17} +{76,100,35,89,17} +{61,53,50,31,8} +{94,53,20,33,15} +{97,46,62,85,74} +{8,59,40} +{95,71,21,41,85,81} +{55,71,20,74} +{70,95} +{61,42} +{83,74,25,84,18} +{56,43,46,40} +{42,78} +{95,48,98,93,35,98} +{77,34} +{4,54,58} +\N +{13,54} +{87,66} +{12,88,90,95,6,95} +{65,20,10} +{62,74,59} +{49,17,51} +{14,17,65,3,27,41} +{43,42,43,46,79} +{88,75} +{21,46,84,95,31} +{17,17,28} +{32,73,29,11,46,94} +{3,34,81} +{80,83,1,92,69,100} +{9,24,56,17} +{3,80,57,36,14,94} +{39,89,54,17,31} +{70,19,67,21,31,72} +{82,48,68,52} +{96,81} +{92,18,39,50,18} +{6,54,27,52,28,100} +{23,40,7,74,93,50} +{87,51,38,88} +{98,42,43,30,8,71} +{33,26} +{20,21,83,35,99,100} +{28,77,94,32,1,13} +{17,15} +{35,100,9} +{42,6} +{16,28,55} +{7,94,81,60,91} +{100,63,21,28} +{65,20,35,16,76} +{95,3,88,62,1} +{73,44,46,13,55,69} +\N +{60,49,71,77} +{93,39,75,63,69} +{97,36} +\N +{77,16} +\N +{57,30} +{39,31,56,51} +{62,78,62,38,54} +{69,86,99,10,12} +{11,43} +{60,70,83} +{83,82,3,1,60} +{24,55,61,85} +{65,72,13,77,79,100} +\N +\N +{28,97,71,78,68,95} +{34,1,72,79,84} +{10,49,91,44,27,51} +{15,48,80,37,69} +{42,46,32,34,86} +{80,21,26,50,5,8} +{61,71,100,78,54,50} +{36,20,80} +{67,40,47,68} +{60,7,36,36,55,2} +{32,91,13,98,88} +{15,56,65,23,13} +{20,66,81} +{19,36,99,54,86,92} +{82,28} +{43,32,91,37,70,68} +{71,78,82,50} +{1,31,23,48,10,12} +{88,96,1,44} +{27,49,97,29,89,35} +{63,72,58} +{79,9,32,64} +{75,67} +{46,31,83,54} +{66,24,6,89} +{82,10,64} +\N +{19,31,52,34,89} +{16,36,11,12,23} +{55,50,6,20} +{81,72} +{71,74,8,6,31} +{6,20,96,80} +{95,85,56,91} +{36,33,88,12,50} +{77,44,52,50,50} +{94,12,7} +{97,44,40,43,8,21} +\N +{61,14,40,75,87} +{43,21,67,66} +{46,19,80,12,46,28} +{56,11,14,59} +{31,94,50} +{45,26,61,15} +{84,45,44,82} +{9,16,86,54,93,30} +{50,39,37} +{35,60,64,55,73,90} +{61,65,87,20,30} +{12,59,44} +{23,8,97} +{30,59,7} +{85,32,14,95,38} +{18,91} +{10,40,20,8,58} +{5,58,4,94} +{100,11,96,70} +{66,72,7} +{5,31,89,89,4} +{81,68,44,37} +{22,22,76,67,72} +{22,26,30} +{73,47,27,18,54,30} +{44,13,73,95,83} +{18,93,72} +{30,22,73,13,16} +{14,11,66} +{45,33,59,72,92,81} +{97,82} +{30,4} +{1,9,46,70} +{47,50,20,71,48,60} +{26,62,53,70,63,49} +{39,26} +{47,94,9} +{55,3,18,1,75,22} +{42,87,74,57,60,55} +{95,46,21,38,27} +\N +{13,35,48} +{24,39,24,67} +{44,83,49,72} +{22,8} +{77,39,87} +{37,41,44} +{100,57} +{48,54,58,79} +{14,84,40} +{11,51} +{23,80} +{80,82,43,59,2} +{92,53,56,44,90,66} +{44,67,78,9} +{43,91} +{70,74,100,69} +{12,5,75} +{65,51,22,65,56,36} +{52,54} +{38,78} +{30,45,38,99} +{18,88,88,63,51} +{61,24,53} +{72,24,77} +{61,46} +{11,83,49,86,27,60} +{86,60,83,34,33,28} +{65,15,10,51} +{98,92} +{49,49,60,3} +{58,56,43} +{19,25,15} +{24,40,36,49,61} +{5,62,9} +{72,8,71} +{64,85} +{72,84,67} +\N +\N +{80,87,30,70,21} +{30,86,95,19,21} +{17,90,15,89,81} +{40,51} +{77,88} +{14,89,82,62} +{40,66,93,16,55,45} +{22,46,31,17,4,71} +{8,41,88,94,25,61} +{80,8,23,71,59,53} +{61,70,23} +{2,4,79,6,67} +{27,70,42,68,33} +{46,27,10} +{1,93,42,12,8} +{31,9,19,32,62,15} +{16,42,81} +{56,29,12,17,61} +{52,100,98,42} +\N +{29,38} +{49,40,47,63,22,4} 
+{99,70,13} +{70,28,67,100} +{37,75,65,63,35} +{45,67,37,28} +{42,78,71,39} +{33,35,76,69} +{65,84,57,63} +{17,12,86,23} +{31,62,79} +{3,22} +{85,81,59} +{38,5,15,100,1,27} +{36,96,93,46,75} +{44,61,85,70,71} +{79,72,86,71,77,9} +{23,51,47} +{4,59,48,38,44} +{93,54,86,98} +{60,29} +{49,38} +{54,84} +{72,25} +{51,40,25,27,68} +{24,17} +{95,3,82,31,95} +{56,37,57} +{15,84,98,16,53} +{47,36,15} +{27,36,76} +{38,82,26} +{47,70} +{60,89} +{59,73,99,7,28,89} +{87,49,70,76} +{71,93,76,81,11,46} +{74,87,92,24,43,22} +\N +{26,1,85} +{18,73,43,94} +{92,2,73} +{5,58,85} +{20,7,39,18,59,90} +{11,16,19,77,60,56} +{77,1,95} +{4,4,11} +{48,40,56,74,96,29} +{71,1,62,69} +\N +{34,61,26} +{86,75,13,73,28} +{17,35} +{100,29,37,26,47} +{69,36,52,61} +\N +{81,51,54} +{54,78,46} +{1,78,96} +{33,54} +{72,9,37,30,100} +{67,10,52} +{77,19,74} +{52,27,41,37,98,73} +{8,74,86} +{4,40,99,6,59} +\N +{98,43} +{74,91} +{69,45,73,59,19} +{87,43,31,85} +{2,51,54,3} +{45,73,8,86,4,40} +{2,51,96} +{74,5,8,64,1,46} +{5,64,86,63,12,75} +{6,62,71,24} +{56,84,54} +{61,37,79,63} +{81,39,78,23,86,74} +{50,79,34,23} +{85,36,78,80,19} +{34,94,1,46} +{5,23,38,4,78,2} +{85,100,80,13,73} +{48,86,9} +{47,22,65} +{49,81,18,52,36} +{84,85} +{89,15,71,88,44} +{1,21,81,52,2} +{53,18,7,53,50,11} +{91,89} +\N +{20,6,20,70,12,32} +{98,94,70,52,41,35} +{43,25,2,63} +{95,86,6,82,2,41} +{79,24,63} +{12,96,7,18,48,67} +{55,35,4,75,28,39} +{48,46,33,75} +{10,99,5,5,98,25} +{43,87,5,53,76,64} +\N +{100,13,9,4} +{4,35,65,56} +{27,74,88} +{59,66,10} +\N +{59,85,39,48,17,29} +{59,42,17} +{27,99,12,21} +{9,10} +{15,4,80,25,67,59} +{12,89,96} +{50,32,92,49} +{40,74,10,6,26,43} +{80,71,29,54} +{74,82} +{22,25,27,65,12} +{84,88,53,43,75} +{84,16,51,84,46} +{10,9,44,95} +{87,19,22,10,44,80} +{18,20,87,41,86} +\N +{9,64,4,33} +{65,87,23,65,32,92} +{50,2,23,68} +{29,8,82,28} +{54,92,6,2,28,70} +{23,11,65,78,34} +{77,85} +{30,49,59,8,60} +{77,30,34} +{55,73} +{89,68,55,81,8,81} +{54,28} +{35,22,67,63,48} +{43,37,46,56,81} +{16,78,32,81,77,37} +{35,80,41,76} +{4,93} +{3,32,23} +{43,18,50} +{87,5} +{30,40,91} +{36,69,17,82,70,57} +{73,71,47,63,58} +{24,11,36} +{2,72,61,76,9} +{61,97,10,85,92,56} +{5,44,47} +{24,57,79} +{69,39,97,8} +{78,16} +{62,52,17,35,28} +{48,79,66,64,36} +{14,72,75,30} +{17,21,41,25} +{28,100,66,56,15} +{89,3,32,86,6} +{67,34,16} +\N +{48,27,70,60,1,40} +{69,34,36,46,95} +{59,24,84} +{44,21,90} +{22,30,5,62,13,58} +{79,67,44,10,1} +{67,8} +{40,48} +{64,5,65,35} +{74,45,75,15,31,69} +{42,3,49,33,52,97} +{86,59,69,84,53} +{64,64,41,64,99} +{47,95,16,78,73,68} +{54,11,52,90} +{54,62,79,58,96,59} +{28,34} +{52,94,17,42,9} +{94,22,77,7,56} +{72,24,47} +{6,11,3,23} +{9,6,97,82,40,39} +{73,47,57,8,7,97} +{27,26,1,2} +{64,45,38} +{71,6,6,83,33} +{78,28,40} +{25,8,17,15} +{24,67,53} +{72,42} +{66,25,56,36,32,93} +{18,11,22} +{88,9,75,23} +{20,32,24,44,51,34} +{76,86,11,7,1,61} +{11,77,41,55,87,59} +{62,53,94,46} +{77,20} +{74,97,59,78,9} +{7,94,26,18,77} +\N +{49,59} +{72,22,42,89,14,80} +{49,14,38,19} +{43,88,25,58,39,24} +{21,34,37,65} +{85,3,46} +\N +{11,60,86,65,49,83} +{51,98,7,28} +{85,17,34,59,14,86} +{89,81,48} +{67,40,11,60,75} +{13,45,42,22,82,82} +{98,21,89} +{30,63} +{35,45,68} +{9,29} +{43,71} +{82,44,59,72,48} +{1,48,29,44,14,11} +{75,33,85} +{7,32,92} +{62,14} +{29,31,1,36,51} +{92,12,28,20} +{13,67} +{88,72,14,22,61,42} +{15,98,49} +{65,27,9,76} +\N +{15,95,26,12,52,40} +{17,20,74} +{57,63,15,22,38} +{93,71,8} +{26,84,82} +{20,52,3,3} +{72,95} +{10,9,80} +{9,9,18,51} +{74,24,63,63,57,89} +{64,91,95,18,15} +{64,37,20,36,74} 
+{52,9,53,6} +{17,31,42} +{3,73,92,13,62} +{57,81,58,49} +{52,56,2,26,18} +\N +\N +{90,90} +{16,92} +{66,51,7,19,10} +{100,81,69,86,95} +{48,64,81} +{87,54,73} +{6,80,100,24,26,8} +{44,67} +{27,94,2,25,34} +{80,25} +{12,2,77,75,15} +{63,14,30} +{85,75,59} +{72,73,54,44,25,76} +{95,44,69,91,62} +{94,73,78,5} +{28,52} +{86,31} +{69,90,95,66} +{6,10} +{68,72,112} +{9,165} +{91,132,164} +{57,82,144,167,184} +{3,6,101,118} +{111,158} +{22,29,30,174} +{41,66} +{39,76,189} +{7,20,21,196} +{52,126,169,171,184} +{21,77,91,176,196} +{16,97,121} +{83,135,137} +{8,140,160,164,165,195} +{38,65,185} +{112,152} +{111,129,134,148} +{47,80,114,135,147,165} +{24,98,119,123} +{43,48,60,147,154} +{19,54,138,171,186} +{156,175} +{20,51,123,193,193} +{37,41,136,173,192} +{14,22,111,125} +{44,125,160,184} +{19,75,99,103,107,164} +{24,113,145} +{27,157} +{12,107,133,134} +{72,94,102,158,194} +{104,157} +{122,171} +{28,47,89,104,112} +{25,35,82,105,155} +{106,107,139,181} +{50,110,132,136} +{90,110,166} +{1,1,55,60,85,108} +{8,22,31,106,172,196} +{24,69,109,121,154} +{0,26,44,59,132,175} +{103,125,172,188,190} +{11,23,78,109,131} +{81,146,169,181,196} +{2,84,113,189} +{8,46,126,131} +{13,73,73,125,127} +{67,117,139,184} +{29,65,77,120,182} +{0,87,100,102,135} +{111,146,156} +{13,87,123,137,182,197} +{60,61,164} +{7,20,186} +{0,24,53,135,147} +{94,136} +{47,168} +{70,80} +{43,148} +{3,81,104,191} +{104,171,189} +{9,14,117,160,180} +{67,158} +{50,57,66,78,170,197} +{31,60,73,101,193,197} +{37,89,92,96,127} +{29,179} +{17,47,137,155,157,187} +{33,77,154} +{48,63,85,150,184} +{32,53,61,95,172} +{20,35,47,171,179,196} +{2,17,40,169,184} +{116,127,131,142} +{16,26,27,87,164,198} +{58,129} +{67,98,108,132,157,197} +{145,157} +{13,49,56} +{59,103,180,196} +{35,65,104,106,120,126} +{18,96,115,133} +{27,61} +{61,194,197} +{11,27,36,94} +{15,36,101,128,197} +{51,62,115,149} +{83,198} +{30,120,127,145,184} +{50,149} +{13,35,87,117,135,158} +{57,60,74,113,128,178} +{11,90,123,163,170} +{39,121,148,171,198,199} +{30,77,78,137,140,162} +{52,69,120,141} +{9,100,137} +{56,161} +{44,57,75,110,154} +{98,123,155,167} +{10,60,85,105,164,168} +{13,92,179,186} +{13,171,173,176,178} +{33,53,88,123,144,172} +{21,57,70,131,151} +{13,51,63,169,169} +{36,104,119,166} +{54,59,84,166,172} +{7,87,100,102,142,187} +{2,5,6,43,174} +{4,26,29,59,77} +{10,82,98,103,104} +{104,147} +{47,55,99} +{102,154,165} +{0,96,107,139,157,159} +{66,167,174} +{92,97,117} +{21,75,180,185} +{54,64,139,180} +{23,141,189} +{32,38,147} +{82,87} +{6,34,34,161,183} +{25,64,69,97,122} +{80,152,170,189} +{44,78,143,162} +{52,53,64,69,112,158} +{77,80,123,150,175} +{110,121,125,125,128,198} +{0,8,57,104,127,188} +{17,46,48,93,129,150} +{135,193} +{89,111,135,166,184} +{132,181} +{47,54,101,108,125} +{18,55,103,142} +{11,125} +{18,49,58,68,122,153} +{37,47,137,179,185} +{57,78,167,187,192} +{28,32,38,67,77,184} +{67,83} +{43,104,191} +{22,40,118,194} +{24,53,66,195} +{27,87,89,101,130,191} +{71,86,157,167,183} +{31,87,102} +{48,53,70,101,149,174} +{21,33,59,129,195} +{144,160} +{4,8,174,194} +{69,103,127,127,160} +{6,29,62,77,132} +{61,69,108,144,174} +{51,55,109,128,153} +{10,30} +{2,5,6,70,146,183} +{0,1,75,97,166,180} +{53,78,104} +{31,45,68,108,161} +{3,40,78,103,109,130} +{33,44,159} +{28,82,93,136,148,157} +{31,32,76,143,157} +{2,55,106} +{21,66,80,129,129,152} +{1,34,59,128,154,195} +{10,154,172,177} +{2,7,31,47,82,125} +{60,131,149,156} +{20,141} +{23,38,43,100} +{51,70} +{3,41,164} +{126,160,165,169} +{61,71,143} +{65,70,81,100,146} +{40,48,57,75,85,85} +{116,153} 
+{31,42,49,103,183} +{28,44,62,85,133,177} +{50,68,164,170} +{4,26,60,87,119,141} +{5,102,160} +{20,129,177} +{98,120,135,157,164,168} +{66,150} +{101,101} +{164,187} +{43,65,96,166,189} +{18,36,58,109,118} +{25,32,135,161,170} +{55,104,183} +{69,139,144,181,182} +{84,131,155} +{6,18,63,156,159} +{7,66,67,88} +{8,46,52,95,178} +{58,58,83,119,119,163} +{27,143} +{78,80,122,149,164,176} +{6,83,107,183,198} +{86,199} +{22,74} +{28,62,64,114} +{15,56} +{41,97,139,152,161,161} +{48,192} +{16,62,99,138,155} +{32,84,145} +{108,137} +{93,112,120,155} +{73,117} +{20,26,197} +{4,141} +{110,132} +{95,133,142,152,183,193} +{85,141} +{53,76,86,131} +{5,59,73,74,101,130} +{0,1,64,151,188} +{15,131,131,174} +{80,98,106,187} +{41,102,167,173} +{9,42,133} +{103,110,110,134,175,185} +{168,187} +{42,47,108,121,165,198} +{81,171} +{38,122,123,149} +{16,79} +{45,64,131,176,182,197} +{35,82,87,100,123,196} +{41,52} +{33,68} +{60,140} +{12,41,152} +{54,71} +{88,95,95,146,148,180} +{47,66,89,121,168,182} +{15,70,94,122,137,154} +{42,48,129,162} +{70,151} +{11,55,89,118} +{36,74,121,130,152} +{46,48,52,120,179} +{70,81} +{96,146,183} +{76,78,108,153} +{71,168} +{66,106,108,167} +{22,44,49,85,87,195} +{17,52,143,175} +{86,103} +{16,46,176} +{95,111,162,173,198} +{44,191} +{7,48,107,115,116} +{12,120,141,179,184} +{83,188} +{83,85,86,139,154} +{50,74,89,154,179} +{79,87,120,128,183} +{13,121} +{16,52,62,86,168,199} +{7,16,29,35,157,181} +{23,48,65,119,180} +{10,173} +{7,98,128,143,145,162} +{23,27,88,91,127} +{35,53,56,56,118} +{7,161} +{0,42,67,174} +{44,161} +{75,80,110,170} +{17,93,117,140,168,196} +{18,100,150,184} +{108,132} +{54,90,97,103,149} +{9,12,30,43,82,95} +{131,163} +{67,99,168} +{91,150,172} +{47,164,195} +{72,90,98} +{24,78,130,172} +{1,27,32,64,66,156} +{7,26,72,88,130,140} +{56,126,130} +{1,76,81,122,169,188} +{60,154} +{101,103,135,150} +{22,25,33} +{99,117} +{24,95,122,186} +{48,95,102,108,125,170} +{13,113,154} +{155,177} +{37,73,106} +{7,64,124,195} +{101,124,133,157,166,188} +{27,34,60,100} +{26,104,163} +{34,43,108,133,165} +{64,79,89,122,132} +{10,96,168} +{2,22,89,118,122,198} +{122,192} +{42,101,104,135,168,181} +{7,38,63,86,101,152} +{29,84,89,114,123,184} +{33,46,59,137,153,175} +{3,54,66,92} +{31,34,148,159,185} +{3,52,97,99} +{3,26} +{42,57,62,148,199} +{15,26,198} +{14,34,109,111,128,193} +{107,197} +{16,107} +{9,21,136,169} +{67,97,99,153,165,173} +{46,76,89,100,164} +{96,102,150,167,180} +{31,103,137,146,180} +{21,40,157,163,170,183} +{139,170} +{1,75,82,148,169,198} +{13,39,107} +{13,50,97,101,106} +{52,176} +{18,169} +{129,140,146,183,189} +{95,122,145} +{5,6,102,130,151} +{5,118,140,153} +{27,78,140,164,182} +{36,140,148} +{58,100,127} +{9,16} +{26,33,119} +{1,17,18,165} +{14,182} +{11,13,48,89,140,165} +{9,19,78,113} +{121,171} +{18,23,46,113,159,162} +{17,104} +{50,104,132,167,179} +{55,89,102,132,176} +{19,109} +{60,70,73,153,163} +{18,127,145} +{80,106,146,170} +{10,39,72,74,84,150} +{3,71} +{1,10,64} +{82,95,127,132,141,152} +{43,55,57,89,120,197} +{155,182} +{23,34,57,111,153} +{99,188} +{86,114,124} +{113,191} +{31,129,184} +{125,159,159} +{22,27,81,156} +{3,54,80,122,128,168} +{76,112} +{152,174} +{22,27,70,172} +{26,86} +{49,59,102,186} +{53,55,75,125} +{152,199} +{11,15,46,102,105,168} +{132,148,154} +{24,114,121,126,138,165} +{82,107} +{36,93,122,184,194} +{1,59,76,146} +{73,165} +{38,98,176} +{53,72,121,153} +{127,147} +{31,77,128,177} +{107,186,189} +{119,126,127,160} +{24,74,148,197} +{85,126,134,146} +{76,77,81,134} +{67,112,159,174,183} +{22,169,170} 
+{79,112,177,199} +{1,56} +{21,42,50,172} +{6,63,105,166,189} +{31,95,106,152,171,177} +{21,49,99,101,122,187} +{63,104,113,161,186} +{37,126,144,166,173} +{32,53,147} +{123,123,130} +{78,85,177} +{2,69,95,146,187} +{6,11,14,43,121} +{76,105,184} +{63,96,114,122,195} +{11,22,34,45,120,156} +{22,83,119,131,138,167} +{9,56,96,106,114} +{92,132,162} +{25,45,83,119,139,150} +{19,21,56,59,141} +{14,26,62,119,180,190} +{6,34,49,99,139,170} +{10,56,150,166,166} +{14,57,119,153,167,198} +{26,41,150,158,169} +{152,167} +{1,61,93,180} +{46,110,138,199} +{4,56,81,110,173} +{28,32,148,185} +{8,9,28,29,39,195} +{14,39,68,144} +{26,37,79,81,110} +{115,158,161} +{6,39,145,191} +{67,118,125,142,184,198} +{127,163} +{52,118} +{22,78,131,156} +{46,68,86,142,145,197} +{85,188} +{37,54,64,147,158} +{31,134,141,183,185} +{10,33,135,198} +{41,124,173,180} +{0,14,92,129,154,198} +{39,73,128,154,182,196} +{40,83,94,168} +{106,142} +{76,99} +{19,62,77,108,165,186} +{68,90,97,119,176} +{44,108,193} +{2,124} +{137,174,175,176,180} +{28,62,81,132,165,186} +{98,112,148,181} +{86,125} +{70,161} +{5,13,188} +{136,168} +{82,87} +{30,42,57} +{132,136,152} +{20,59,87,98,195} +{6,53,112,113,183,195} +{64,147,157} +{61,140,192} +{44,59,88,123,161} +{90,175} +{38,46,105,121,159} +{35,62,66,90,155} +{2,2,21,38} +{123,144} +{117,155} +{60,86} +{4,39,129,146,179} +{66,71,87,135,148,157} +{29,67,108,196} +{30,64,76,124,172} +{36,39,79,130,140,149} +{30,44,136,196} +{5,15,20,117,198} +{20,87,87,121} +{42,136,142,148} +{0,56} +{16,38,56,57} +{52,138} +{103,115} +{10,29,43,93,120,134} +{44,140,150,180} +{74,98,132,160} +{2,62,98,160} +{14,32,43,63,92} +{23,87,128,152,177,197} +{30,86,111,178,180} +{49,61,114,195,196} +{133,158,195} +{18,105,165,190} +{77,83,175} +{29,33,51,166,188} +{37,51,96,103,127} +{119,125,128,140} +{8,80,93,189} +{76,96,110,131,170} +{81,90} +{13,25,28,41,128,142} +{56,62,73,110} +{60,62,128,136,166,193} +{34,34,61,74} +{32,84,87,92,112,181} +{10,66,93,153} +{23,77,182} +{2,7,156} +{5,13,49,61,103,179} +{67,136,136,163,181,196} +{26,60,74,100,160} +{39,59,69,93,111} +{9,77,90} +{1,20,52,75,156,169} +{25,95,103,157,163,193} +{95,136} +{47,108,137,157,164} +{37,99,151,153,169,189} +{112,126,139,171,184,195} +{39,188} +{4,20,71,80,136,156} +{24,33,77,82} +{103,188} +{74,116} +{82,90,110,154,194,195} +{25,149,180} +{120,123,130,171} +{20,38,104,126,175,176} +{14,62,97,130,135,193} +{35,118} +{20,42,64,73,76,120} +{11,40,60,74,144,148} +{13,26,46,63,76} +{24,29,98,106} +{6,139,171,186} +{5,109,197} +{20,45,84,125} +{1,137,150,195} +{1,8,80,111} +{57,90,102,167} +{53,186} +{8,31,115,145,156,165} +{10,18,31,116,164} +{43,47} +{33,143,154} +{106,153,174,190} +{73,106,158} +{18,137,158,173} +{73,80,107,123,141,199} +{17,43,123,130,130,155} +{15,31,37,91,164,181} +{38,86} +{49,105,142,145,173,190} +{18,107,108,135,138} +{43,65,107,112,193} +{8,68,68,74} +{54,106,108,109,164} +{53,153} +{59,134,154,173,180} +{34,93} +{11,33,124} +{8,104} +{27,37,46,65,125,174} +{0,122,189} +{15,74,107,147,188} +{35,63,78} +{28,49,123,129,177,193} +{11,89,104} +{117,171,197} +{11,15,62,136,145,145} +{2,127,193} +{17,28,42,113,145} +{31,44,118,148} +{52,103,128,161,182} +{45,47,70,102,161,184} +{15,52,82,86} +{60,87,102,108,127,170} +{24,57,102,145,181} +{12,53} +{5,52,92,129,164} +{87,128} +{80,143,170} +{59,85,134,139} +{61,67,110,117,156,157} +{6,8,60,112,154,170} +{92,122,133} +{121,148,161} +{9,22,61,187} +{12,40,78,107,176} +{30,45,58,189,198} +{83,107,123,148} +{3,66,98,124,126,150} +{13,34} +{16,41,132} +{16,85} +{3,25} 
+{30,58,138,167} +{24,36,87,151,159,186} +{2,4,121,196} +{79,95,99,107} +{11,49,146,169} +{51,90} +{76,155} +{26,26,116,120,146,182} +{44,66,72,117,132,174} +{7,161,179,197} +{2,81,158} +{4,22,59,107,146,170} +{0,0,133,192} +{57,82} +{17,61} +{28,29,42,77,89,124} +{53,78,127,188} +{31,57,103,104,162} +{9,84,100} +{3,52,114,133,161,188} +{8,37,97,158,189} +{0,13,88} +{29,79,92,158,160,171} +{59,63,77,139,165} +{25,77,116,169} +{50,88,151,166} +{52,162,167} +{32,149,191,194,194} +{47,57,74,95,97} +{30,65,96,153,184} +{80,130,150,172} +{79,91,141,153,157} +{93,110,114,194} +{62,66,156,175} +{55,56,97,117} +{74,152,171,186} +{13,24,50,50,131} +{0,16,95,141,146,161} +{1,51,158} +{37,71,96,122} +{71,104,145} +{47,52,124,131,169} +{111,188} +{59,61,95,152,156,157} +{5,31,106,164,176} +{44,82,113,134,188} +{13,55,65,99,150} +{25,73,130,192} +{88,120,193} +{79,123,153,175} +{24,158,162} +{52,53,81} +{5,32,78,102} +{73,97,111,151} +{71,72,102,151} +{5,61,73,85,129,151} +{66,177} +{26,77,139,152} +{46,117} +{55,72,122,148,157,174} +{3,53,76,184,196} +{34,36,41,61,194} +{8,153,163,182} +{51,59} +{113,115,149} +{54,57,78} +{39,137} +{75,81,93} +{5,30,44,80,86,126} +{68,107,128,160,179} +{98,108,162} +{55,126} +{24,54,121,122} +{75,90} +{10,83,139} +{16,120,148} +{97,175} +{53,70,71,120,135,189} +{9,110,123,150} +{24,42,44,96,138,170} +{17,61} +{23,65,110,135,155,157} +{19,59,139} +{50,65,127,179} +{15,138,152,162} +{15,34} +{25,29,63,135,161} +{47,113,123,129,163} +{25,138,157,184} +{50,92,199} +{110,116} +{15,36,134,145,165,182} +{4,75,82,175} +{24,49,63,89,128} +{174,182} +{103,116,119} +{101,125,180,192} +{47,66,113,127,148} +{15,60,118} +{20,51,90,91,117} +{25,72,146,199} +{34,93,199} +{31,71,106,115,186} +{1,10,119,144,188,197} +{49,80,185} +{134,178,188} +{42,67,170,172} +{13,43,91,91} +{13,31,48,98,155,158} +{37,44,70,76,141,160} +{50,60,72} +{51,65,166,188} +{11,103,129,144} +{136,167,181} +{165,178} +{34,107} +{54,120} +{33,132,136,165,178} +{60,79,119,127,187,197} +{27,31,130,132} +{125,129} +{97,111} +{71,171,187,191} +{68,91,94} +{94,119,159,178} +{2,29,51,173} +{37,61,97,113,147} +{11,35,79,91} +{67,71} +{4,20,103,107,169,179} +{35,77} +{71,94} +{29,31,67,101,172,174} +{52,122} +{87,125} +{129,142,164} +{13,30,85,139} +{17,57,65,170,179} +{46,65,151,167,192,197} +{31,78,132,136,158} +{38,161} +{15,101,111,134} +{42,118,139,142,178} +{57,95,132,134} +{5,42,116,152,173,192} +{144,199} +{38,70,77,143,175,188} +{38,84,93,149} +{56,98,153,165,170,191} +{1,52,112,112,131,145} +{16,132,150,184} +{14,60,111,153} +{49,109,112,165} +{69,136,152} +{59,90,94,158,168} +{42,47} +{18,194} +{33,70,94,167,175,177} +{40,57,125,138,159} +{3,10,31} +{2,5,8,26,141,181} +{27,29,142,175,186,195} +{31,49,99,120} +{109,123} +{21,76,112,119,124} +{41,49,146,173} +{101,173} +{49,73,85,89,179} +{22,36,154,192} +{136,163} +{111,165} +{94,128} +{81,167} +{35,165} +{41,109,119} +{13,74,80,114} +{72,106,189} +{65,172} +{30,31,35,52,63} +{80,116} +{0,149} +{139,189} +{0,65,107,153,179} +{15,40,46,51,75,160} +{12,28,48,79,105} +{76,98,146,157,180} +{45,62,79,83,113,155} +{130,162,184} +{78,140,145,181,196,198} +{108,168} +{3,13,14,15,77} +{22,29,68,117,142,143} +{67,110,122,167,183} +{22,25,58,93,143,151} +{53,82,170} +{1,18,50,98,108,174} +{58,140} +{49,179,196} +{109,171} +{38,82,132,183} +{32,151,175} +{53,90,106,169,187} +{99,136,141,146,171} +{27,108,111,155,192} +{28,77,86} +{11,109,118,149,154,183} +{7,74,122,137,185} +{70,110,151,154,175} +{7,48,88,181,181,182} +{97,101,105,123,139,156} +{19,139} +{17,107,134} 
+{63,64,178} +{100,133,143} +{64,173} +{1,88,109,120,145,160} +{113,198} +{84,112,121,184} +{90,185,193} +{91,135,155,185} +{56,191} +{14,15,48,61,92,171} +{18,139,152,199} +{16,80,107,125,144,166} +{8,92,112,173,176} +{27,196} +{9,169,183,190} +{20,29,40,98,106,182} +{77,115,149,181} +{31,65} +{7,29,62,90,157,178} +{10,33,79,186} +{42,74,113,178,192} +{17,86,88,118} +{27,58,104,122,166} +{16,97,102,105,192} +{16,59,115,127} +{27,56,60} +{104,175} +{52,84} +{127,137} +{7,13,18,81,139,140} +{11,31,81,150,189} +{44,55,107} +{45,58,127,137} +{70,76,80,93,145} +{27,60} +{40,76,172} +{7,123,192} +{55,170} +{61,137,137,184,187} +{49,50,190} +{99,126,152,164} +{56,79,88,98,132} +{45,74,119,123,158,175} +{66,96} +{100,114} +{62,84,111,122} +{8,22,141,172,181} +{70,141} +{3,48,106,193} +{33,114,168,174,183} +{46,186,194} +{58,71,82,122,190} +{60,67} +{14,30,132,144,174} +{9,113,124} +{11,14,29,63,110,182} +{4,64,102,168,178} +{90,108,110,160,165,199} +{44,86,191} +{6,19,84,125,125,156} +{53,105,122,154,175,190} +{83,177,183} +{96,103,181} +{38,156} +{2,6,60,116,131} +{12,144} +{13,73,93,132} +{142,167} +{37,61,71,75,121,144} +{32,43,146} +{41,59,144,176} +{11,14,44,54,92,177} +{37,198} +{39,80,81,104,138,193} +{13,73,92,127,149,194} +{34,57,69,104,118,186} +{7,48,84,96,108} +{32,41,64,111} +{108,131,150,174,195} +{50,53,184,191} +{8,32} +{26,76,88} +{4,50,100,134,134} +{36,40,148,158,177} +{7,16,57,59} +{35,96,113,129,167} +{46,63,128,163} +{8,46,94,97,105,178} +{12,70} +{45,93,134,135,188,195} +{11,52,76,103,131,192} +{19,45,57,119,123,136} +{19,62} +{1,49,64,197} +{0,42,60,102,134,147} +{102,152,156,160} +{51,54,129} +{50,68,71,72,170} +{0,11,184} +{19,105} +{144,185,191} +{17,51,76,98,118,135} +{52,64,143,171} +{1,46,62,74,81} +{8,36,129} +{5,25,96,113,146,152} +{19,28,59,110,131,142} +{7,18,176,179} +{17,21,48,63,121} +{34,79,81,85,152,155} +{8,82,104,122,139,193} +{34,50,128,140,175} +{51,173} +{48,128,138} +{126,129,178} +{42,51,61,141,170,180} +{59,91,144} +{64,74,118,170,191} +{12,55,116,157,159} +{97,157} +{32,34,102,105,178} +{36,103,125} +{15,36,184} +{6,13} +{0,100,144,185,198} +{32,47,64,66,118,143} +{23,112,117} +{34,44,47,81,124,135} +{21,49,115} +{29,158} +{34,114,127,151} +{111,199} +{23,53,76,113,122,123} +{89,113,117,137} +{52,76,126,155,164} +{4,48,78,114,147,179} +{27,56,151,191} +{3,183} +{30,41,72,145} +{15,41,152,177,196} +{44,58,124,164,177} +{9,51,70,174} +{13,18,81,136,178} +{85,139,142} +{12,62,118,156} +{50,142,149,175} +{35,38,99,100,128} +{53,54,92,123,153,160} +{121,133} +{12,63,117,148,149,187} +{88,153,170,192,195} +{22,51,67,104,141} +{186,198} +{39,40,82,159,189} +{59,74,149} +{88,99,136,145,191} +{5,48,90,120,138,193} +{22,76,155,180} +{118,122,141,176} +{87,104,116,159,172,191} +{63,104,155} +{8,153,168} +{119,141,178,179} +{100,110} +{14,65,164} +{2,92,97,117,188} +{47,59,64,141,148,187} +{109,137,139,151,169} +{68,78,156} +{37,39,103,183,190,194} +{50,58,74,180} +{12,121,155,175} +{26,43,97} +{102,159,161} +{3,138,163,179} +{55,69,78,164} +{67,87,136} +{67,150} +{74,113,199} +{103,126,187} +{39,141,155} +{6,19,25,75,157} +{10,49,71,105,114,154} +{3,24,35,54,88} +{16,25,73,114,181,191} +{2,2,63,154} +{68,74,107,187,199} +{13,235} +{40,122,203,232,233,235} +{115,152,193,202,242} +{3,50,86,111,248} +{25,66,181,188,279} +{80,116} +{38,83,106,119,134} +{29,63,203} +{7,27,186,200,201} +{88,92,94,272,295} +{35,68,136,158} +{148,225,293} +{1,87,195} +{48,100,203} +{0,35,61,91,280} +{130,160,168,216} +{4,104,148,164} +{35,40,91,145,155,214} +{46,107} +{21,276} 
+{42,143,150,238,291} +{64,70,140,165,228,257} +{0,148,197} +{72,131,195,202,251,270} +{99,195,224,264,292} +{5,184,186,243} +{93,132,137,148,228,251} +{66,125,198,211,285} +{29,79,180} +{41,60,65,66,254} +{4,69,79,207} +{113,182,203,252,259,298} +{10,20} +{99,200,276} +{109,262} +{4,87,193,207,220,273} +{30,183,215} +{7,138,202,215,217} +{25,79,194,203,260} +{128,178} +{62,152,211,279} +{57,99,204,280} +{41,59} +{18,52,200} +{81,132,190,275} +{89,158} +{32,72,122,228,245,249} +{24,72,196,233,299} +{0,5,46,122,213} +{197,242} +{43,105,241,272} +{74,118,158,173,208,288} +{145,149,197,238,252,297} +{32,39,189} +{98,240} +{65,140,149,197,203,204} +{103,225,266} +{84,277,283} +{35,246} +{10,101,239} +{40,75,192,253} +{106,152,247,272,287} +{50,293} +{85,134} +{59,204} +{54,64,88,269,286} +{4,92,111,121,154,182} +{80,163,202,234,277,298} +{129,147,158,196,283,290} +{49,144,232,293} +{20,29,226,244,274} +{64,101,185,189,234,268} +{23,157} +{56,93,133} +{9,57,241,289} +{50,124,181,194,238} +{11,38,67,69,213} +{149,220} +{168,189,267} +{34,133,235,264,284} +{81,239,241,260} +{35,78,80,201,262,297} +{0,196,285} +{71,108,239,258,277,278} +{4,94} +{77,132,140,251} +{11,78,132} +{43,145,188} +{97,144,148,161,254} +{109,132} +{48,83,189,242} +{115,176,276} +{162,210} +{88,109,136,153,154,159} +{265,280} +{74,86,195} +{17,112,188,213,231,266} +{36,136,160,218,239} +{179,273} +{79,118,136,154,200,259} +{161,212} +{24,98,178} +{161,187} +{45,169,227,236} +{218,253} +{10,18,74,258} +{70,199,210,213,285,291} +{12,50,69,92,184,186} +{130,131,163,295} +{198,239,297} +{49,86,125,176,234,282} +{7,129,146,223,269} +{144,173} +{30,52,133,228} +{21,88,176} +{5,70,299} +{37,69,285} +{14,17,45,72,99,197} +{125,196} +{30,220} +{55,103,127,251} +{108,114,156,200,207,217} +{7,195,250} +{64,111,193,202,236} +{92,115,232,233,240} +{22,232,260} +{18,44,191,253,294} +{40,158} +{86,92,103,154,207,294} +{33,177,191,223,235} +{65,116,158,253} +{49,125,152,194} +{100,149,191,266,288} +{13,64,103,254,283} +{42,75,80,103,155} +{77,128,198,280} +{118,218,287} +{0,36,52,101,148} +{1,64,181,201,221} +{6,44,47,71,150,225} +{13,85,88,167} +{31,40,69,91,99,281} +{60,115,157,224,252,273} +{30,87,200,270,285} +{171,293} +{24,33} +{59,69,74,118,190,216} +{147,258,288} +{62,73,219,232,266} +{50,74,225,238,271} +{6,88,115,185,205,262} +{97,230} +{76,76,150,211,228,262} +{134,195} +{104,235} +{38,41,204} +{64,71,124} +{44,63,111,231} +{186,188} +{5,132,225} +{113,286} +{43,161,276} +{8,294} +{18,90,105,169} +{213,213} +{29,45,161,289} +{79,152} +{10,110,162,177,217,238} +{63,98,192,244} +{118,147,187,281} +{5,15,36,94,263} +{40,81,220} +{29,74,76,157,162,206} +{11,28,53,68,126,222} +{73,73,181,239} +{36,60,164} +{16,47,82,152,167,289} +{149,149,219,268,294} +{97,169} +{32,160,210,257} +{32,69} +{7,63,73,195} +{54,110} +{61,75,135,270} +{22,43,127,174,184,208} +{106,113,174} +{0,70,90,239} +{191,260} +{43,80,168} +{25,54,257,263} +{118,213} +{110,207,220,251,287} +{126,139,161,248,252} +{51,79,116,132,190,291} +{183,199,200,254} +{86,233} +{105,109,176,211} +{12,109} +{3,65,158} +{21,86} +{12,15,191} +{181,223,224,256,259,276} +{112,191,219,232,239} +{51,215} +{36,46,278} +{68,75,169,228,244,270} +{10,16,52,172,189,274} +{177,191,197,209,222,282} +{41,119,190,202} +{128,277,292,298} +{34,38} +{22,36,81,117} +{81,161,248,279} +{75,85,103,149,190,211} +{127,279} +{50,74,152} +{122,168,209,240,276,282} +{66,102,208,239,291} +{9,113} +{72,199,237} +{110,112,135,141,270} +{26,109,130,159,291} +{108,206} +{2,289} +{63,238} +{4,57,104,119,142,214} 
+{46,97,239} +{210,297} +{207,268} +{13,64,80} +{62,109,171,195,232} +{11,260,262,276,292} +{21,75,78,80,140,226} +{38,56} +{122,251,297} +{108,180,213} +{57,58,135,231,233} +{75,136,185,211} +{52,109,122,174,178,255} +{65,91,234,249} +{5,24,53,218} +{90,211,246} +{106,242,260} +{61,136} +{49,87,177,280} +{38,89,104,189,297} +{43,76,293,298} +{182,255,289} +{25,57,64,272} +{23,122,149} +{49,50,129,153} +{183,188,204} +{27,164,226,230} +{0,13,67,145,160,259} +{22,32,43,85,105,124} +{20,199} +{31,119} +{14,16,152,158,196} +{5,59,91,202,217,280} +{100,128,187} +{20,193,214,258,272} +{17,27,55,151,177,219} +{53,55,63,208,213,230} +{15,160,258,260} +{71,147,235,258} +{26,49,173,234,271} +{50,52,58,167,257} +{15,154,213,232} +{6,35,86,94,286} +{0,4,83,262,281} +{93,148,284} +{28,165,262,290} +{18,99,160,266} +{63,223,291,295} +{103,154,180} +{12,110,144,221} +{9,158,203} +{20,207,275} +{9,20,48,88,120,289} +{67,110,133,151,225,297} +{71,102} +{168,208} +{48,137,163,164,280,287} +{90,209} +{28,244} +{107,224,293} +{86,206} +{8,113,147,165,285,286} +{7,159,160,237} +{0,66,87,146,225,294} +{58,100,112,124,189} +{13,108} +{121,168,216,253} +{147,242,282} +{236,240} +{21,28,83,103,166} +{30,88,108,280,295} +{23,136,298} +{125,290} +{140,249,276,277} +{49,81,135,147,164,267} +{28,63,198,297} +{30,101,216,232,267,287} +{54,195,204,223,236,251} +{27,176,179,204,264,291} +{136,164,172,273} +{43,67,81,121,277} +{128,131,256,269} +{176,219,289} +{127,175,259} +{35,94,153,177,222,253} +{29,154,178,240,260} +{165,176,201,243,259} +{17,298} +{29,203,232,241,289} +{107,136,153,238} +{49,198} +{68,179,202,253} +{157,178} +{23,199,287} +{131,228} +{19,19,39,111,138,277} +{49,86,178,194,223,226} +{114,201} +{149,282} +{109,147,150,176,209,229} +{122,131,167,228,258} +{5,40,120,154,266} +{135,207,238,263} +{75,128} +{80,117,296} +{60,82,122,131,138} +{57,146,159,233,244,278} +{15,80,157,182,244,272} +{114,116,160,176,287} +{10,133,279} +{27,115,126,293} +{89,161} +{95,120,218} +{26,269} +{109,281} +{53,62,103,107,118,239} +{185,186,227,252} +{3,125,146,161,288} +{171,245,256,283} +{23,153,201,238} +{0,82,93,218,242} +{101,124,137,150,194} +{21,96,104,201,244,266} +{88,121,147,155,173,225} +{24,106,112,193} +{26,67,115,212,283} +{23,120,280} +{45,99} +{30,66,136,199} +{17,213} +{14,37,55,103,265} +{52,258,284} +{119,213,272,274,285} +{43,45,105,254,288} +{64,81,123,126,164,292} +{88,229,260} +{25,117} +{7,149,197,227,258} +{74,83,240,246,284,292} +{2,4,63,103,115,289} +{92,239} +{12,26,130,228,265} +{53,99,131,142,164,291} +{63,248,259,283} +{186,215,282} +{67,110,160} +{166,191} +{33,156,224} +{152,166,190,250,297} +{123,126,153,199,204} +{49,70,199,238,238,289} +{14,18,65,74,146,235} +{63,77,172,180,186,225} +{1,48,105,170} +{37,56,113,133,196} +{193,261,266} +{190,273} +{38,129,261} +{251,252,253,254,275,296} +{249,275} +{167,205,266} +{27,152,256} +{19,72,248} +{40,73,141,249} +{105,197} +{156,243,277,282} +{165,168,227,298} +{8,31,202,271} +{10,101,109,167,236,277} +{33,91,165,192,206,211} +{102,122,232} +{190,239,283} +{160,185} +{2,13,65,70} +{11,68,170,192,229,284} +{66,90,228,237} +{1,6,92,99,222,242} +{42,128,133,207,289} +{12,100,164,191} +{26,31,120,176,204,220} +{13,39,95,105,120,182} +{114,120,295} +{31,34,55,181,197,235} +{24,52,64,80,142} +{3,49,148,255,268} +{132,175,254} +{32,71,141} +{112,116,186,270,271} +{64,106,209,228,297} +{128,268} +{107,208,299} +{151,173,187,192,213} +{3,296} +{20,31,135,153,289} +{138,193,212,269,277,288} +{73,92,130,295} +{73,80,105} +{50,96,138,199,265} 
+{4,7,8,183,260,267} +{66,71,118,145} +{15,63,116,160,175,181} +{88,217} +{56,69,106,106,127,274} +{84,205} +{83,101,241,269} +{21,254} +{22,32,83,150,293} +{198,221} +{30,46,95,179,197} +{46,85,208} +{56,112,236} +{71,217} +{31,57,145,253} +{34,133,170} +{48,53,119,187,268,287} +{111,203,229,239} +{62,136} +{49,54,187,254,298} +{20,26,148,159,190,286} +{3,13,193,252,284} +{40,137,154,167,248,259} +{3,47,242,278} +{77,100,143,232} +{51,130} +{66,90,148,220,242,273} +{143,151,211} +{10,23} +{21,30,179} +{17,47,105,156,193,213} +{0,23,25,125,144,146} +{179,209} +{79,113,117,192} +{5,53,216,275,285} +{187,197} +{22,68,218,221} +{0,71,78,110,120,173} +{46,97,117,149,253,286} +{10,20,129,162,171,195} +{60,97,130,163,190} +{57,145,179,283} +{99,274} +{151,161,228,251} +{3,177,192,286} +{21,81,142} +{180,283} +{13,102,131,149,246} +{19,99,132,162,167,257} +{15,86,188,260} +{203,251,281} +{5,45,138,155,157} +{1,2,4,213,278} +{21,123,208,219,263,267} +{36,106,181,231,238} +{103,120,168,184,224,287} +{53,104,139,251} +{1,91,141,202,268} +{75,115,216,253} +{56,167,268,296} +{66,158,235,249} +{82,124,198} +{56,67,112,140,170,176} +{16,75,266} +{38,165,200,219,291,297} +{86,151,229,241,275} +{0,57,141,176,229,258} +{18,72,164,195,235} +{94,282} +{83,139,242,269,294} +{9,44,145,251,272} +{132,203,249,282} +{7,41,170,254} +{6,153,193,291} +{18,134,137,227,261} +{14,36,115,124,172,229} +{54,206} +{49,91,131,185,204} +{7,242} +{41,57,161} +{93,224,241,288} +{119,288} +{90,99,117,196,296} +{67,85,154} +{147,169,216,264} +{79,92,164} +{19,120,132,197,267} +{76,264} +{30,133} +{27,37,93,138,218} +{152,155,244} +{41,149,182,259} +{29,178,224} +{115,201,268} +{141,166,253,282} +{3,65,125,245,264} +{6,150,159,202,206,277} +{217,276} +{28,96,144,193} +{7,59,190} +{144,217} +{10,79,96,100,126,222} +{7,61,253} +{14,69,263} +{3,30,63,125,186,277} +{2,10,79,100,223} +{131,131,239} +{116,195,199,240} +{87,99,158} +{52,180} +{7,12,140,208,275} +{65,67,83,280} +{4,52,125,126,137,176} +{9,48,79,203,217,243} +{43,206,251} +{19,112,196,263,266} +{29,70,256} +{161,236,258} +{8,25,42,97,291} +{63,144,242,271} +{7,17} +{1,85,250} +{104,244,250} +{18,22,31,99,266,281} +{51,138,237,268,288} +{8,40,91,221,273} +{0,176,230,249,254,255} +{44,140,176,194,197} +{56,197,264} +{229,246,283} +{53,128,173,233,282} +{45,193,221} +{21,80,286} +{4,18,267} +{15,97,220} +{62,70,83,147,149,244} +{120,134,159,174,250} +{116,269} +{23,108} +{10,91,239} +{7,128,142,243,286} +{134,201,245,275,278} +{13,208,227,288} +{30,78,85} +{107,179} +{31,59,153,217,240,298} +{27,130,233,282,286} +{15,59,136,262} +{85,186,233} +{10,152,165,181,181} +{137,183} +{40,56,125,256,265,280} +{12,22,120,183} +{62,229} +{38,59,81,113,261} +{67,194,229} +{7,173} +{37,43,296} +{59,162,285} +{171,200,213,213} +{116,123,209,234,277} +{52,175} +{189,213} +{30,94,99,228,238} +{46,101,154,260,272,274} +{30,32,59} +{65,172,292} +{18,22,131,170,271} +{2,53,88,104,264,265} +{60,194,288} +{15,108,121,161,201} +{40,85,173,195,201,221} +{54,86,107,174,287} +{20,71,190,227} +{16,46,66,175,197,252} +{130,243,252,282} +{142,219,266,272} +{14,202,204,231,241,276} +{161,172,212,222} +{15,183,275} +{83,270} +{67,204} +{65,184,264} +{73,119,183,190,242} +{53,287} +{24,171} +{72,220,220} +{101,136,176,204,224,280} +{39,47,282} +{106,162,238,252} +{23,242,247,265} +{98,108,189,209,273} +{122,245,270} +{109,127,128,244,299} +{41,162,186,191} +{60,196} +{0,123,129,213,248} +{29,79,89,91} +{172,298} +{122,140,162,228,263,268} +{2,116,247,294} +{6,138} +{17,98,287} +{53,166,187,219,248,296} 
+{15,26,90,175,196} +{184,193,198} +{17,69,76,105,183,264} +{56,101,110} +{15,108,139,168,272} +{5,71,104,141} +{136,179} +{72,189} +{54,79,208} +{98,113,150,184,190,246} +{37,69,132,210,285} +{1,29,45,74,109,145} +{11,72,133,149,216} +{34,57,84,212,280} +{131,211,294} +{70,84,173} +{193,213,230,266,285,299} +{57,94,163,182,227} +{44,133,143} +{31,32,211} +{130,142,165,188,194,231} +{52,61,139,226,239,287} +{7,103,157} +{155,224,230} +{127,135,139} +{77,237,294} +{10,213,278} +{28,90,185,274} +{59,105,282,297} +{39,128,174,268} +{32,158,215} +{24,145,189,213,278} +{78,148,230,263} +{42,68,93,160,287,299} +{4,12,70,91,191,237} +{20,294} +{45,53,77,113,211,240} +{232,237} +{125,152,284} +{58,81,155,215,296} +{4,8,44} +{1,52,102,128,184,218} +{185,199,226,299} +{10,178,262,285} +{80,95,230,240,266} +{4,5,213} +{156,187,271,298} +{88,298} +{109,233,290} +{47,65,91,105,249,269} +{97,129} +{46,92,207} +{2,163,249,259,291} +{89,102,140,158,231} +{162,184,283} +{36,213} +{163,259} +{47,220,250} +{37,89,105,124,143,198} +{3,71} +{142,165,190,256,269,269} +{152,256} +{27,49,191,198,220,285} +{71,73,87,189,260} +{11,54,90,106,130,216} +{193,245,252} +{2,8,57,91,163,184} +{18,171,283} +{28,41,110,112} +{5,57} +{137,262,285} +{19,57,156,229,269} +{138,179,190,199,281} +{35,98,196,242} +{122,152} +{83,132,181,212,280,288} +{219,298} +{57,88,103} +{5,203} +{98,156,266} +{10,45,72,169,211} +{45,101,156,214,269} +{68,73,81} +{16,127,259} +{9,32,246} +{66,173,261,261,274} +{17,115,157,169,251} +{49,158} +{25,37} +{2,73,103,178,194,236} +{238,269,273} +{162,178,276} +{48,52,160,237,288} +{54,82,130,135,169,275} +{29,142} +{205,249,253,275,291} +{60,76,84,115,126} +{48,108,153,213,231} +{23,124,175,210,226,293} +{9,181} +{20,99,112,166,201,242} +{102,150,201} +{41,98,240,244,260} +{7,44,98,293} +{0,125,177,283} +{28,118,124,148,241,290} +{73,91,122} +{9,72,109,130,202,290} +{70,111,120,160,216,262} +{59,175,296} +{2,201} +{83,297} +{76,293} +{83,127,136,242,275,285} +{169,190,195} +{83,122,186,189,217,229} +{98,210,229} +{117,133} +{74,294} +{6,31,59,143,156,273} +{98,180,241} +{26,52,114,243} +{112,240} +{104,217} +{148,162,259,279} +{92,101,150,226,272,295} +{55,86,118,202,237,275} +{81,203} +{79,126,177,265} +{57,193} +{169,240,244} +{21,171,190,250,263} +{23,37,215,235} +{40,54,240,286} +{105,177,190,276,285} +{44,45,122,151} +{28,31,187} +{127,135,211} +{5,13,150,194,259} +{136,181,280} +{20,147,158,189,200} +{15,83,88,128,169} +{10,14,25,26,150,158} +{42,101,172,205} +{85,185,226,236,271} +{34,127,188,250,268} +{27,143} +{26,48,99,110,117,207} +{22,56,190,269,287} +{200,278} +{70,134,138,204,216,298} +{175,219,297} +{99,273} +{206,216} +{23,214} +{131,140} +{11,140,240} +{73,148} +{7,66,125,210} +{2,61,92} +{0,137} +{143,188,265} +{177,238} +{0,93,163,229} +{35,49} +{8,8,111,144,165} +{99,278} +{21,44,71,224,252,270} +{119,150,175,233,245,294} +{15,87} +{84,211,217,225} +{20,41,87,123,124,299} +{62,120,169} +{37,43,92,175,206,222} +{95,168,180,250,269,296} +{60,228,278,285} +{173,195,232,276} +{1,2,139,256,278} +{51,119} +{212,238,291} +{120,172,292} +{138,279} +{251,261} +{151,181,278,296} +{163,207,220,289,295} +{89,278,290} +{24,137,157,206,271,278} +{7,63,83,89,155,189} +{2,5,172,195,215,260} +{243,281} +{60,125} +{74,87,222,236} +{45,70,159,194} +{69,159,250} +{150,214,296} +{101,158,250} +{56,134} +{57,87,160,167,247,285} +{123,269} +{235,242} +{79,95,115,167,287} +{31,56,132,244,276} +{25,218,241,241} +{57,82,151,170,204} +{69,103,288} +{88,138,154,292} +{14,98,138,227,245,249} +{175,222,274} 
+{38,139,193,208,277} +{79,141} +{5,77,197,209} +{15,37,77,110,116} +{26,226} +{68,93,101,140,233} +{53,96,170,192,290} +{29,89,102,216,220} +{11,85,136,239} +{158,180,195,200,226} +{10,49,118,137,172} +{144,172,183} +{14,176,188,215,272} +{42,97,125} +{114,166} +{52,61,162,171,249} +{140,195,242} +{59,99,233} +{31,76,136,181,187} +{81,112,157,168,271,294} +{8,35,44,48,190,297} +{145,195,201} +{160,248,291} +{94,270,285} +{116,139,225} +{111,131,140} +{158,277} +{59,229,257} +{25,47,99,123,239} +{8,36,205,274,295} +{132,152,178,192,235} +{19,40,96,204} +{7,77} +{211,282} +{26,100,180,244,281,296} +{200,212,286} +{5,94,151,290} +{75,80,128,179,269,269} +{7,111} +{7,26,69,158,269,276} +{7,36,74,94,171,215} +{2,62,65,93,124,271} +{78,96,109,189} +{182,197,280,298} +{17,78,82,85,85,208} +{6,122,155} +{14,33,130} +{1,21,167,169} +{49,85,158,175,213} +{59,194} +{125,132,259,285} +{20,38,81,89,234,274} +{106,140,156,287} +{57,125} +{53,103,158,204,234,267} +{0,49,160,189,235} +{34,115,142,207} +{162,173,181,190,298} +{11,76,116,166,191} +{2,87,99,236,279} +{40,203} +{2,33,39,215,254} +{53,69,83,224,228} +{79,136,183,216,226,227} +{10,109,137,163,240} +{24,126,141} +{69,255} +{103,138,230,246,259,283} +{136,290} +{13,34,78,145,166,242} +{38,74,83,242,294} +{54,248,273} +{107,162} +{50,170,176,191,207,275} +{32,134,166,288,292} +{163,167,186,274,291,296} +{31,86,123,156,160} +{114,133,136,176,281,290} +{105,147,211} +{124,151,179,222,299} +{87,101} +{145,169,181,205,247} +{6,266} +{26,33,52,56,106,116} +{19,21,65,89,104,168} +{164,181,208} +{36,67,92,116,248} +{145,200,247} +{155,215} +{49,212} +{29,57,105,117,131} +{2,13,68,128,139,140} +{193,273,273} +{3,78,105,111,297} +{49,142,244} +{32,259} +{161,205} +{96,146,179,259} +{44,45,211,233} +{56,91,146,166,285} +{87,107,120,262,299} +{76,160,276,297} +{248,266} +{5,12,188,240,247} +{164,206,293} +{15,18,60,163} +{53,134,172,230,287,290} +{117,137,146,153,155} +{72,270} +{171,251} +{80,125,137,141,169} +{52,108,200,219,225,271} +{29,78,106,221} +{21,74,110,273} +{28,88,98,170} +{83,104} +{12,152} +{7,69,143,246,265,269} +{62,106,157,200} +{113,260,272,272,294} +{16,35,80,121,165,176} +{96,154,172,198,263} +{29,53,109,128,129,195} +{131,230,271,273,295,299} +{53,160,208,231} +{23,180,208,249,272} +{45,208,264} +{14,29,169} +{116,147,272} +{7,193,237,271} +{158,198,253} +{41,60,71} +{110,133,200,249} +{24,159,255} +{26,39,61,114,218,229} +{141,286,299} +{74,278} +{67,71,155} +{151,257,284} +{13,28,72,131,206} +{60,152,275,295} +{88,105,184,185} +{85,190,205,256,283,285} +{202,285} +{14,92,160,200,246,279} +{42,95,157,195} +{50,99,224,276} +{32,97,101,122} +{66,85} +{19,146,180,242,269,286} +{24,86,247,274} +{54,264,270,284} +{72,77,85,124,127,285} +{47,249} +{25,73,102,237} +{33,68,84,117,120} +{29,62,172,240,242,273} +{42,140,182,248,261,282} +{118,228,284} +{1,89,158,294} +{29,89,122,155,208,283} +{173,208,229} +{6,22,142,267,299} +{22,122,173,245,293} diff --git a/data/rum_weight.data b/data/rum_weight.data new file mode 100644 index 0000000000..5bce717c1b --- /dev/null +++ b/data/rum_weight.data @@ -0,0 +1,52 @@ +As a reward for your reformation I write to you on this precious sheet.|write +You see I have come to be wonderfully attached to Heidelberg, the|attached come see +beautiful, the quaint, the historically poetic, learned and picturesque| +old town on the Neckar. It seems like another home. So I could not show|seems show could +my appreciation of you in a more complimentary way than by sending this|sending +little series of pictures. 
Have you ever been here, I wonder? You did|did have been wonder +not say, but you wrote as if you knew it by sight as well as by heart.|wrote say knew +As I cannot know, I will venture an explanation. The panorama speaks for|know will speaks +itself. Put on your "specs" and look at the castle, half way up the|put look +_berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it|Look +from the "Terrasse." Thus you'll get something of an idea of it. The|get +Gesprente Thurm is the one that was blown up by the French. The|is blown was +thickness of the walls, twenty-one feet, and the solid masonry, held it|held +so well that only a fragment, as it were, gave way. It still hangs as if|were gave hangs +ready to be replaced. "Das Grosse Fass Gebaude," too, you will have no|be replaced will have +difficulty in making out. If you only had it with its 49,000 gallons of|making had +wine, but wouldn't you divide with your neighbors! The columns in the|wouldn't divide +portico that shows in the Schlosshof are the four brought from|shows are brought +Charlemagne's palace at Ingelheim by the Count Palatine Ludwig, some| +time between 1508-44. The Zum Ritter has nothing to do with the castle,|has do +but is an ancient structure (1592) in the Renaissance style, and one of|is +the few that escaped destruction in 1693. It is a beautiful, highly|escaped is +ornamental building, and I wish you could see it, if you have not seen|wish could see have seen +it.| +| +All the above information, I beg you to believe, I do not intend you|beg believe do intend +to think was evolved from my inner consciousness, but gathered from|think was evolved gathered +the--nearest guide-book!| +| +I am so much obliged to you for mapping out Switzerland to me. I have|am obliged have +been trying my best to get all those "passes" into my brain. Now, thanks|been trying get +to your letter, I have them all in the handiest kind of a bunch. Ariel|have +like, "I'll do my bidding gently," and as surely, if I get there. But|do bidding get +there are dreadful reports of floods and roads caved in and bridges|are caved +swept away and snows and--enough of such exciting items as sets one|swept sets +thinking--"to go or not to go?" We are this far on the way. Reached|thinking go go are Reached +here this afternoon. Have spent the evening sauntering in the gardens,|Have spent sauntering +the Conversationhaus, the bazaar, mingling with the throng, listening to|mingling listening +the band, and comparing what it is with what it was. It was a gay and|comparing was was +curious spectacle, but on the whole had "the banquet-hall deserted"|had deserted +look. The situation is most beautiful. It lies, you know, at the|is lies know +entrance of the Black Forest, among picturesque, thickly-wooded hills,| +in the valley of the Oos, and extends up the slope of some of the hills.|extends +The Oos is a most turbid, turbulent stream; dashes through part of the|is +town with angry, headlong speed. There is an avenue along its bank of|is +oaks, limes and maples, bordered with flower-beds and shrubberies, and| +adorned with fountains and handsome villas. We shall devote to-morrow to| +seeing all there is to be seen, and go to Strassburg to-morrow evening|seeing is be seen go +for two or three days. 
From there to Constance, and then hold _our_| +"Council" as to further movements.| +def fgr| +def xxx fgr| diff --git a/expected/altorder.out b/expected/altorder.out index 9caaaba7db..6c0bcae2ad 100644 --- a/expected/altorder.out +++ b/expected/altorder.out @@ -1,120 +1,183 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * altorder.out - test output for 64-bit systems and + * altorder_1.out - test output for 32-bit systems. + * + */ CREATE TABLE atsts (id int, t tsvector, d timestamp); \copy atsts from 'data/tsts.data' -CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_timestamp_ops, d) +-- PGPRO-2537: We need more data to test rumsort.c with logtape.c +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' +CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) +(36 rows) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 371 | Tue May 17 06:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 
20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; @@ -130,37 +193,37 @@ SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) EXPLAIN (costs off) @@ -177,7 +240,7 @@ SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) EXPLAIN (costs off) @@ -194,9 +257,13 @@ SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN @@ -208,13 +275,13 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (4 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) EXPLAIN (costs off) @@ -228,13 +295,13 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (4 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? 
------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) EXPLAIN (costs off) @@ -250,11 +317,11 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) EXPLAIN (costs off) @@ -269,11 +336,11 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 - 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 (5 rows) EXPLAIN (costs off) @@ -290,15 +357,42 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 
13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(36 rows) EXPLAIN (costs off) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; @@ -314,13 +408,163 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) + +EXPLAIN (costs off) +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Index Scan using atsts_idx on atsts + Index Cond: ((t @@ '''wr'' & ''q'':*'::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(4 rows) + +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 
17:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 +(112 rows) -DROP TABLE atsts CASCADE; diff --git a/expected/altorder_1.out b/expected/altorder_1.out index 309f97df88..980515f58e 100644 --- a/expected/altorder_1.out +++ b/expected/altorder_1.out @@ -1,121 +1,184 @@ +/* + * ------------------------------------ + * NOTE: This test behaves 
differently + * ------------------------------------ + * + * altorder.out - test output for 64-bit systems and + * altorder_1.out - test output for 32-bit systems. + * + */ CREATE TABLE atsts (id int, t tsvector, d timestamp); \copy atsts from 'data/tsts.data' -CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_timestamp_ops, d) +-- PGPRO-2537: We need more data to test rumsort.c with logtape.c +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' +CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -ERROR: currently, RUM doesn't support order by over pass-by-reference column -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +ERROR: doesn't support order index over pass-by-reference column +INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) +(36 rows) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 371 | Tue May 17 06:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 496 | 
Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; @@ -129,37 +192,37 @@ SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) EXPLAIN (costs off) @@ -174,7 +237,7 @@ SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) EXPLAIN (costs off) @@ -189,9 +252,13 @@ SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN @@ -204,13 +271,13 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) EXPLAIN (costs off) @@ -225,13 +292,13 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) EXPLAIN (costs off) @@ -248,11 +315,11 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) EXPLAIN (costs off) @@ -268,11 +335,11 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 - 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 (5 rows) EXPLAIN (costs off) @@ -289,15 +356,42 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(36 rows) EXPLAIN (costs off) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; @@ -313,13 +407,163 @@ SELECT id, d FROM 
atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) + +EXPLAIN (costs off) +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Seq Scan on atsts + Filter: ((t @@ '''wr'' & ''q'':*'::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(4 rows) + +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 428 
| Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 +(112 rows) -DROP TABLE atsts CASCADE; diff --git a/expected/altorder_hash.out b/expected/altorder_hash.out index ccbef409fe..1011b90d0c 100644 --- a/expected/altorder_hash.out +++ b/expected/altorder_hash.out @@ -1,40 +1,49 @@ -CREATE TABLE atsts (id int, t tsvector, d timestamp); -\copy atsts from 'data/tsts.data' -CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_hash_timestamp_ops, d) +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * altorder_hash.out - test output for 64-bit systems and + * altorder_hash_1.out - test output for 32-bit systems. 
+ * + */ +CREATE TABLE atstsh (id int, t tsvector, d timestamp); +\copy atstsh from 'data/tsts.data' +CREATE INDEX atstsh_idx ON atstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; +INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 @@ -43,7 +52,7 @@ SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -53,7 +62,7 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -63,7 +72,7 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -73,19 +82,19 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; count ------- 357 (1 row) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; count ------- 153 (1 row) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -99,7 +108,7 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -112,102 +121,105 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; QUERY PLAN ------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atsts + -> Bitmap Heap Scan on atstsh Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on atsts_idx + -> Bitmap Index Scan on atstsh_idx Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) (5 rows) -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 (1 row) EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; QUERY PLAN ----------------------------------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atsts + -> Bitmap Heap Scan on atstsh Recheck Cond: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) - -> Bitmap Index Scan on atsts_idx + -> Bitmap Index Scan on atstsh_idx Index Cond: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) (5 rows) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; count ------- 357 (1 row) EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) 
FROM atstsh WHERE d > '2016-05-16 14:21:25'; QUERY PLAN ----------------------------------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atsts + -> Bitmap Heap Scan on atstsh Recheck Cond: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) - -> Bitmap Index Scan on atsts_idx + -> Bitmap Index Scan on atstsh_idx Index Cond: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) (5 rows) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; count ------- 153 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts + -> Index Scan using atstsh_idx on atstsh Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) (4 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -218,16 +230,16 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts + -> Index Scan using atstsh_idx on atstsh Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) Order By: (d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone) (4 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -238,16 +250,16 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts + -> Index Scan using atstsh_idx on atstsh Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) Order By: (d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) (4 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -258,15 +270,15 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts + -> Index Scan using atstsh_idx on atstsh Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) (3 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -277,16 +289,16 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 (5 rows) EXPLAIN (costs off) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Index Scan using atsts_idx on atsts + -> Index Scan using atstsh_idx on atstsh Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -301,16 +313,16 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER (9 rows) EXPLAIN (costs off) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Index Scan using atsts_idx on atsts + -> Index Scan using atstsh_idx on atstsh Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -323,4 +335,3 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -DROP TABLE atsts CASCADE; diff --git a/expected/altorder_hash_1.out b/expected/altorder_hash_1.out index a01a332fd7..e310fbdb89 100644 --- a/expected/altorder_hash_1.out +++ b/expected/altorder_hash_1.out @@ -1,41 +1,50 @@ -CREATE TABLE atsts (id int, t tsvector, d timestamp); -\copy atsts from 'data/tsts.data' -CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_hash_timestamp_ops, d) +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * altorder_hash.out - test output for 64-bit systems and + * altorder_hash_1.out - test output for 32-bit systems.
+ * + */ +CREATE TABLE atstsh (id int, t tsvector, d timestamp); +\copy atstsh from 'data/tsts.data' +CREATE INDEX atstsh_idx ON atstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -ERROR: currently, RUM doesn't support order by over pass-by-reference column -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; +ERROR: doesn't support order index over pass-by-reference column +INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 @@ -44,7 +53,7 @@ SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -54,7 +63,7 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -64,7 +73,7 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -74,19 +83,19 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; count ------- 357 (1 row) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; count ------- 153 (1 row) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -100,7 +109,7 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -113,97 +122,100 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; QUERY PLAN --------------------------------------------------- Aggregate - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: (t @@ '''wr'' | ''qh'''::tsquery) (3 rows) -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) -SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 (1 row) EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; QUERY PLAN ------------------------------------------------------------------------------- Aggregate - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) (3 rows) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; count ------- 357 (1 row) EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; QUERY PLAN ------------------------------------------------------------------------------- Aggregate - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) (3 rows) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 
14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; count ------- 153 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: (t @@ '''wr'' & ''qh'''::tsquery) (5 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -214,17 +226,17 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: (t @@ '''wr'' & ''qh'''::tsquery) (5 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -235,17 +247,17 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: (t @@ '''wr'' & ''qh'''::tsquery) (5 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -256,16 +268,16 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on atsts + -> Seq Scan on atstsh (4 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -276,16 +288,16 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 (5 rows) EXPLAIN (costs off) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: d - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -300,16 +312,16 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER (9 rows) EXPLAIN (costs off) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: d - -> Seq Scan on atsts + -> Seq Scan on atstsh Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -322,4 +334,3 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -DROP TABLE atsts CASCADE; diff --git a/expected/array.out b/expected/array.out new file mode 100644 index 0000000000..a2fb3bb8df --- /dev/null +++ b/expected/array.out @@ -0,0 +1,922 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * array.out - test output for 64-bit systems and + * array_1.out - test output for 32-bit systems. + * + */ +set enable_seqscan=off; +set enable_sort=off; +/* + * Complete checks for int2[].
+ */ +CREATE TABLE test_array ( + i int2[] +); +INSERT INTO test_array VALUES ('{}'), ('{0}'), ('{1,2,3,4}'), ('{1,2,3}'), ('{1,2}'), ('{1}'); +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +SELECT NULL::int[] = '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] && '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] @> '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] <@ '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] % '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] <=> '{1}'; + ?column? +---------- + +(1 row) + +INSERT INTO test_array VALUES (NULL); +SELECT * FROM test_array WHERE i = '{1}'; + i +----- + {1} +(1 row) + +DELETE FROM test_array WHERE i IS NULL; +SELECT * FROM test_array WHERE i = '{NULL}'; +ERROR: array must not contain nulls +SELECT * FROM test_array WHERE i = '{1,2,3,NULL}'; +ERROR: array must not contain nulls +SELECT * FROM test_array WHERE i = '{{1,2},{3,4}}'; +ERROR: array must have 1 dimension +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i = '{}'; + i +---- + {} +(1 row) + +SELECT * FROM test_array WHERE i = '{0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i = '{1}'; + i +----- + {1} +(1 row) + +SELECT * FROM test_array WHERE i = '{1,2}'; + i +------- + {1,2} +(1 row) + +SELECT * FROM test_array WHERE i = '{2,1}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i = '{1,2,3,3}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i = '{0,0}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i = '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i && '{}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i && '{1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i && '{3}'; + i +----------- + {1,2,3,4} + {1,2,3} +(2 rows) + +SELECT * FROM test_array WHERE i && '{4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i && '{1,2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{1,2,3}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{1,2,3,4}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{4,3,2,1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{0,0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i && '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i @> '{}'; + i +----------- + {} + {0} + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(6 rows) + +SELECT * FROM test_array WHERE i @> '{1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i @> '{2}'; + i +----------- + 
{1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i @> '{3}'; + i +----------- + {1,2,3,4} + {1,2,3} +(2 rows) + +SELECT * FROM test_array WHERE i @> '{4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{1,2,4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{1,2,3,4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{4,3,2,1}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{0,0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i @> '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i <@ '{}'; + i +---- + {} +(1 row) + +SELECT * FROM test_array WHERE i <@ '{1}'; + i +----- + {} + {1} +(2 rows) + +SELECT * FROM test_array WHERE i <@ '{2}'; + i +---- + {} +(1 row) + +SELECT * FROM test_array WHERE i <@ '{1,2,4}'; + i +------- + {} + {1,2} + {1} +(3 rows) + +SELECT * FROM test_array WHERE i <@ '{1,2,3,4}'; + i +----------- + {} + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(5 rows) + +SELECT * FROM test_array WHERE i <@ '{4,3,2,1}'; + i +----------- + {} + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(5 rows) + +SELECT * FROM test_array WHERE i <@ '{0,0}'; + i +----- + {} + {0} +(2 rows) + +SELECT * FROM test_array WHERE i <@ '{100}'; + i +---- + {} +(1 row) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i % '{}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i % '{1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i % '{1,2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{1,2,4}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{1,2,3,4}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{4,3,2,1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{1,2,3,4,5}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i % '{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i % '{1,10,20,30,40,50}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i % '{1,10,20,30}'; + i +----- + {1} +(1 row) + +SELECT * FROM test_array WHERE i % '{1,1,1,1,1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{0,0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i % '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{1}'::smallint[]) + Order By: (i <=> '{1}'::smallint[]) +(3 rows) + +SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; + i +----------- + {1} + {1,2} + {1,2,3} + {1,2,3,4} +(4 rows) + +DROP INDEX idx_array; 
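+/* + * The next block switches from rum_anyarray_ops to the "addon" opclass. + * With rum_anyarray_addon_ops the values of a second column are stored in + * the index as additional information attached to the array keys, so one + * index can both filter by array operators and order by the attached + * column. In the options below, attach = 'add_info' names the column whose + * values are stored, and to = 'i' names the indexed column they are + * attached to. A minimal illustrative sketch (the events table and its + * columns are hypothetical, not part of this test suite): + * + *   CREATE TABLE events (tags int2[], created timestamp); + *   CREATE INDEX events_idx ON events + *   USING rum (tags rum_anyarray_addon_ops, created) + *   WITH (attach = 'created', to = 'tags'); + *   -- filter with array operators, order by distance to a timestamp: + *   SELECT * FROM events WHERE tags @> '{1}' + *   ORDER BY created <=> '2016-05-16 14:21:25' LIMIT 10; + */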
+ALTER TABLE test_array ADD COLUMN add_info timestamp; +CREATE INDEX idx_array ON test_array +USING rum (i rum_anyarray_addon_ops, add_info) +WITH (attach = 'add_info', to = 'i'); +WITH q as ( + SELECT row_number() OVER (ORDER BY i) idx, ctid FROM test_array +) +UPDATE test_array SET add_info = '2016-05-16 14:21:25'::timestamp + + format('%s days', q.idx)::interval +FROM q WHERE test_array.ctid = q.ctid; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +---------------------------------- + Seq Scan on test_array + Filter: (i % '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY add_info <=> '2016-05-16 14:21:25' LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------ + Limit + -> Index Scan using idx_array on test_array + Index Cond: (i && '{1}'::smallint[]) + Order By: (add_info <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT * FROM test_array WHERE i && '{1}' ORDER BY add_info <=> '2016-05-16 14:21:25' LIMIT 10; + i | add_info +-----------+-------------------------- + {1} | Thu May 19 14:21:25 2016 + {1,2} | Fri May 20 14:21:25 2016 + {1,2,3} | Sat May 21 14:21:25 2016 + {1,2,3,4} | Sun May 22 14:21:25 2016 +(4 rows) + +DROP INDEX idx_array; +/* + * Sanity checks for popular array types. 
+ */ +ALTER TABLE test_array ALTER COLUMN i TYPE int4[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::integer[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE int8[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::bigint[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE text[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::text[]) 
+(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE varchar[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::character varying[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE char[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::bpchar[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE numeric[] USING i::numeric[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE 
i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::numeric[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE float4[] USING i::float4[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::real[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE float8[] USING i::float8[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +---------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::double precision[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::double precision[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::double precision[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::double precision[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +---------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::double precision[]) +(2 rows) + +DROP INDEX idx_array; +/* + * Check ordering using distance operator + * + * We want to check that index scan provides us with correct ordering by the + * distance operator. File 'data/rum_array.data' contains two arrays that + * satisfy i @> '{23,20}' and have finite distance i <=> '{51}', and a bunch + * of arrays that satisfy i @> '{23,20}' and have infinite distance + * i <=> '{51}'. + * + * When ordering by distance the order of this bunch of arrays with infinite + * distance is not determined and may depend on PostgreSQL version and system. + * We don't add another sort expression to ORDER BY because that might cause + * the planner to avoid using the index. Instead, we replace arrays that have + * infinite distance with {-1} to unambiguously determine the test output. + * + * 'Infinity' is printed differently in the output in different PostgreSQL + * versions, so we replace it with -1.
+ */ +CREATE TABLE test_array_order ( + i int2[] +); +\copy test_array_order(i) from 'data/rum_array.data'; +CREATE INDEX idx_array_order ON test_array_order USING rum (i rum_anyarray_ops); +/* + * Check that plan of the query uses ordering provided by index scan + */ +EXPLAIN (COSTS OFF) +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + QUERY PLAN +------------------------------------------------------------ + Subquery Scan on t + -> Index Scan using idx_array_order on test_array_order + Index Cond: (i @> '{23,20}'::smallint[]) + Order By: (i <=> '{51}'::smallint[]) +(4 rows) + +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + i | distance +---------------------+------------------ + {20,23,51} | 1.73205080756888 + {33,51,20,77,23,65} | 2.44948974278318 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 +(12 rows) + diff --git a/expected/array_1.out b/expected/array_1.out new file mode 100644 index 0000000000..cc5f93307c --- /dev/null +++ b/expected/array_1.out @@ -0,0 +1,915 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * array.out - test output for 64-bit systems and + * array_1.out - test output for 32-bit systems. + * + */ +set enable_seqscan=off; +set enable_sort=off; +/* + * Complete checks for int2[]. + */ +CREATE TABLE test_array ( + i int2[] +); +INSERT INTO test_array VALUES ('{}'), ('{0}'), ('{1,2,3,4}'), ('{1,2,3}'), ('{1,2}'), ('{1}'); +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +SELECT NULL::int[] = '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] && '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] @> '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] <@ '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] % '{1}'; + ?column? +---------- + +(1 row) + +SELECT NULL::int[] <=> '{1}'; + ?column?
+---------- + +(1 row) + +INSERT INTO test_array VALUES (NULL); +SELECT * FROM test_array WHERE i = '{1}'; + i +----- + {1} +(1 row) + +DELETE FROM test_array WHERE i IS NULL; +SELECT * FROM test_array WHERE i = '{NULL}'; +ERROR: array must not contain nulls +SELECT * FROM test_array WHERE i = '{1,2,3,NULL}'; +ERROR: array must not contain nulls +SELECT * FROM test_array WHERE i = '{{1,2},{3,4}}'; +ERROR: array must have 1 dimension +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i = '{}'; + i +---- + {} +(1 row) + +SELECT * FROM test_array WHERE i = '{0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i = '{1}'; + i +----- + {1} +(1 row) + +SELECT * FROM test_array WHERE i = '{1,2}'; + i +------- + {1,2} +(1 row) + +SELECT * FROM test_array WHERE i = '{2,1}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i = '{1,2,3,3}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i = '{0,0}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i = '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i && '{}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i && '{1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i && '{3}'; + i +----------- + {1,2,3,4} + {1,2,3} +(2 rows) + +SELECT * FROM test_array WHERE i && '{4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i && '{1,2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{1,2,3}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{1,2,3,4}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{4,3,2,1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i && '{0,0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i && '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i @> '{}'; + i +----------- + {} + {0} + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(6 rows) + +SELECT * FROM test_array WHERE i @> '{1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i @> '{2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i @> '{3}'; + i +----------- + {1,2,3,4} + {1,2,3} +(2 rows) + +SELECT * FROM test_array WHERE i @> '{4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{1,2,4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{1,2,3,4}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{4,3,2,1}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i @> '{0,0}'; + i +----- + {0} +(1 row) + +SELECT * FROM 
test_array WHERE i @> '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i <@ '{}'; + i +---- + {} +(1 row) + +SELECT * FROM test_array WHERE i <@ '{1}'; + i +----- + {} + {1} +(2 rows) + +SELECT * FROM test_array WHERE i <@ '{2}'; + i +---- + {} +(1 row) + +SELECT * FROM test_array WHERE i <@ '{1,2,4}'; + i +------- + {} + {1,2} + {1} +(3 rows) + +SELECT * FROM test_array WHERE i <@ '{1,2,3,4}'; + i +----------- + {} + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(5 rows) + +SELECT * FROM test_array WHERE i <@ '{4,3,2,1}'; + i +----------- + {} + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(5 rows) + +SELECT * FROM test_array WHERE i <@ '{0,0}'; + i +----- + {} + {0} +(2 rows) + +SELECT * FROM test_array WHERE i <@ '{100}'; + i +---- + {} +(1 row) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::smallint[]) +(2 rows) + +SELECT * FROM test_array WHERE i % '{}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i % '{1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i % '{1,2}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{1,2,4}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{1,2,3,4}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{4,3,2,1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{1,2,3,4,5}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} +(3 rows) + +SELECT * FROM test_array WHERE i % '{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}'; + i +----------- + {1,2,3,4} +(1 row) + +SELECT * FROM test_array WHERE i % '{1,10,20,30,40,50}'; + i +--- +(0 rows) + +SELECT * FROM test_array WHERE i % '{1,10,20,30}'; + i +----- + {1} +(1 row) + +SELECT * FROM test_array WHERE i % '{1,1,1,1,1}'; + i +----------- + {1,2,3,4} + {1,2,3} + {1,2} + {1} +(4 rows) + +SELECT * FROM test_array WHERE i % '{0,0}'; + i +----- + {0} +(1 row) + +SELECT * FROM test_array WHERE i % '{100}'; + i +--- +(0 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{1}'::smallint[]) + Order By: (i <=> '{1}'::smallint[]) +(3 rows) + +SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; + i +----------- + {1} + {1,2} + {1,2,3} + {1,2,3,4} +(4 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ADD COLUMN add_info timestamp; +CREATE INDEX idx_array ON test_array +USING rum (i rum_anyarray_addon_ops, add_info) +WITH (attach = 'add_info', to = 'i'); +WITH q as ( + SELECT row_number() OVER (ORDER BY i) idx, ctid FROM test_array +) +UPDATE test_array SET add_info = '2016-05-16 14:21:25'::timestamp + + format('%s days', q.idx)::interval +FROM q WHERE test_array.ctid = q.ctid; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index 
Cond: (i = '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +---------------------------------- + Seq Scan on test_array + Filter: (i % '{}'::smallint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY add_info <=> '2016-05-16 14:21:25' LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------ + Limit + -> Index Scan using idx_array on test_array + Index Cond: (i && '{1}'::smallint[]) + Order By: (add_info <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT * FROM test_array WHERE i && '{1}' ORDER BY add_info <=> '2016-05-16 14:21:25' LIMIT 10; +ERROR: doesn't support order by over pass-by-reference column +DROP INDEX idx_array; +/* + * Sanity checks for popular array types. + */ +ALTER TABLE test_array ALTER COLUMN i TYPE int4[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::integer[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::integer[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE int8[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN 
+------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::bigint[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::bigint[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE text[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::text[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::text[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE varchar[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::character varying[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::character varying[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE char[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: 
(i @> '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::bpchar[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::bpchar[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE numeric[] USING i::numeric[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::numeric[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::numeric[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE float4[] USING i::float4[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i @> '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i <@ '{}'::real[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + QUERY PLAN +------------------------------------------ + Index Scan using idx_array on test_array + Index Cond: (i % '{}'::real[]) +(2 rows) + +DROP INDEX idx_array; +ALTER TABLE test_array ALTER COLUMN i TYPE float8[] USING i::float8[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; + QUERY PLAN +---------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i = '{}'::double precision[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; + QUERY PLAN +----------------------------------------------- + Index Scan using idx_array on test_array + Index Cond: (i && '{}'::double precision[]) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> 
'{}';
+ QUERY PLAN
+-----------------------------------------------
+ Index Scan using idx_array on test_array
+ Index Cond: (i @> '{}'::double precision[])
+(2 rows)
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}';
+ QUERY PLAN
+-----------------------------------------------
+ Index Scan using idx_array on test_array
+ Index Cond: (i <@ '{}'::double precision[])
+(2 rows)
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}';
+ QUERY PLAN
+----------------------------------------------
+ Index Scan using idx_array on test_array
+ Index Cond: (i % '{}'::double precision[])
+(2 rows)
+
+DROP INDEX idx_array;
+/*
+ * Check ordering using distance operator
+ *
+ * We want to check that the index scan provides the correct ordering by the
+ * distance operator. File 'data/rum_array.data' contains two arrays that
+ * satisfy i @> '{23,20}' and have finite distance i <=> '{51}', and a bunch
+ * of arrays that satisfy i @> '{23,20}' and have infinite distance
+ * i <=> '{51}'.
+ *
+ * When ordering by distance, the order of this bunch of arrays with infinite
+ * distance is not determined and may depend on the PostgreSQL version and
+ * system. We don't add another sort expression to ORDER BY because that
+ * might cause the planner to avoid using the index. Instead, we replace
+ * arrays that have infinite distance with {-1} to unambiguously determine
+ * the test output.
+ *
+ * 'Infinity' is printed differently in the output in different PostgreSQL
+ * versions, so we replace it with -1.
+ */
+CREATE TABLE test_array_order (
+ i int2[]
+);
+\copy test_array_order(i) from 'data/rum_array.data';
+CREATE INDEX idx_array_order ON test_array_order USING rum (i rum_anyarray_ops);
+/*
+ * Check that the query plan uses the ordering provided by the index scan
+ */
+EXPLAIN (COSTS OFF)
+SELECT
+ CASE WHEN distance = 'Infinity' THEN '{-1}'
+ ELSE i
+ END i,
+ CASE WHEN distance = 'Infinity' THEN -1
+ ELSE distance::numeric(18,14)
+ END distance
+ FROM
+ (SELECT *, (i <=> '{51}') AS distance
+ FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t;
+ QUERY PLAN
+------------------------------------------------------------
+ Subquery Scan on t
+ -> Index Scan using idx_array_order on test_array_order
+ Index Cond: (i @> '{23,20}'::smallint[])
+ Order By: (i <=> '{51}'::smallint[])
+(4 rows)
+
+SELECT
+ CASE WHEN distance = 'Infinity' THEN '{-1}'
+ ELSE i
+ END i,
+ CASE WHEN distance = 'Infinity' THEN -1
+ ELSE distance::numeric(18,14)
+ END distance
+ FROM
+ (SELECT *, (i <=> '{51}') AS distance
+ FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t;
+ i | distance
+---------------------+------------------
+ {20,23,51} | 1.73205080756888
+ {33,51,20,77,23,65} | 2.44948974278318
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+ {-1} | -1
+(12 rows)
+
diff --git a/expected/bit.out b/expected/bit.out
new file mode 100644
index 0000000000..4a1ff72037
--- /dev/null
+++ b/expected/bit.out
@@ -0,0 +1,44 @@
+set enable_seqscan=off;
+CREATE TABLE test_bit (
+ i bit(3)
+);
+INSERT INTO test_bit VALUES ('001'),('010'),('011'),('100'),('101'),('110');
+CREATE INDEX idx_bit ON test_bit USING rum (i);
+SELECT * FROM test_bit WHERE i<'100'::bit(3) ORDER BY i;
+ i
+-----
+ 001
+ 010
+ 011
+(3 rows)
+
+SELECT * FROM test_bit WHERE i<='100'::bit(3) ORDER BY i;
+ i
+-----
+ 001
+ 010
+ 011
+ 100
+(4 rows)
+
+SELECT * FROM test_bit WHERE i='100'::bit(3) ORDER BY i;
+ i
+-----
+ 100
+(1 row)
+
+SELECT * FROM test_bit WHERE i>='100'::bit(3) ORDER
BY i; + i +----- + 100 + 101 + 110 +(3 rows) + +SELECT * FROM test_bit WHERE i>'100'::bit(3) ORDER BY i; + i +----- + 101 + 110 +(2 rows) + diff --git a/expected/bytea.out b/expected/bytea.out new file mode 100644 index 0000000000..f76865901e --- /dev/null +++ b/expected/bytea.out @@ -0,0 +1,46 @@ +set enable_seqscan=off; +-- ensure consistent test output regardless of the default bytea format +SET bytea_output TO escape; +CREATE TABLE test_bytea ( + i bytea +); +INSERT INTO test_bytea VALUES ('a'),('ab'),('abc'),('abb'),('axy'),('xyz'); +CREATE INDEX idx_bytea ON test_bytea USING rum (i); +SELECT * FROM test_bytea WHERE i<'abc'::bytea ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_bytea WHERE i<='abc'::bytea ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_bytea WHERE i='abc'::bytea ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_bytea WHERE i>='abc'::bytea ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_bytea WHERE i>'abc'::bytea ORDER BY i; + i +----- + axy + xyz +(2 rows) + diff --git a/expected/char.out b/expected/char.out new file mode 100644 index 0000000000..ccc8fb0415 --- /dev/null +++ b/expected/char.out @@ -0,0 +1,37 @@ +set enable_seqscan=off; +CREATE TABLE test_char ( + i "char" +); +INSERT INTO test_char VALUES ('a'),('b'),('c'),('d'),('e'),('f'); +CREATE INDEX idx_char ON test_char USING rum (i); +SELECT * FROM test_char WHERE i<'d'::"char" ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_char WHERE i<='d'::"char" ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_char WHERE i='d'::"char" ORDER BY i; + i +--- + d +(1 row) + +SELECT * FROM test_char WHERE i>='d'::"char" ORDER BY i; + i +--- + d + e + f +(3 rows) + +SELECT * FROM test_char WHERE i>'d'::"char" ORDER BY i; + i +--- + e + f +(2 rows) + diff --git a/expected/cidr.out b/expected/cidr.out new file mode 100644 index 0000000000..1765b944bd --- /dev/null +++ b/expected/cidr.out @@ -0,0 +1,51 @@ +set enable_seqscan=off; +CREATE TABLE test_cidr ( + i cidr +); +INSERT INTO test_cidr VALUES + ( '1.2.3.4' ), + ( '1.2.4.4' ), + ( '1.2.5.4' ), + ( '1.2.6.4' ), + ( '1.2.7.4' ), + ( '1.2.8.4' ) +; +CREATE INDEX idx_cidr ON test_cidr USING rum (i); +SELECT * FROM test_cidr WHERE i<'1.2.6.4'::cidr ORDER BY i; + i +------------ + 1.2.3.4/32 + 1.2.4.4/32 + 1.2.5.4/32 +(3 rows) + +SELECT * FROM test_cidr WHERE i<='1.2.6.4'::cidr ORDER BY i; + i +------------ + 1.2.3.4/32 + 1.2.4.4/32 + 1.2.5.4/32 + 1.2.6.4/32 +(4 rows) + +SELECT * FROM test_cidr WHERE i='1.2.6.4'::cidr ORDER BY i; + i +------------ + 1.2.6.4/32 +(1 row) + +SELECT * FROM test_cidr WHERE i>='1.2.6.4'::cidr ORDER BY i; + i +------------ + 1.2.6.4/32 + 1.2.7.4/32 + 1.2.8.4/32 +(3 rows) + +SELECT * FROM test_cidr WHERE i>'1.2.6.4'::cidr ORDER BY i; + i +------------ + 1.2.7.4/32 + 1.2.8.4/32 +(2 rows) + diff --git a/expected/date.out b/expected/date.out new file mode 100644 index 0000000000..04247310c5 --- /dev/null +++ b/expected/date.out @@ -0,0 +1,51 @@ +set enable_seqscan=off; +CREATE TABLE test_date ( + i date +); +INSERT INTO test_date VALUES + ( '2004-10-23' ), + ( '2004-10-24' ), + ( '2004-10-25' ), + ( '2004-10-26' ), + ( '2004-10-27' ), + ( '2004-10-28' ) +; +CREATE INDEX idx_date ON test_date USING rum (i); +SELECT * FROM test_date WHERE i<'2004-10-26'::date ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 +(3 rows) + +SELECT * FROM test_date WHERE i<='2004-10-26'::date ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 
10-26-2004 +(4 rows) + +SELECT * FROM test_date WHERE i='2004-10-26'::date ORDER BY i; + i +------------ + 10-26-2004 +(1 row) + +SELECT * FROM test_date WHERE i>='2004-10-26'::date ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 +(3 rows) + +SELECT * FROM test_date WHERE i>'2004-10-26'::date ORDER BY i; + i +------------ + 10-27-2004 + 10-28-2004 +(2 rows) + diff --git a/expected/expr.out b/expected/expr.out new file mode 100644 index 0000000000..b57de73ff4 --- /dev/null +++ b/expected/expr.out @@ -0,0 +1,26 @@ +CREATE TABLE documents ( + en text not null, + score float not null, + textsearch_index_en_col tsvector +); +INSERT INTO documents VALUES ('the pet cat is in the shed', 56, to_tsvector('english', 'the pet cat is in the shed')); +CREATE INDEX textsearch_index_en ON documents + USING rum (textsearch_index_en_col rum_tsvector_addon_ops, score) + WITH (attach = 'score', to = 'textsearch_index_en_col'); +SET enable_seqscan=off; +-- should be 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + en | score | textsearch_index_en_col +----------------------------+-------+-------------------------- + the pet cat is in the shed | 56 | 'cat':3 'pet':2 'shed':7 +(1 row) + +SET enable_seqscan=on; +-- 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + en | score | textsearch_index_en_col +----------------------------+-------+-------------------------- + the pet cat is in the shed | 56 | 'cat':3 'pet':2 'shed':7 +(1 row) + +DROP TABLE documents; diff --git a/expected/float4.out b/expected/float4.out new file mode 100644 index 0000000000..59e036d35d --- /dev/null +++ b/expected/float4.out @@ -0,0 +1,80 @@ +set enable_seqscan=off; +CREATE TABLE test_float4 ( + i float4 +); +INSERT INTO test_float4 VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_float4 ON test_float4 USING rum (i); +SELECT * FROM test_float4 WHERE i<1::float4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_float4 WHERE i<=1::float4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_float4 WHERE i=1::float4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_float4 WHERE i>=1::float4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_float4 WHERE i>1::float4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::float4 FROM test_float4 ORDER BY i <=> 0::float4; + QUERY PLAN +-------------------------------------------- + Index Scan using idx_float4 on test_float4 + Order By: (i <=> '0'::real) +(2 rows) + +SELECT *, i <=> 0::float4 FROM test_float4 ORDER BY i <=> 0::float4; + i | ?column? +----+---------- + 0 | 0 + -1 | 1 + 1 | 1 + -2 | 2 + 2 | 2 + 3 | 3 +(6 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 1::float4 FROM test_float4 WHERE i<1::float4 ORDER BY i <=> 1::float4; + QUERY PLAN +-------------------------------------------- + Index Scan using idx_float4 on test_float4 + Index Cond: (i < '1'::real) + Order By: (i <=> '1'::real) +(3 rows) + +SELECT *, i <=> 1::float4 FROM test_float4 WHERE i<1::float4 ORDER BY i <=> 1::float4; + i | ?column? 
+----+----------
+ 0 | 1
+ -1 | 2
+ -2 | 3
+(3 rows)
+
diff --git a/expected/float8.out b/expected/float8.out
new file mode 100644
index 0000000000..fdca51343a
--- /dev/null
+++ b/expected/float8.out
@@ -0,0 +1,89 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * float8.out - test output for 64-bit systems and
+ * float8_1.out - test output for 32-bit systems.
+ *
+ */
+set enable_seqscan=off;
+CREATE TABLE test_float8 (
+ i float8
+);
+INSERT INTO test_float8 VALUES (-2),(-1),(0),(1),(2),(3);
+CREATE INDEX idx_float8 ON test_float8 USING rum (i);
+SELECT * FROM test_float8 WHERE i<1::float8 ORDER BY i;
+ i
+----
+ -2
+ -1
+ 0
+(3 rows)
+
+SELECT * FROM test_float8 WHERE i<=1::float8 ORDER BY i;
+ i
+----
+ -2
+ -1
+ 0
+ 1
+(4 rows)
+
+SELECT * FROM test_float8 WHERE i=1::float8 ORDER BY i;
+ i
+---
+ 1
+(1 row)
+
+SELECT * FROM test_float8 WHERE i>=1::float8 ORDER BY i;
+ i
+---
+ 1
+ 2
+ 3
+(3 rows)
+
+SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i;
+ i
+---
+ 2
+ 3
+(2 rows)
+
+EXPLAIN (costs off)
+SELECT *, i <=> 0::float8 FROM test_float8 ORDER BY i <=> 0::float8;
+ QUERY PLAN
+--------------------------------------------
+ Index Scan using idx_float8 on test_float8
+ Order By: (i <=> '0'::double precision)
+(2 rows)
+
+SELECT *, i <=> 0::float8 FROM test_float8 ORDER BY i <=> 0::float8;
+ i | ?column?
+----+----------
+ 0 | 0
+ -1 | 1
+ 1 | 1
+ -2 | 2
+ 2 | 2
+ 3 | 3
+(6 rows)
+
+EXPLAIN (costs off)
+SELECT *, i <=> 1::float8 FROM test_float8 WHERE i<1::float8 ORDER BY i <=> 1::float8;
+ QUERY PLAN
+--------------------------------------------
+ Index Scan using idx_float8 on test_float8
+ Index Cond: (i < '1'::double precision)
+ Order By: (i <=> '1'::double precision)
+(3 rows)
+
+SELECT *, i <=> 1::float8 FROM test_float8 WHERE i<1::float8 ORDER BY i <=> 1::float8;
+ i | ?column?
+----+----------
+ 0 | 1
+ -1 | 2
+ -2 | 3
+(3 rows)
+
diff --git a/expected/float8_1.out b/expected/float8_1.out
new file mode 100644
index 0000000000..b421dcf311
--- /dev/null
+++ b/expected/float8_1.out
@@ -0,0 +1,74 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * float8.out - test output for 64-bit systems and
+ * float8_1.out - test output for 32-bit systems.
+ * + */ +set enable_seqscan=off; +CREATE TABLE test_float8 ( + i float8 +); +INSERT INTO test_float8 VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_float8 ON test_float8 USING rum (i); +SELECT * FROM test_float8 WHERE i<1::float8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_float8 WHERE i<=1::float8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_float8 WHERE i=1::float8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_float8 WHERE i>=1::float8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::float8 FROM test_float8 ORDER BY i <=> 0::float8; + QUERY PLAN +-------------------------------------------- + Index Scan using idx_float8 on test_float8 + Order By: (i <=> '0'::double precision) +(2 rows) + +SELECT *, i <=> 0::float8 FROM test_float8 ORDER BY i <=> 0::float8; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT *, i <=> 1::float8 FROM test_float8 WHERE i<1::float8 ORDER BY i <=> 1::float8; + QUERY PLAN +-------------------------------------------- + Index Scan using idx_float8 on test_float8 + Index Cond: (i < '1'::double precision) + Order By: (i <=> '1'::double precision) +(3 rows) + +SELECT *, i <=> 1::float8 FROM test_float8 WHERE i<1::float8 ORDER BY i <=> 1::float8; +ERROR: doesn't support order by over pass-by-reference column diff --git a/expected/inet.out b/expected/inet.out new file mode 100644 index 0000000000..fef9e0f7f5 --- /dev/null +++ b/expected/inet.out @@ -0,0 +1,51 @@ +set enable_seqscan=off; +CREATE TABLE test_inet ( + i inet +); +INSERT INTO test_inet VALUES + ( '1.2.3.4/16' ), + ( '1.2.4.4/16' ), + ( '1.2.5.4/16' ), + ( '1.2.6.4/16' ), + ( '1.2.7.4/16' ), + ( '1.2.8.4/16' ) +; +CREATE INDEX idx_inet ON test_inet USING rum (i); +SELECT * FROM test_inet WHERE i<'1.2.6.4/16'::inet ORDER BY i; + i +------------ + 1.2.3.4/16 + 1.2.4.4/16 + 1.2.5.4/16 +(3 rows) + +SELECT * FROM test_inet WHERE i<='1.2.6.4/16'::inet ORDER BY i; + i +------------ + 1.2.3.4/16 + 1.2.4.4/16 + 1.2.5.4/16 + 1.2.6.4/16 +(4 rows) + +SELECT * FROM test_inet WHERE i='1.2.6.4/16'::inet ORDER BY i; + i +------------ + 1.2.6.4/16 +(1 row) + +SELECT * FROM test_inet WHERE i>='1.2.6.4/16'::inet ORDER BY i; + i +------------ + 1.2.6.4/16 + 1.2.7.4/16 + 1.2.8.4/16 +(3 rows) + +SELECT * FROM test_inet WHERE i>'1.2.6.4/16'::inet ORDER BY i; + i +------------ + 1.2.7.4/16 + 1.2.8.4/16 +(2 rows) + diff --git a/expected/int2.out b/expected/int2.out new file mode 100644 index 0000000000..68f66a3c0b --- /dev/null +++ b/expected/int2.out @@ -0,0 +1,80 @@ +set enable_seqscan=off; +CREATE TABLE test_int2 ( + i int2 +); +INSERT INTO test_int2 VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_int2 ON test_int2 USING rum (i); +SELECT * FROM test_int2 WHERE i<1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int2 WHERE i<=1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int2 WHERE i=1::int2 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i>=1::int2 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int2 WHERE i>1::int2 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::int2 FROM test_int2 ORDER BY i <=> 0::int2; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int2 on test_int2 + Order By: (i <=> '0'::smallint) +(2 rows) + +SELECT *, i <=> 0::int2 
FROM test_int2 ORDER BY i <=> 0::int2; + i | ?column? +----+---------- + 0 | 0 + -1 | 1 + 1 | 1 + -2 | 2 + 2 | 2 + 3 | 3 +(6 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 1::int2 FROM test_int2 WHERE i<1::int2 ORDER BY i <=> 1::int2; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int2 on test_int2 + Index Cond: (i < '1'::smallint) + Order By: (i <=> '1'::smallint) +(3 rows) + +SELECT *, i <=> 1::int2 FROM test_int2 WHERE i<1::int2 ORDER BY i <=> 1::int2; + i | ?column? +----+---------- + 0 | 1 + -1 | 2 + -2 | 3 +(3 rows) + diff --git a/expected/int4.out b/expected/int4.out new file mode 100644 index 0000000000..00b73e3432 --- /dev/null +++ b/expected/int4.out @@ -0,0 +1,709 @@ +set enable_seqscan=off; +CREATE TABLE test_int4 ( + i int4 +); +INSERT INTO test_int4 VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_int4 ON test_int4 USING rum (i); +SELECT * FROM test_int4 WHERE i<1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int4 WHERE i<=1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int4 WHERE i=1::int4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int4 WHERE i>=1::int4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int4 WHERE i>1::int4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::int4 FROM test_int4 ORDER BY i <=> 0::int4; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int4 on test_int4 + Order By: (i <=> 0) +(2 rows) + +SELECT *, i <=> 0::int4 FROM test_int4 ORDER BY i <=> 0::int4; + i | ?column? +----+---------- + 0 | 0 + -1 | 1 + 1 | 1 + -2 | 2 + 2 | 2 + 3 | 3 +(6 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 1::int4 FROM test_int4 WHERE i<1::int4 ORDER BY i <=> 1::int4; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int4 on test_int4 + Index Cond: (i < 1) + Order By: (i <=> 1) +(3 rows) + +SELECT *, i <=> 1::int4 FROM test_int4 WHERE i<1::int4 ORDER BY i <=> 1::int4; + i | ?column? +----+---------- + 0 | 1 + -1 | 2 + -2 | 3 +(3 rows) + +CREATE TABLE test_int4_o AS SELECT id::int4, t FROM tsts; +CREATE INDEX test_int4_o_idx ON test_int4_o USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +SELECT id, id <=| 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +SELECT id, id |=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int4_o_idx on test_int4_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> 400) +(4 rows) + +SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int4_o_idx on test_int4_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| 400) +(4 rows) + +SELECT id, id <=| 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int4_o_idx on test_int4_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> 400) +(4 rows) + +SELECT id, id |=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int4_o_idx on test_int4_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= 400)) +(4 rows) + +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int4_o_idx on test_int4_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= 400)) +(4 rows) + +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int4_a AS SELECT id::int4, t FROM tsts; +CREATE INDEX test_int4_a_idx ON test_int4_a USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +EXPLAIN (costs off) +SELECT count(*) FROM test_int4_a WHERE id < 400; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using test_int4_a_idx on test_int4_a + Index Cond: (id < 400) +(3 rows) + +SELECT count(*) FROM test_int4_a WHERE id < 400; + count +------- + 401 +(1 row) + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int4_a_idx on test_int4_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> 400) +(4 rows) + +SELECT id, id <=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int4_a_idx on test_int4_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| 400) +(4 rows) + +SELECT id, id <=| 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int4_a_idx on test_int4_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> 400) +(4 rows) + +SELECT id, id |=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int4_a_idx on test_int4_a + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= 400)) +(4 rows) + +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int4_a_idx on test_int4_a + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= 400)) +(4 rows) + +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int4_h_o AS SELECT id::int4, t FROM tsts; +CREATE INDEX test_int4_h_o_idx ON test_int4_h_o USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +SELECT id, id <=| 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +SELECT id, id |=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int4_h_o_idx on test_int4_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> 400) +(4 rows) + +SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int4_h_o_idx on test_int4_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| 400) +(4 rows) + +SELECT id, id <=| 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? 
+-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int4_h_o_idx on test_int4_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> 400) +(4 rows) + +SELECT id, id |=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int4_h_o_idx on test_int4_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= 400)) +(4 rows) + +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int4_h_o_idx on test_int4_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= 400)) +(4 rows) + +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int4_h_a AS SELECT id::int4, t FROM tsts; +CREATE INDEX test_int4_h_a_idx ON test_int4_h_a USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +EXPLAIN (costs off) +SELECT count(*) FROM test_int4_h_a WHERE id < 400; + QUERY PLAN +----------------------------------------------------------- + Aggregate + -> Index Scan using test_int4_h_a_idx on test_int4_h_a + Index Cond: (id < 400) +(3 rows) + +SELECT count(*) FROM test_int4_h_a WHERE id < 400; + count +------- + 401 +(1 row) + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int4_h_a_idx on test_int4_h_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> 400) +(4 rows) + +SELECT id, id <=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int4_h_a_idx on test_int4_h_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| 400) +(4 rows) + +SELECT id, id <=| 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? 
+-----+----------
+ 371 | 29
+ 355 | 45
+ 354 | 46
+ 252 | 148
+ 232 | 168
+(5 rows)
+
+EXPLAIN (costs off)
+SELECT id, id |=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5;
+ QUERY PLAN
+-----------------------------------------------------------
+ Limit
+ -> Index Scan using test_int4_h_a_idx on test_int4_h_a
+ Index Cond: (t @@ '''wr'' & ''qh'''::tsquery)
+ Order By: (id |=> 400)
+(4 rows)
+
+SELECT id, id |=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5;
+ id | ?column?
+-----+----------
+ 406 | 6
+ 415 | 15
+ 428 | 28
+ 457 | 57
+ 458 | 58
+(5 rows)
+
+EXPLAIN (costs off)
+SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id;
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Sort
+ Sort Key: id
+ -> Index Scan using test_int4_h_a_idx on test_int4_h_a
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= 400))
+(4 rows)
+
+SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id;
+ id
+-----
+ 16
+ 39
+ 71
+ 135
+ 168
+ 232
+ 252
+ 354
+ 355
+ 371
+(10 rows)
+
+EXPLAIN (costs off)
+SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id;
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Sort
+ Sort Key: id
+ -> Index Scan using test_int4_h_a_idx on test_int4_h_a
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= 400))
+(4 rows)
+
+SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id;
+ id
+-----
+ 406
+ 415
+ 428
+ 457
+ 458
+ 484
+ 496
+(7 rows)
+
+CREATE TABLE test_int4_id_t AS SELECT id::int4, t FROM tsts;
+CREATE INDEX test_int4_id_t_idx ON test_int4_o USING rum
+ (t rum_tsvector_ops, id);
+EXPLAIN (costs off)
+SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400::int4 ORDER BY id <=> 400::int4;
+ QUERY PLAN
+-------------------------------------------------------------------
+ Index Scan using test_int4_h_a_idx on test_int4_h_a
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= 400))
+ Order By: (id <=> 400)
+(3 rows)
+
+SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400::int4 ORDER BY id <=> 400::int4;
+ id
+-----
+ 371
+ 355
+ 354
+ 252
+ 232
+ 168
+ 135
+ 71
+ 39
+ 16
+(10 rows)
+
diff --git a/expected/int8.out b/expected/int8.out
new file mode 100644
index 0000000000..663162a18e
--- /dev/null
+++ b/expected/int8.out
@@ -0,0 +1,718 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * int8.out - test output for 64-bit systems and
+ * int8_1.out - test output for 32-bit systems.
+ * + */ +set enable_seqscan=off; +CREATE TABLE test_int8 ( + i int8 +); +INSERT INTO test_int8 VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_int8 ON test_int8 USING rum (i); +SELECT * FROM test_int8 WHERE i<1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int8 WHERE i<=1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int8 WHERE i=1::int8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int8 WHERE i>=1::int8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::int8 FROM test_int8 ORDER BY i <=> 0::int8; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int8 on test_int8 + Order By: (i <=> '0'::bigint) +(2 rows) + +SELECT *, i <=> 0::int8 FROM test_int8 ORDER BY i <=> 0::int8; + i | ?column? +----+---------- + 0 | 0 + -1 | 1 + 1 | 1 + -2 | 2 + 2 | 2 + 3 | 3 +(6 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 1::int8 FROM test_int8 WHERE i<1::int8 ORDER BY i <=> 1::int8; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int8 on test_int8 + Index Cond: (i < '1'::bigint) + Order By: (i <=> '1'::bigint) +(3 rows) + +SELECT *, i <=> 1::int8 FROM test_int8 WHERE i<1::int8 ORDER BY i <=> 1::int8; + i | ?column? +----+---------- + 0 | 1 + -1 | 2 + -2 | 3 +(3 rows) + +CREATE TABLE test_int8_o AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_o_idx ON test_int8_o USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> '400'::bigint) +(4 rows) + +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| '400'::bigint) +(4 rows) + +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> '400'::bigint) +(4 rows) + +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_a AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_a_idx ON test_int8_a USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +EXPLAIN (costs off) +SELECT count(*) FROM test_int8_a WHERE id < 400::int8; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using test_int8_a_idx on test_int8_a + Index Cond: (id < '400'::bigint) +(3 rows) + +SELECT count(*) FROM test_int8_a WHERE id < 400::int8; + count +------- + 401 +(1 row) + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_a_idx on test_int8_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> '400'::bigint) +(4 rows) + +SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_a_idx on test_int8_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| '400'::bigint) +(4 rows) + +SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_a_idx on test_int8_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> '400'::bigint) +(4 rows) + +SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_a_idx on test_int8_a + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_a_idx on test_int8_a + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_h_o AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_h_o_idx ON test_int8_h_o USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> '400'::bigint) +(4 rows) + +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| '400'::bigint) +(4 rows) + +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> '400'::bigint) +(4 rows) + +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_h_a AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_h_a_idx ON test_int8_h_a USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +EXPLAIN (costs off) +SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; + QUERY PLAN +----------------------------------------------------------- + Aggregate + -> Index Scan using test_int8_h_a_idx on test_int8_h_a + Index Cond: (id < '400'::bigint) +(3 rows) + +SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; + count +------- + 401 +(1 row) + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_a_idx on test_int8_h_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> '400'::bigint) +(4 rows) + +SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_a_idx on test_int8_h_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| '400'::bigint) +(4 rows) + +SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_a_idx on test_int8_h_a + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> '400'::bigint) +(4 rows) + +SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+----------
+ 406 | 6
+ 415 | 15
+ 428 | 28
+ 457 | 57
+ 458 | 58
+(5 rows)
+
+EXPLAIN (costs off)
+SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Sort
+ Sort Key: id
+ -> Index Scan using test_int8_h_a_idx on test_int8_h_a
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint))
+(4 rows)
+
+SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id;
+ id
+-----
+ 16
+ 39
+ 71
+ 135
+ 168
+ 232
+ 252
+ 354
+ 355
+ 371
+(10 rows)
+
+EXPLAIN (costs off)
+SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Sort
+ Sort Key: id
+ -> Index Scan using test_int8_h_a_idx on test_int8_h_a
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint))
+(4 rows)
+
+SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id;
+ id
+-----
+ 406
+ 415
+ 428
+ 457
+ 458
+ 484
+ 496
+(7 rows)
+
+CREATE TABLE test_int8_id_t AS SELECT id::int8, t FROM tsts;
+CREATE INDEX test_int8_id_t_idx ON test_int8_o USING rum
+ (t rum_tsvector_ops, id);
+EXPLAIN (costs off)
+SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8;
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Index Scan using test_int8_h_a_idx on test_int8_h_a
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint))
+ Order By: (id <=> '400'::bigint)
+(3 rows)
+
+SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8;
+ id
+-----
+ 371
+ 355
+ 354
+ 252
+ 232
+ 168
+ 135
+ 71
+ 39
+ 16
+(10 rows)
+
diff --git a/expected/int8_1.out b/expected/int8_1.out
new file mode 100644
index 0000000000..ffced0aaf8
--- /dev/null
+++ b/expected/int8_1.out
@@ -0,0 +1,664 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * int8.out - test output for 64-bit systems and
+ * int8_1.out - test output for 32-bit systems.
+ * + */ +set enable_seqscan=off; +CREATE TABLE test_int8 ( + i int8 +); +INSERT INTO test_int8 VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_int8 ON test_int8 USING rum (i); +SELECT * FROM test_int8 WHERE i<1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int8 WHERE i<=1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int8 WHERE i=1::int8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int8 WHERE i>=1::int8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::int8 FROM test_int8 ORDER BY i <=> 0::int8; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int8 on test_int8 + Order By: (i <=> '0'::bigint) +(2 rows) + +SELECT *, i <=> 0::int8 FROM test_int8 ORDER BY i <=> 0::int8; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT *, i <=> 1::int8 FROM test_int8 WHERE i<1::int8 ORDER BY i <=> 1::int8; + QUERY PLAN +---------------------------------------- + Index Scan using idx_int8 on test_int8 + Index Cond: (i < '1'::bigint) + Order By: (i <=> '1'::bigint) +(3 rows) + +SELECT *, i <=> 1::int8 FROM test_int8 WHERE i<1::int8 ORDER BY i <=> 1::int8; +ERROR: doesn't support order by over pass-by-reference column +CREATE TABLE test_int8_o AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_o_idx ON test_int8_o USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> '400'::bigint) +(4 rows) + +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| '400'::bigint) +(4 rows) + +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +------------------------------------------------------- + Limit + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> '400'::bigint) +(4 rows) + +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_o_idx on test_int8_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_a AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_a_idx ON test_int8_a USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +ERROR: doesn't support order index over pass-by-reference column +EXPLAIN (costs off) +SELECT count(*) FROM test_int8_a WHERE id < 400::int8; + QUERY PLAN +-------------------------------------- + Aggregate + -> Seq Scan on test_int8_a + Filter: (id < '400'::bigint) +(3 rows) + +SELECT count(*) FROM test_int8_a WHERE id < 400::int8; + count +------- + 401 +(1 row) + +EXPLAIN (costs 
off) +SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Sort + Sort Key: ((id <=> '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) + +SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Sort + Sort Key: ((id <=| '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) + +SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Sort + Sort Key: ((id |=> '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) + +SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_int8_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_int8_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_h_o AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_h_o_idx ON test_int8_h_o USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=> '400'::bigint) +(4 rows) + +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id <=| '400'::bigint) +(4 rows) + +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +----------------------------------------------------------- + Limit + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (id |=> '400'::bigint) +(4 rows) + +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_int8_h_o_idx on test_int8_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_h_a AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_h_a_idx ON test_int8_h_a USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +ERROR: doesn't support order index over pass-by-reference column +EXPLAIN (costs off) +SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; + QUERY PLAN +-------------------------------------- + Aggregate + -> Seq Scan on test_int8_h_a + Filter: (id < '400'::bigint) +(3 rows) + +SELECT count(*) FROM test_int8_h_a 
WHERE id < 400::int8; + count +------- + 401 +(1 row) + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Sort + Sort Key: ((id <=> '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) + +SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Sort + Sort Key: ((id <=| '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) + +SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Sort + Sort Key: ((id |=> '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) + +SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_int8_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; + id +----- + 16 + 39 + 71 + 135 + 168 + 232 + 252 + 354 + 355 + 371 +(10 rows) + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_int8_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 +(7 rows) + +CREATE TABLE test_int8_id_t AS SELECT id::int8, t FROM tsts; +CREATE INDEX test_int8_id_t_idx ON test_int8_o USING rum + (t rum_tsvector_ops, id); +EXPLAIN (costs off) +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: ((id <=> '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) + +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8; + id +----- + 371 + 355 + 354 + 252 + 232 + 168 + 135 + 71 + 39 + 16 +(10 rows) + diff --git a/expected/interval.out b/expected/interval.out new file mode 100644 index 0000000000..5d70931d0a --- /dev/null +++ b/expected/interval.out @@ -0,0 +1,51 @@ +set enable_seqscan=off; +CREATE TABLE test_interval ( + i interval +); +INSERT INTO test_interval VALUES 
+ ( '03:55:08' ),
+ ( '04:55:08' ),
+ ( '05:55:08' ),
+ ( '08:55:08' ),
+ ( '09:55:08' ),
+ ( '10:55:08' )
+;
+CREATE INDEX idx_interval ON test_interval USING rum (i);
+SELECT * FROM test_interval WHERE i<'08:55:08'::interval ORDER BY i;
+ i
+--------------------------
+ @ 3 hours 55 mins 8 secs
+ @ 4 hours 55 mins 8 secs
+ @ 5 hours 55 mins 8 secs
+(3 rows)
+
+SELECT * FROM test_interval WHERE i<='08:55:08'::interval ORDER BY i;
+ i
+--------------------------
+ @ 3 hours 55 mins 8 secs
+ @ 4 hours 55 mins 8 secs
+ @ 5 hours 55 mins 8 secs
+ @ 8 hours 55 mins 8 secs
+(4 rows)
+
+SELECT * FROM test_interval WHERE i='08:55:08'::interval ORDER BY i;
+ i
+--------------------------
+ @ 8 hours 55 mins 8 secs
+(1 row)
+
+SELECT * FROM test_interval WHERE i>='08:55:08'::interval ORDER BY i;
+ i
+---------------------------
+ @ 8 hours 55 mins 8 secs
+ @ 9 hours 55 mins 8 secs
+ @ 10 hours 55 mins 8 secs
+(3 rows)
+
+SELECT * FROM test_interval WHERE i>'08:55:08'::interval ORDER BY i;
+ i
+---------------------------
+ @ 9 hours 55 mins 8 secs
+ @ 10 hours 55 mins 8 secs
+(2 rows)
+
diff --git a/expected/macaddr.out b/expected/macaddr.out
new file mode 100644
index 0000000000..bc9f393980
--- /dev/null
+++ b/expected/macaddr.out
@@ -0,0 +1,51 @@
+set enable_seqscan=off;
+CREATE TABLE test_macaddr (
+ i macaddr
+);
+INSERT INTO test_macaddr VALUES
+ ( '22:00:5c:03:55:08' ),
+ ( '22:00:5c:04:55:08' ),
+ ( '22:00:5c:05:55:08' ),
+ ( '22:00:5c:08:55:08' ),
+ ( '22:00:5c:09:55:08' ),
+ ( '22:00:5c:10:55:08' )
+;
+CREATE INDEX idx_macaddr ON test_macaddr USING rum (i);
+SELECT * FROM test_macaddr WHERE i<'22:00:5c:08:55:08'::macaddr ORDER BY i;
+ i
+-------------------
+ 22:00:5c:03:55:08
+ 22:00:5c:04:55:08
+ 22:00:5c:05:55:08
+(3 rows)
+
+SELECT * FROM test_macaddr WHERE i<='22:00:5c:08:55:08'::macaddr ORDER BY i;
+ i
+-------------------
+ 22:00:5c:03:55:08
+ 22:00:5c:04:55:08
+ 22:00:5c:05:55:08
+ 22:00:5c:08:55:08
+(4 rows)
+
+SELECT * FROM test_macaddr WHERE i='22:00:5c:08:55:08'::macaddr ORDER BY i;
+ i
+-------------------
+ 22:00:5c:08:55:08
+(1 row)
+
+SELECT * FROM test_macaddr WHERE i>='22:00:5c:08:55:08'::macaddr ORDER BY i;
+ i
+-------------------
+ 22:00:5c:08:55:08
+ 22:00:5c:09:55:08
+ 22:00:5c:10:55:08
+(3 rows)
+
+SELECT * FROM test_macaddr WHERE i>'22:00:5c:08:55:08'::macaddr ORDER BY i;
+ i
+-------------------
+ 22:00:5c:09:55:08
+ 22:00:5c:10:55:08
+(2 rows)
+
diff --git a/expected/money.out b/expected/money.out
new file mode 100644
index 0000000000..b2e9bac41d
--- /dev/null
+++ b/expected/money.out
@@ -0,0 +1,89 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * money.out - test output for 64-bit systems and
+ * money_1.out - test output for 32-bit systems.
+ *
+ */
+set enable_seqscan=off;
+CREATE TABLE test_money (
+ i money
+);
+INSERT INTO test_money VALUES ('-2'),('-1'),('0'),('1'),('2'),('3');
+CREATE INDEX idx_money ON test_money USING rum (i);
+SELECT * FROM test_money WHERE i<'1'::money ORDER BY i;
+ i
+--------
+ -$2.00
+ -$1.00
+ $0.00
+(3 rows)
+
+SELECT * FROM test_money WHERE i<='1'::money ORDER BY i;
+ i
+--------
+ -$2.00
+ -$1.00
+ $0.00
+ $1.00
+(4 rows)
+
+SELECT * FROM test_money WHERE i='1'::money ORDER BY i;
+ i
+-------
+ $1.00
+(1 row)
+
+SELECT * FROM test_money WHERE i>='1'::money ORDER BY i;
+ i
+-------
+ $1.00
+ $2.00
+ $3.00
+(3 rows)
+
+SELECT * FROM test_money WHERE i>'1'::money ORDER BY i;
+ i
+-------
+ $2.00
+ $3.00
+(2 rows)
+
+EXPLAIN (costs off)
+SELECT *, i <=> 0::money FROM test_money ORDER BY i <=> 0::money;
+ QUERY PLAN
+------------------------------------------
+ Index Scan using idx_money on test_money
+ Order By: (i <=> (0)::money)
+(2 rows)
+
+SELECT *, i <=> 0::money FROM test_money ORDER BY i <=> 0::money;
+ i | ?column?
+--------+----------
+ $0.00 | 0
+ -$1.00 | 100
+ $1.00 | 100
+ -$2.00 | 200
+ $2.00 | 200
+ $3.00 | 300
+(6 rows)
+
+EXPLAIN (costs off)
+SELECT *, i <=> 1::money FROM test_money WHERE i<1::money ORDER BY i <=> 1::money;
+ QUERY PLAN
+------------------------------------------
+ Index Scan using idx_money on test_money
+ Index Cond: (i < (1)::money)
+ Order By: (i <=> (1)::money)
+(3 rows)
+
+SELECT *, i <=> 1::money FROM test_money WHERE i<1::money ORDER BY i <=> 1::money;
+ i | ?column?
+--------+----------
+ $0.00 | 100
+ -$1.00 | 200
+ -$2.00 | 300
+(3 rows)
+
diff --git a/expected/money_1.out b/expected/money_1.out
new file mode 100644
index 0000000000..6a3fa8c211
--- /dev/null
+++ b/expected/money_1.out
@@ -0,0 +1,74 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * money.out - test output for 64-bit systems and
+ * money_1.out - test output for 32-bit systems.
+ * + */ +set enable_seqscan=off; +CREATE TABLE test_money ( + i money +); +INSERT INTO test_money VALUES ('-2'),('-1'),('0'),('1'),('2'),('3'); +CREATE INDEX idx_money ON test_money USING rum (i); +SELECT * FROM test_money WHERE i<'1'::money ORDER BY i; + i +-------- + -$2.00 + -$1.00 + $0.00 +(3 rows) + +SELECT * FROM test_money WHERE i<='1'::money ORDER BY i; + i +-------- + -$2.00 + -$1.00 + $0.00 + $1.00 +(4 rows) + +SELECT * FROM test_money WHERE i='1'::money ORDER BY i; + i +------- + $1.00 +(1 row) + +SELECT * FROM test_money WHERE i>='1'::money ORDER BY i; + i +------- + $1.00 + $2.00 + $3.00 +(3 rows) + +SELECT * FROM test_money WHERE i>'1'::money ORDER BY i; + i +------- + $2.00 + $3.00 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::money FROM test_money ORDER BY i <=> 0::money; + QUERY PLAN +------------------------------------------ + Index Scan using idx_money on test_money + Order By: (i <=> (0)::money) +(2 rows) + +SELECT *, i <=> 0::money FROM test_money ORDER BY i <=> 0::money; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT *, i <=> 1::money FROM test_money WHERE i<1::money ORDER BY i <=> 1::money; + QUERY PLAN +------------------------------------------ + Index Scan using idx_money on test_money + Index Cond: (i < (1)::money) + Order By: (i <=> (1)::money) +(3 rows) + +SELECT *, i <=> 1::money FROM test_money WHERE i<1::money ORDER BY i <=> 1::money; +ERROR: doesn't support order by over pass-by-reference column diff --git a/expected/numeric.out b/expected/numeric.out new file mode 100644 index 0000000000..0bda4fd2b6 --- /dev/null +++ b/expected/numeric.out @@ -0,0 +1,44 @@ +set enable_seqscan=off; +CREATE TABLE test_numeric ( + i numeric +); +INSERT INTO test_numeric VALUES (-2),(-1),(0),(1),(2),(3); +CREATE INDEX idx_numeric ON test_numeric USING rum (i); +SELECT * FROM test_numeric WHERE i<'1'::numeric ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_numeric WHERE i<='1'::numeric ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_numeric WHERE i='1'::numeric ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_numeric WHERE i>='1'::numeric ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_numeric WHERE i>'1'::numeric ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/expected/oid.out b/expected/oid.out new file mode 100644 index 0000000000..f1b2e5d566 --- /dev/null +++ b/expected/oid.out @@ -0,0 +1,78 @@ +set enable_seqscan=off; +CREATE TABLE test_oid ( + i oid +); +INSERT INTO test_oid VALUES (0),(1),(2),(3),(4),(5); +CREATE INDEX idx_oid ON test_oid USING rum (i); +SELECT * FROM test_oid WHERE i<3::oid ORDER BY i; + i +--- + 0 + 1 + 2 +(3 rows) + +SELECT * FROM test_oid WHERE i<=3::oid ORDER BY i; + i +--- + 0 + 1 + 2 + 3 +(4 rows) + +SELECT * FROM test_oid WHERE i=3::oid ORDER BY i; + i +--- + 3 +(1 row) + +SELECT * FROM test_oid WHERE i>=3::oid ORDER BY i; + i +--- + 3 + 4 + 5 +(3 rows) + +SELECT * FROM test_oid WHERE i>3::oid ORDER BY i; + i +--- + 4 + 5 +(2 rows) + +EXPLAIN (costs off) +SELECT *, i <=> 0::oid FROM test_oid ORDER BY i <=> 0::oid; + QUERY PLAN +-------------------------------------- + Index Scan using idx_oid on test_oid + Order By: (i <=> '0'::oid) +(2 rows) + +SELECT *, i <=> 0::oid FROM test_oid ORDER BY i <=> 0::oid; + i | ?column? 
+---+----------
+ 0 | 0
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+(6 rows)
+
+EXPLAIN (costs off)
+SELECT *, i <=> 1::oid FROM test_oid WHERE i<1::oid ORDER BY i <=> 1::oid;
+ QUERY PLAN
+--------------------------------------
+ Index Scan using idx_oid on test_oid
+ Index Cond: (i < '1'::oid)
+ Order By: (i <=> '1'::oid)
+(3 rows)
+
+SELECT *, i <=> 1::oid FROM test_oid WHERE i<1::oid ORDER BY i <=> 1::oid;
+ i | ?column?
+---+----------
+ 0 | 1
+(1 row)
+
diff --git a/expected/orderby.out b/expected/orderby.out
index 5505d3430f..07ae7322ed 100644
--- a/expected/orderby.out
+++ b/expected/orderby.out
@@ -1,9 +1,92 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * orderby.out - test output for 64-bit systems and
+ * orderby_1.out - test output for 32-bit systems.
+ *
+ */
 CREATE TABLE tsts (id int, t tsvector, d timestamp);
 \copy tsts from 'data/tsts.data'
-CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_timestamp_ops, d)
+CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d)
 WITH (attach = 'd', to = 't');
-INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724');
-INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724');
+INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724');
+INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724');
+SET enable_indexscan=OFF;
+SET enable_indexonlyscan=OFF;
+SET enable_bitmapscan=OFF;
+SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d | ?column?
+-----+---------------------------------+---------------
+ 355 | Mon May 16 14:21:22.326724 2016 | 2.673276
+ 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276
+ 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724
+ 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724
+ 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724
+(5 rows)
+
+SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5;
+ id | d | ?column?
+-----+---------------------------------+---------------
+ 355 | Mon May 16 14:21:22.326724 2016 | 2.673276
+ 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276
+ 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276
+ 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276
+ 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276
+(5 rows)
+
+SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d | ?column?
+-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(9 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(8 rows) + +-- Test bitmap index scan +RESET enable_bitmapscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count ------- @@ -40,9 +123,19 @@ SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; 39 (1 row) -SET enable_indexscan=OFF; -SET enable_indexonlyscan=OFF; -SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -53,6 +146,19 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- @@ -63,6 +169,19 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -73,6 +192,37 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 + 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 +(5 rows) + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -87,6 +237,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -100,20 +262,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= 
'2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; -SET enable_seqscan = off; +SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on tsts - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on tsts_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count @@ -277,54 +437,6 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SET enable_bitmapscan=OFF; -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 16 | Mon May 02 11:21:22.326724 2016 - 39 | Tue May 03 10:21:22.326724 2016 - 71 | Wed May 04 18:21:22.326724 2016 - 135 | Sat May 07 10:21:22.326724 2016 - 168 | Sun May 08 19:21:22.326724 2016 - 232 | Wed May 11 11:21:22.326724 2016 - 252 | Thu May 12 07:21:22.326724 2016 - 354 | Mon May 16 13:21:22.326724 2016 - 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) - -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 371 | Tue May 17 06:21:22.326724 2016 - 406 | Wed May 18 17:21:22.326724 2016 - 415 | Thu May 19 02:21:22.326724 2016 - 428 | Thu May 19 15:21:22.326724 2016 - 457 | Fri May 20 20:21:22.326724 2016 - 458 | Fri May 20 21:21:22.326724 2016 - 484 | Sat May 21 23:21:22.326724 2016 - 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) - SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- @@ -357,4 +469,78 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 458 | Fri May 20 21:21:22.326724 2016 (3 rows) -DROP TABLE tsts CASCADE; +-- Test "ORDER BY" error message +DROP INDEX tsts_idx; +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d); +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: cannot order without attribute 2 in ORDER BY clause +-- Test multicolumn index +RESET enable_indexscan; +RESET enable_indexonlyscan; +RESET enable_bitmapscan; +SET enable_seqscan = off; +DROP 
INDEX tsts_idx;
+CREATE INDEX tsts_id_idx ON tsts USING rum (t rum_tsvector_addon_ops, id, d)
+ WITH (attach = 'd', to = 't');
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Limit
+ -> Index Scan using tsts_id_idx on tsts
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 1))
+ Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d
+----+---
+(0 rows)
+
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Limit
+ -> Index Scan using tsts_id_idx on tsts
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 355))
+ Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d
+-----+---------------------------------
+ 355 | Mon May 16 14:21:22.326724 2016
+(1 row)
+
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ QUERY PLAN
+----------------------------------------------------------------------------------------------------------------------------------
+ Limit
+ -> Index Scan using tsts_id_idx on tsts
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Wed May 11 11:21:22.326724 2016'::timestamp without time zone))
+ Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d
+-----+---------------------------------
+ 232 | Wed May 11 11:21:22.326724 2016
+(1 row)
+
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ QUERY PLAN
+---------------------------------------------------------------------------------------------------------------------------
+ Limit
+ -> Index Scan using tsts_id_idx on tsts
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Mon May 01 00:00:00 2000'::timestamp without time zone))
+ Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d
+----+---
+(0 rows)
+
diff --git a/expected/orderby_1.out b/expected/orderby_1.out
index f88c18d958..cdd536ac9d 100644
--- a/expected/orderby_1.out
+++ b/expected/orderby_1.out
@@ -1,46 +1,18 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * orderby.out - test output for 64-bit systems and
+ * orderby_1.out - test output for 32-bit systems.
+ * + */ CREATE TABLE tsts (id int, t tsvector, d timestamp); \copy tsts from 'data/tsts.data' -CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_timestamp_ops, d) +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't'); -ERROR: currently, RUM doesn't support order by over pass-by-reference column -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - count -------- - 158 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; - count -------- - 17 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; - count -------- - 6 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; - count -------- - 98 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; - count -------- - 23 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; - count -------- - 39 -(1 row) - +INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; @@ -101,18 +73,19 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; +-- Test bitmap index scan RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - QUERY PLAN ---------------------------------------------------- + QUERY PLAN +------------------------------------------------------------- Aggregate - -> Seq Scan on tsts - Filter: (t @@ '''wr'' | ''qh'''::tsquery) -(3 rows) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count @@ -157,9 +130,11 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY Limit -> Sort Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts - Filter: (t @@ '''wr'' & ''qh'''::tsquery) -(5 rows) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? @@ -178,9 +153,11 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY Limit -> Sort Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts - Filter: (t @@ '''wr'' & ''qh'''::tsquery) -(5 rows) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
@@ -199,9 +176,11 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY Limit -> Sort Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts - Filter: (t @@ '''wr'' & ''qh'''::tsquery) -(5 rows) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? @@ -215,33 +194,26 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN -------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------- Limit - -> Sort - Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts -(4 rows) + -> Index Scan using tsts_idx on tsts + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 - 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 -(5 rows) - +ERROR: doesn't support order by over pass-by-reference column EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d @@ -259,13 +231,15 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: 
((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d @@ -280,15 +254,110 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan +RESET enable_indexscan; +RESET enable_indexonlyscan; SET enable_bitmapscan=OFF; EXPLAIN (costs off) +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) + +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; + count +------- + 158 +(1 row) + +SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; + count +------- + 17 +(1 row) + +SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; + count +------- + 6 +(1 row) + +SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; + count +------- + 98 +(1 row) + +SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; + count +------- + 23 +(1 row) + +SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; + count +------- + 39 +(1 row) + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) SELECT id, d 
FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Index Scan using tsts_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; @@ -307,12 +376,12 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Index Scan using tsts_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; @@ -360,4 +429,64 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 458 | Fri May 20 21:21:22.326724 2016 (3 rows) -DROP TABLE tsts CASCADE; +-- Test "ORDER BY" error message +DROP INDEX tsts_idx; +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d); +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +-- Test multicolumn index +RESET enable_indexscan; +RESET enable_indexonlyscan; +RESET enable_bitmapscan; +SET enable_seqscan = off; +DROP INDEX tsts_idx; +CREATE INDEX tsts_id_idx ON tsts USING rum (t rum_tsvector_addon_ops, id, d) + WITH (attach = 'd', to = 't'); +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_id_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 1)) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_id_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 355)) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over 
pass-by-reference column
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ QUERY PLAN
+----------------------------------------------------------------------------------------------------------------------------------
+ Limit
+ -> Index Scan using tsts_id_idx on tsts
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Wed May 11 11:21:22.326724 2016'::timestamp without time zone))
+ Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ERROR: doesn't support order by over pass-by-reference column
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ QUERY PLAN
+---------------------------------------------------------------------------------------------------------------------------
+ Limit
+ -> Index Scan using tsts_id_idx on tsts
+ Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Mon May 01 00:00:00 2000'::timestamp without time zone))
+ Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ERROR: doesn't support order by over pass-by-reference column
diff --git a/expected/orderby_hash.out b/expected/orderby_hash.out
index c8550cea91..782ad5700e 100644
--- a/expected/orderby_hash.out
+++ b/expected/orderby_hash.out
@@ -1,49 +1,142 @@
-CREATE TABLE tsts (id int, t tsvector, d timestamp);
-\copy tsts from 'data/tsts.data'
-CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_hash_timestamp_ops, d)
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * orderby_hash.out - test output for 64-bit systems and
+ * orderby_hash_1.out - test output for 32-bit systems.
+ *
+ */
+CREATE TABLE tstsh (id int, t tsvector, d timestamp);
+\copy tstsh from 'data/tsts.data'
+CREATE INDEX tstsh_idx ON tstsh USING rum (t rum_tsvector_hash_addon_ops, d)
 WITH (attach = 'd', to = 't');
-INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724');
-INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724');
-SELECT count(*) FROM tsts WHERE t @@ 'wr|qh';
+INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724');
+INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724');
+SET enable_indexscan=OFF;
+SET enable_indexonlyscan=OFF;
+SET enable_bitmapscan=OFF;
+SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d | ?column?
+-----+---------------------------------+---------------
+ 355 | Mon May 16 14:21:22.326724 2016 | 2.673276
+ 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276
+ 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724
+ 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724
+ 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724
+(5 rows)
+
+SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5;
+ id | d | ?column?
+-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 + 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 + 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 +(5 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(9 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(8 rows) + +-- Test bitmap index scan +RESET enable_bitmapscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 (1 row) -SET enable_indexscan=OFF; -SET enable_indexonlyscan=OFF; -SET enable_bitmapscan=OFF; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + 
-> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -53,7 +146,20 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -63,7 +169,20 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -73,7 +192,38 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 + 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 +(5 rows) + +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -87,7 +237,19 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -100,68 +262,66 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; -SET enable_seqscan = off; +SET enable_bitmapscan=OFF; EXPLAIN (costs off) -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on tsts - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on tsts_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) 
-SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 (1 row) EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using tsts_idx on tsts + -> Index Scan using tstsh_idx on tstsh Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) (4 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -172,16 +332,16 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using tsts_idx on tsts + -> Index Scan using tstsh_idx on tstsh Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) Order By: (d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone) (4 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -192,16 +352,16 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using tsts_idx on tsts + -> Index Scan using tstsh_idx on tstsh Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) Order By: (d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) (4 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -212,15 +372,15 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Scan using tsts_idx on tsts + -> Index Scan using tstsh_idx on tstsh Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) (3 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -231,16 +391,16 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 1 (5 rows) EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Index Scan using tsts_idx on tsts + -> Index Scan using tstsh_idx on tstsh Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -255,16 +415,16 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER (9 rows) EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Index Scan using tsts_idx on tsts + -> Index Scan using tstsh_idx on tstsh Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -277,55 +437,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SET enable_bitmapscan=OFF; -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY 
d; - id | d ------+--------------------------------- - 16 | Mon May 02 11:21:22.326724 2016 - 39 | Tue May 03 10:21:22.326724 2016 - 71 | Wed May 04 18:21:22.326724 2016 - 135 | Sat May 07 10:21:22.326724 2016 - 168 | Sun May 08 19:21:22.326724 2016 - 232 | Wed May 11 11:21:22.326724 2016 - 252 | Thu May 12 07:21:22.326724 2016 - 354 | Mon May 16 13:21:22.326724 2016 - 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) - -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 371 | Tue May 17 06:21:22.326724 2016 - 406 | Wed May 18 17:21:22.326724 2016 - 415 | Thu May 19 02:21:22.326724 2016 - 428 | Thu May 19 15:21:22.326724 2016 - 457 | Fri May 20 20:21:22.326724 2016 - 458 | Fri May 20 21:21:22.326724 2016 - 484 | Sat May 21 23:21:22.326724 2016 - 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -333,7 +445,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 71 | Wed May 04 18:21:22.326724 2016 (3 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; id | d -----+--------------------------------- 355 | Mon May 16 14:21:22.326724 2016 @@ -341,7 +453,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 252 | Thu May 12 07:21:22.326724 2016 (3 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -349,7 +461,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 415 | Thu May 19 02:21:22.326724 2016 (3 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; id | d -----+--------------------------------- 496 | Sun May 22 11:21:22.326724 2016 @@ -357,4 +469,73 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 458 | Fri May 20 21:21:22.326724 2016 (3 rows) -DROP TABLE tsts CASCADE; +-- Test multicolumn index +RESET enable_indexscan; +RESET enable_indexonlyscan; +RESET enable_bitmapscan; +SET enable_seqscan = off; +DROP INDEX tstsh_idx; +CREATE INDEX tstsh_id_idx ON tsts USING rum (t rum_tsvector_addon_ops, id, d) + WITH (attach = 'd', to = 't'); +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN 
+-----------------------------------------------------------------------------------
+ Limit
+   ->  Index Scan using tstsh_id_idx on tsts
+         Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 1))
+         Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d 
+----+---
+(0 rows)
+
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+                                     QUERY PLAN                                      
+-----------------------------------------------------------------------------------
+ Limit
+   ->  Index Scan using tstsh_id_idx on tsts
+         Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 355))
+         Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id  |                d                
+-----+---------------------------------
+ 355 | Mon May 16 14:21:22.326724 2016
+(1 row)
+
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+                                                             QUERY PLAN                                                             
+------------------------------------------------------------------------------------------------------------------------------------
+ Limit
+   ->  Index Scan using tstsh_id_idx on tsts
+         Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Wed May 11 11:21:22.326724 2016'::timestamp without time zone))
+         Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id  |                d                
+-----+---------------------------------
+ 232 | Wed May 11 11:21:22.326724 2016
+(1 row)
+
+EXPLAIN (costs off)
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+                                                         QUERY PLAN                                                          
+---------------------------------------------------------------------------------------------------------------------------
+ Limit
+   ->  Index Scan using tstsh_id_idx on tsts
+         Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Mon May 01 00:00:00 2000'::timestamp without time zone))
+         Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)
+(4 rows)
+
+SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+ id | d 
+----+---
+(0 rows)
+
diff --git a/expected/orderby_hash_1.out b/expected/orderby_hash_1.out
index 562388d4f0..f19e4507c7 100644
--- a/expected/orderby_hash_1.out
+++ b/expected/orderby_hash_1.out
@@ -1,50 +1,22 @@
-CREATE TABLE tsts (id int, t tsvector, d timestamp);
-\copy tsts from 'data/tsts.data'
-CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_hash_timestamp_ops, d)
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * orderby_hash.out - test output for 64-bit systems and
+ * orderby_hash_1.out - test output for 32-bit systems.
+ * + */ +CREATE TABLE tstsh (id int, t tsvector, d timestamp); +\copy tstsh from 'data/tsts.data' +CREATE INDEX tstsh_idx ON tstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't'); -ERROR: currently, RUM doesn't support order by over pass-by-reference column -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - count -------- - 158 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; - count -------- - 17 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; - count -------- - 6 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; - count -------- - 98 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; - count -------- - 23 -(1 row) - -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; - count -------- - 39 -(1 row) - +INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -54,7 +26,7 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -64,7 +36,7 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -74,7 +46,7 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -88,7 +60,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -101,67 +73,70 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; +-- Test bitmap index scan RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - QUERY PLAN ---------------------------------------------------- +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- Aggregate - -> Seq Scan on tsts - Filter: (t @@ '''wr'' | ''qh'''::tsquery) -(3 rows) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count ------- 158 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; count ------- 17 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; count ------- 6 (1 row) -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; count ------- 98 (1 row) -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- 23 (1 row) -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- 39 (1 row) EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts - Filter: (t @@ '''wr'' & ''qh'''::tsquery) -(5 rows) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -172,17 +147,19 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts - Filter: (t @@ '''wr'' & ''qh'''::tsquery) -(5 rows) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 @@ -193,17 +170,19 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN ------------------------------------------------------------------------------------- Limit -> Sort Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts - Filter: (t @@ '''wr'' & ''qh'''::tsquery) -(5 rows) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 @@ -214,36 +193,29 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN -------------------------------------------------------------------------------------- +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- Limit - -> Sort - Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) - -> Seq Scan on tsts -(4 rows) - -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? 
------+---------------------------------+------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 - 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 -(5 rows) + -> Index Scan using tstsh_idx on tstsh + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -258,16 +230,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER (9 rows) EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -280,18 +254,113 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan +RESET enable_indexscan; +RESET enable_indexonlyscan; SET enable_bitmapscan=OFF; EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN 
------------------------------------------------------------------------------------------------------------------------- +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) + +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + count +------- + 158 +(1 row) + +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; + count +------- + 17 +(1 row) + +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; + count +------- + 6 +(1 row) + +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; + count +------- + 98 +(1 row) + +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; + count +------- + 23 +(1 row) + +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; + count +------- + 39 +(1 row) + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) + Order By: (d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 
2016'::timestamp without time zone)) + -> Index Scan using tstsh_idx on tstsh + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -306,16 +375,16 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER (9 rows) EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- Sort Sort Key: d - -> Seq Scan on tsts - Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Index Scan using tstsh_idx on tstsh + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -328,7 +397,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 @@ -336,7 +405,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 71 | Wed May 04 18:21:22.326724 2016 (3 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; id | d -----+--------------------------------- 355 | Mon May 16 14:21:22.326724 2016 @@ -344,7 +413,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 252 | Thu May 12 07:21:22.326724 2016 (3 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 @@ -352,7 +421,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 415 | Thu May 19 02:21:22.326724 2016 (3 rows) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; id | d -----+--------------------------------- 496 | Sun May 22 11:21:22.326724 2016 @@ -360,4 +429,59 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 458 | Fri May 20 21:21:22.326724 2016 (3 rows) -DROP TABLE tsts CASCADE; +-- Test multicolumn index +RESET enable_indexscan; 
+RESET enable_indexonlyscan; +RESET enable_bitmapscan; +SET enable_seqscan = off; +DROP INDEX tstsh_idx; +CREATE INDEX tstsh_id_idx ON tsts USING rum (t rum_tsvector_addon_ops, id, d) + WITH (attach = 'd', to = 't'); +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_id_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 1)) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_id_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id = 355)) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_id_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Wed May 11 11:21:22.326724 2016'::timestamp without time zone)) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_id_idx on tsts + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d = 'Mon May 01 00:00:00 2000'::timestamp without time zone)) + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(4 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column diff --git a/expected/predicate-rum-2.out b/expected/predicate-rum-2.out new file mode 100644 index 0000000000..cc4720c052 --- /dev/null +++ b/expected/predicate-rum-2.out @@ -0,0 +1,521 @@ +Parsed test spec with 2 sessions + +starting permutation: rxy1 wx1 c1 rxy2 wy2 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 
'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; + +starting permutation: rxy1 wx1 rxy2 c1 wy2 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step c1: COMMIT; +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; + +starting permutation: rxy1 wx1 rxy2 wy2 c1 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy1 wx1 rxy2 wy2 c2 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 
'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy1 rxy2 wx1 c1 wy2 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; + +starting permutation: rxy1 rxy2 wx1 wy2 c1 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c1: COMMIT; +step c2: 
COMMIT; + +starting permutation: rxy1 rxy2 wx1 wy2 c2 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy1 rxy2 wy2 wx1 c1 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy1 rxy2 wy2 wx1 c2 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 
+280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy1 rxy2 wy2 c2 wx1 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; + +starting permutation: rxy2 rxy1 wx1 c1 wy2 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; + +starting permutation: rxy2 rxy1 wx1 wy2 c1 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 
'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy2 rxy1 wx1 wy2 c2 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy2 rxy1 wy2 wx1 c1 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy2 rxy1 wy2 wx1 c2 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: 
SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy2 rxy1 wy2 c2 wx1 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; + +starting permutation: rxy2 wy2 rxy1 wx1 c1 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy2 wy2 rxy1 wx1 c2 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 
'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy2 wy2 rxy1 c2 wx1 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step c2: COMMIT; +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; + +starting permutation: rxy2 wy2 c2 rxy1 wx1 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('xz'); +step c2: COMMIT; +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('ab'); +step c1: COMMIT; diff --git a/expected/predicate-rum.out b/expected/predicate-rum.out new file mode 100644 index 0000000000..86071a3c7a --- /dev/null +++ 
b/expected/predicate-rum.out @@ -0,0 +1,523 @@ +Parsed test spec with 2 sessions + +starting permutation: rxy1 wx1 c1 rxy2 wy2 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +339|'qh' +(6 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; + +starting permutation: rxy1 wx1 rxy2 c1 wy2 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step c1: COMMIT; +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; + +starting permutation: rxy1 wx1 rxy2 wy2 c1 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 
'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy1 wx1 rxy2 wy2 c2 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy1 rxy2 wx1 c1 wy2 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; + +starting permutation: rxy1 rxy2 wx1 wy2 c1 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv 
+---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy1 rxy2 wx1 wy2 c2 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy1 rxy2 wy2 wx1 c1 c2 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy1 rxy2 wy2 wx1 c2 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 
'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy1 rxy2 wy2 c2 wx1 c1 +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; + +starting permutation: rxy2 rxy1 wx1 c1 wy2 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; + +starting permutation: rxy2 rxy1 wx1 wy2 c1 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 
'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy2 rxy1 wx1 wy2 c2 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy2 rxy1 wy2 wx1 c1 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; +step c2: 
COMMIT; + +starting permutation: rxy2 rxy1 wy2 wx1 c2 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy2 rxy1 wy2 c2 wx1 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; + +starting permutation: rxy2 wy2 rxy1 wx1 c1 c2 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 
'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; +step c2: COMMIT; + +starting permutation: rxy2 wy2 rxy1 wx1 c2 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c2: COMMIT; +step c1: COMMIT; + +starting permutation: rxy2 wy2 rxy1 c2 wx1 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + +step c2: COMMIT; +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; + +starting permutation: rxy2 wy2 c2 rxy1 wx1 c1 +step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) + +step wy2: INSERT INTO rum_tbl(tsv) values('hx'); +step c2: COMMIT; +step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 
'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +339|'hx' +(6 rows) + +step wx1: INSERT INTO rum_tbl(tsv) values('qh'); +step c1: COMMIT; diff --git a/expected/rum.out b/expected/rum.out index 5e0774b975..5966d196fe 100644 --- a/expected/rum.out +++ b/expected/rum.out @@ -4,10 +4,24 @@ CREATE TRIGGER tsvectorupdate BEFORE UPDATE OR INSERT ON test_rum FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger('a', 'pg_catalog.english', 't'); CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_ops); +-- Check empty table using index scan +SELECT + a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + * + FROM test_rum + ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; + ?column? | rum_ts_distance | rum_ts_score | t | a +----------+-----------------+--------------+---+--- +(0 rows) + +-- Fill the table with data \copy test_rum(t) from 'data/rum.data'; -CREATE INDEX failed_rumidx ON test_rum USING rum (a rum_tsvector_timestamp_ops); +CREATE INDEX failed_rumidx ON test_rum USING rum (a rum_tsvector_addon_ops); ERROR: additional information attribute "a" is not found in index SET enable_seqscan=off; +SET enable_indexscan=off; explain (costs off) SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); QUERY PLAN @@ -35,12 +49,14 @@ ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); explain (costs off) SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Aggregate - -> Index Scan using rumidx on test_rum - Index Cond: (a @@ '''def'' <-> ''fgr'''::tsquery) -(3 rows) + -> Bitmap Heap Scan on test_rum + Recheck Cond: (a @@ '''def'' <-> ''fgr'''::tsquery) + -> Bitmap Index Scan on rumidx + Index Cond: (a @@ '''def'' <-> ''fgr'''::tsquery) +(5 rows) SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); count @@ -116,61 +132,69 @@ SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 1 (1 row) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), * +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); - rum_ts_distance | t | a ------------------+--------------------------------------------------------------------------+--------------------------------------------------------------- - 16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 - 16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 16.4493 | so well that only a fragment, as it were, gave way. 
It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 - 16.4493 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+--------------------------------------------------------------------------+--------------------------------------------------------------- + 16.4493 | 0.0607927 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 + 16.4493 | 0.0607927 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 16.4493 | 0.0607927 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 + 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), * +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); - rum_ts_distance | t | a ------------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), * FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; - ?column? | rum_ts_distance | t | a + distance | rum_ts_distance | t | a ----------+-----------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 8.2247 | 8.2247 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 57.5727 | 57.5727 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), * +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); - rum_ts_distance | t | a ------------------+--------------------------------------------------------------------------+--------------------------------------------------------------- - 16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 - 16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 16.4493 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 - 16.4493 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+--------------------------------------------------------------------------+--------------------------------------------------------------- + 16.4493 | 0.0607927 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 + 16.4493 | 0.0607927 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 16.4493 | 0.0607927 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 + 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), * +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); - rum_ts_distance | t | a ------------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) INSERT INTO test_rum (t) VALUES ('foo bar foo the over foo qq bar'); @@ -208,65 +232,71 @@ SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER (1 row) -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); - ?column? +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; + distance ---------- 16.4493 16.4493 - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 (56 rows) CREATE TABLE tst (i int4, t tsvector); @@ -288,6 +318,7 @@ DELETE FROM tst WHERE i = 5; VACUUM tst; INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i; set enable_bitmapscan=off; +SET enable_indexscan=on; explain (costs off) SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * FROM test_rum @@ -300,15 +331,15 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * Order By: (a <=> '''w'':*'::tsquery) (3 rows) -SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'w:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); - ?column? | t | a + distance | t | a ----------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------- - 8.22467 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 - 8.22467 | wine, but wouldn't you divide with your neighbors! The columns in the | 'column':11 'divid':6 'neighbor':9 'wine':1 'wouldn':3 - 8.22467 | not say, but you wrote as if you knew it by sight as well as by heart. | 'heart':17 'knew':9 'say':2 'sight':12 'well':14 'wrote':5 + 8.2247 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 + 8.2247 | wine, but wouldn't you divide with your neighbors! The columns in the | 'column':11 'divid':6 'neighbor':9 'wine':1 'wouldn':3 + 8.2247 | not say, but you wrote as if you knew it by sight as well as by heart. | 'heart':17 'knew':9 'say':2 'sight':12 'well':14 'wrote':5 16.4493 | little series of pictures. Have you ever been here, I wonder? 
You did | 'ever':7 'littl':1 'pictur':4 'seri':2 'wonder':11 16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 16.4493 | _berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it | 'berg':1 'jettenhuhl':3 'konigestuhl':9 'look':10 'spur':6 'wood':5 @@ -322,16 +353,16 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * 16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 (14 rows) -SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'b:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'b:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*'); - ?column? | t | a + distance | t | a ----------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------- - 8.22467 | been trying my best to get all those "passes" into my brain. Now, thanks | 'best':4 'brain':12 'get':6 'pass':9 'thank':14 'tri':2 - 8.22467 | All the above information, I beg you to believe, I do not intend you | 'beg':6 'believ':9 'inform':4 'intend':13 - 8.22467 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6 - 8.22467 | oaks, limes and maples, bordered with flower-beds and shrubberies, and | 'bed':9 'border':5 'flower':8 'flower-b':7 'lime':2 'mapl':4 'oak':1 'shrubberi':11 + 8.2247 | been trying my best to get all those "passes" into my brain. Now, thanks | 'best':4 'brain':12 'get':6 'pass':9 'thank':14 'tri':2 + 8.2247 | All the above information, I beg you to believe, I do not intend you | 'beg':6 'believ':9 'inform':4 'intend':13 + 8.2247 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6 + 8.2247 | oaks, limes and maples, bordered with flower-beds and shrubberies, and | 'bed':9 'border':5 'flower':8 'flower-b':7 'lime':2 'mapl':4 'oak':1 'shrubberi':11 13.1595 | foo bar foo the over foo qq bar | 'bar':2,8 'foo':1,3,6 'qq':7 16.4493 | ornamental building, and I wish you could see it, if you have not seen | 'build':2 'could':7 'ornament':1 'see':8 'seen':14 'wish':5 16.4493 | the--nearest guide-book! | 'book':5 'guid':4 'guide-book':3 'nearest':2 @@ -350,5 +381,33 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), * 16.4493 | the few that escaped destruction in 1693. It is a beautiful, highly | '1693':7 'beauti':11 'destruct':5 'escap':4 'high':12 (20 rows) -DROP TABLE test_rum CASCADE; -DROP TABLE tst CASCADE; +-- Test correct work of phrase operator when position information is not in index. +create table test_rum_addon as table test_rum; +alter table test_rum_addon add column id serial; +create index on test_rum_addon using rum (a rum_tsvector_addon_ops, id) with (attach = 'id', to='a'); +select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); + t | a | id +---------------------------------------------------------------------+---------------------------------------------------------+---- + itself. 
Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 | 9 +(1 row) + +explain (costs off) select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); + QUERY PLAN +------------------------------------------------------------ + Index Scan using test_rum_addon_a_id_idx on test_rum_addon + Index Cond: (a @@ '''half'' <-> ''way'''::tsquery) +(2 rows) + +-- +select ('bjarn:6237 stroustrup:6238'::tsvector <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; + distance +---------- + 8.22467 +(1 row) + +SELECT ('stroustrup:5508B,6233B,6238B bjarn:6235B,6237B' <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; + distance +---------- + 2.05617 +(1 row) + diff --git a/expected/rum_hash.out b/expected/rum_hash.out index 1ebee073a5..43a9760a28 100644 --- a/expected/rum_hash.out +++ b/expected/rum_hash.out @@ -1,308 +1,325 @@ -CREATE TABLE test_rum( t text, a tsvector ); +CREATE TABLE test_rum_hash( t text, a tsvector ); CREATE TRIGGER tsvectorupdate -BEFORE UPDATE OR INSERT ON test_rum +BEFORE UPDATE OR INSERT ON test_rum_hash FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger('a', 'pg_catalog.english', 't'); -CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_hash_ops); -\copy test_rum(t) from 'data/rum.data'; -CREATE INDEX failed_rumidx ON test_rum USING rum (a rum_tsvector_timestamp_ops); +CREATE INDEX rumhashidx ON test_rum_hash USING rum (a rum_tsvector_hash_ops); +\copy test_rum_hash(t) from 'data/rum.data'; +CREATE INDEX failed_rumidx ON test_rum_hash USING rum (a rum_tsvector_addon_ops); ERROR: additional information attribute "a" is not found in index SET enable_seqscan=off; +SET enable_indexscan=off; explain (costs off) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); QUERY PLAN ------------------------------------------------------------------ Aggregate - -> Bitmap Heap Scan on test_rum + -> Bitmap Heap Scan on test_rum_hash Recheck Cond: (a @@ '''ever'' | ''wrote'''::tsquery) - -> Bitmap Index Scan on rumidx + -> Bitmap Index Scan on rumhashidx Index Cond: (a @@ '''ever'' | ''wrote'''::tsquery) (5 rows) explain (costs off) -SELECT * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote') +SELECT * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote') ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); QUERY PLAN ------------------------------------------------------------------ Sort Sort Key: ((a <=> '''ever'' | ''wrote'''::tsquery)) - -> Bitmap Heap Scan on test_rum + -> Bitmap Heap Scan on test_rum_hash Recheck Cond: (a @@ '''ever'' | ''wrote'''::tsquery) - -> Bitmap Index Scan on rumidx + -> Bitmap Index Scan on rumhashidx Index Cond: (a @@ '''ever'' | ''wrote'''::tsquery) (6 rows) explain (costs off) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Aggregate - -> Index Scan using rumidx on test_rum - Index Cond: (a @@ '''def'' <-> ''fgr'''::tsquery) -(3 rows) + -> Bitmap Heap Scan on test_rum_hash + Recheck Cond: (a @@ '''def'' <-> ''fgr'''::tsquery) + -> Bitmap Index Scan on rumhashidx + Index Cond: (a 
@@ '''def'' <-> ''fgr'''::tsquery) +(5 rows) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); count ------- 2 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have&wish'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'have&wish'); count ------- 1 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'knew&brain'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'knew&brain'); count ------- 0 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'among'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'among'); count ------- 1 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'structure&ancient'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'structure&ancient'); count ------- 1 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary|sight)&(sending|heart)'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary|sight)&(sending|heart)'); count ------- 2 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '(gave | half) <-> way'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '(gave | half) <-> way'); count ------- 2 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '(gave | !half) <-> way'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '(gave | !half) <-> way'); count ------- 3 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave & way'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '!gave & way'); count ------- 3 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave & wooded & !look'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '!gave & wooded & !look'); count ------- 1 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); count ------- 1 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <2> fgr'); count ------- 1 (1 row) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), * - FROM test_rum +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); - rum_ts_distance | t | a ------------------+--------------------------------------------------------------------------+--------------------------------------------------------------- - 16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 - 16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 16.4493 | so well that only a fragment, as it were, gave way. 
It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 - 16.4493 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+--------------------------------------------------------------------------+--------------------------------------------------------------- + 16.4493 | 0.0607927 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 + 16.4493 | 0.0607927 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 16.4493 | 0.0607927 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 + 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), * - FROM test_rum +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); - rum_ts_distance | t | a ------------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * - FROM test_rum + FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; - ?column? | rum_ts_distance | t | a -----------+-----------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | 57.5727 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + distance | rum_ts_distance | rum_ts_score | t | a +----------+-----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- + 8.2247 | 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), * - FROM test_rum +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); - rum_ts_distance | t | a ------------------+--------------------------------------------------------------------------+--------------------------------------------------------------- - 16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 - 16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 16.4493 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 - 16.4493 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+--------------------------------------------------------------------------+--------------------------------------------------------------- + 16.4493 | 0.0607927 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 + 16.4493 | 0.0607927 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 16.4493 | 0.0607927 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 + 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), * - FROM test_rum +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); - rum_ts_distance | t | a ------------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + rum_ts_distance | rum_ts_score | t | a +-----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) -INSERT INTO test_rum (t) VALUES ('foo bar foo the over foo qq bar'); -INSERT INTO test_rum (t) VALUES ('345 qwerty copyright'); -INSERT INTO test_rum (t) VALUES ('345 qwerty'); -INSERT INTO test_rum (t) VALUES ('A fat cat has just eaten a rat.'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar'); +INSERT INTO test_rum_hash (t) VALUES ('foo bar foo the over foo qq bar'); +INSERT INTO test_rum_hash (t) VALUES ('345 qwerty copyright'); +INSERT INTO test_rum_hash (t) VALUES ('345 qwerty'); +INSERT INTO test_rum_hash (t) VALUES ('A fat cat has just eaten a rat.'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'bar'); count ------- 1 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'qwerty&345'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'qwerty&345'); count ------- 2 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '345'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '345'); count ------- 2 (1 row) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'rat'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'rat'); count ------- 1 (1 row) -SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; +SELECT a FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; a ------------------------------ 'bar':2,8 'foo':1,3,6 'qq':7 (1 row) -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); - ?column? 
+SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; + distance ---------- 16.4493 16.4493 - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 (56 rows) -CREATE TABLE tst (i int4, t tsvector); -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; -CREATE INDEX tstidx ON tst USING rum (t rum_tsvector_hash_ops); -DELETE FROM tst WHERE i = 1; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(10001,11000) i; -DELETE FROM tst WHERE i = 2; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(11001,12000) i; -DELETE FROM tst WHERE i = 3; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(12001,13000) i; -DELETE FROM tst WHERE i = 4; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(13001,14000) i; -DELETE FROM tst WHERE i = 5; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i; +CREATE TABLE tst_hash (i int4, t tsvector); +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; +CREATE INDEX tst_hashidx ON tst_hash USING rum (t rum_tsvector_hash_ops); +DELETE FROM tst_hash WHERE i = 1; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(10001,11000) i; +DELETE FROM tst_hash WHERE i = 2; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(11001,12000) i; +DELETE FROM tst_hash WHERE i = 3; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(12001,13000) i; +DELETE FROM tst_hash WHERE i = 4; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(13001,14000) i; +DELETE FROM tst_hash WHERE i = 5; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i; set enable_bitmapscan=off; +SET enable_indexscan=on; explain (costs off) SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * - FROM test_rum + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> 
to_tsquery('pg_catalog.english', 'w:*'); - QUERY PLAN ------------------------------------------ - Index Scan using rumidx on test_rum + QUERY PLAN +---------------------------------------------- + Index Scan using rumhashidx on test_rum_hash Index Cond: (a @@ '''w'':*'::tsquery) Order By: (a <=> '''w'':*'::tsquery) (3 rows) SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * - FROM test_rum + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); ERROR: Compare with prefix expressions isn't supported -DROP TABLE test_rum CASCADE; -DROP TABLE tst CASCADE; diff --git a/expected/rum_validate.out b/expected/rum_validate.out new file mode 100644 index 0000000000..22000a1ee5 --- /dev/null +++ b/expected/rum_validate.out @@ -0,0 +1,132 @@ +-- +-- Various sanity tests +-- +-- First validate operator classes +SELECT opcname, amvalidate(opc.oid) +FROM pg_opclass opc JOIN pg_am am ON am.oid = opcmethod +WHERE amname = 'rum' +ORDER BY opcname; + opcname | amvalidate +-----------------------------------+------------ + rum_anyarray_addon_ops | t + rum_anyarray_ops | t + rum_bit_ops | t + rum_bytea_ops | t + rum_char_ops | t + rum_cidr_ops | t + rum_date_ops | t + rum_float4_ops | t + rum_float8_ops | t + rum_inet_ops | t + rum_int2_ops | t + rum_int4_ops | t + rum_int8_ops | t + rum_interval_ops | t + rum_macaddr_ops | t + rum_money_ops | t + rum_numeric_ops | t + rum_oid_ops | t + rum_text_ops | t + rum_time_ops | t + rum_timestamp_ops | t + rum_timestamptz_ops | t + rum_timetz_ops | t + rum_tsquery_ops | t + rum_tsvector_addon_ops | t + rum_tsvector_hash_addon_ops | t + rum_tsvector_hash_ops | t + rum_tsvector_hash_timestamp_ops | t + rum_tsvector_hash_timestamptz_ops | t + rum_tsvector_ops | t + rum_tsvector_timestamp_ops | t + rum_tsvector_timestamptz_ops | t + rum_varbit_ops | t + rum_varchar_ops | t +(34 rows) + +-- +-- Test access method and 'rumidx' index properties +-- +-- Access method properties +SELECT a.amname, p.name, pg_indexam_has_property(a.oid,p.name) +FROM pg_am a, unnest(array['can_order','can_unique','can_multi_col','can_exclude']) p(name) +WHERE a.amname = 'rum' ORDER BY a.amname; + amname | name | pg_indexam_has_property +--------+---------------+------------------------- + rum | can_order | f + rum | can_unique | f + rum | can_multi_col | t + rum | can_exclude | t +(4 rows) + +-- Index properties +SELECT p.name, pg_index_has_property('rumidx'::regclass,p.name) +FROM unnest(array['clusterable','index_scan','bitmap_scan','backward_scan']) p(name); + name | pg_index_has_property +---------------+----------------------- + clusterable | f + index_scan | t + bitmap_scan | t + backward_scan | f +(4 rows) + +-- Index column properties +SELECT p.name, pg_index_column_has_property('rumidx'::regclass,1,p.name) +FROM unnest(array['asc','desc','nulls_first','nulls_last','orderable','distance_orderable','returnable','search_array','search_nulls']) p(name); + name | pg_index_column_has_property +--------------------+------------------------------ + asc | f + desc | f + nulls_first | f + nulls_last | f + orderable | f + distance_orderable | t + returnable | f + search_array | f + search_nulls | f +(9 rows) + +-- +-- Check incorrect operator class +-- +DROP INDEX rumidx; +-- PGPRO-1175: Check incorrect operator class, i.e. 
it shouldn't work correctly +CREATE OPERATOR CLASS rum_tsvector_norm_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + OPERATOR 2 <=> (tsvector, rum_distance_query) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 6 rum_tsvector_config(internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), + FUNCTION 10 rum_ts_join_pos(internal, internal), + STORAGE text; +CREATE INDEX rum_norm_idx ON test_rum USING rum(a rum_tsvector_norm_ops); +SET enable_seqscan=off; +SET enable_bitmapscan=off; +SET enable_indexscan=on; +-- PGPRO-1175: Select using incorrect operator class +SELECT a + FROM test_rum + WHERE a @@ to_tsquery('pg_catalog.english', 'bar') + ORDER BY a <=> (to_tsquery('pg_catalog.english', 'bar'),0); + a +------------------------------ + 'bar':2,8 'foo':1,3,6 'qq':7 +(1 row) + +-- PGPRO-9026: column and attached column cannot be the same +CREATE TABLE test_array (i int2[]); +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_addon_ops) WITH (attach = 'i', to = 'i'); +ERROR: column "i" and attached column cannot be the same +SELECT * FROM test_array WHERE i && '{1}'; + i +--- +(0 rows) + +DROP TABLE test_array; diff --git a/expected/rum_weight.out b/expected/rum_weight.out new file mode 100644 index 0000000000..0c1565d1ce --- /dev/null +++ b/expected/rum_weight.out @@ -0,0 +1,136 @@ +CREATE TABLE testweight_rum( t text, a tsvector, r text ); +CREATE FUNCTION fill_weight_trigger() RETURNS trigger AS $$ +begin + new.a := + setweight(to_tsvector('pg_catalog.english', coalesce(new.r,'')), 'A') || + setweight(to_tsvector('pg_catalog.english', coalesce(new.t,'')), 'D'); + return new; +end +$$ LANGUAGE plpgsql; +CREATE TRIGGER tsvectorweightupdate +BEFORE INSERT OR UPDATE ON testweight_rum +FOR EACH ROW EXECUTE PROCEDURE fill_weight_trigger(); +CREATE INDEX rumidx_weight ON testweight_rum USING rum (a rum_tsvector_ops); +\copy testweight_rum(t,r) from 'data/rum_weight.data' DELIMITER '|' ; +SET enable_seqscan=off; +SET enable_indexscan=off; +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever:A|wrote'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'among:ABC'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'structure:D&ancient:BCD'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary:DC|sight)&(sending:ABC|heart)'); + count +------- + 2 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave:D & way'); + count +------- + 3 +(1 row) + +SELECT count(*) FROM testweight_rum 
WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<->go)'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<2>go)'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a | way<->reach)'); + count +------- + 2 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a & way<->reach)'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go & !way:a'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'show:d & seem & !town:a'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!way:a'); + count +------- + 52 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & !way:a'); + count +------- + 2 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & !way:a'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'think<->go:d | go<->see'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d<->think'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach<->think'); + count +------- + 1 +(1 row) + diff --git a/expected/ruminv.out b/expected/ruminv.out index 6fd5a6457b..840dcfc85c 100644 --- a/expected/ruminv.out +++ b/expected/ruminv.out @@ -9,6 +9,7 @@ INSERT INTO test_invrum VALUES ('(a|b)&c'::tsquery); INSERT INTO test_invrum VALUES ('(!(a|b))&c'::tsquery); INSERT INTO test_invrum VALUES ('(a|b)&(c|d)'::tsquery); INSERT INTO test_invrum VALUES ('!a'::tsquery); +INSERT INTO test_invrum VALUES ('(a|a1|a2|a3|a4|a5)&(b|b1|b2|b3|b4|b5|b6)&!(c|c1|c2|c3)'::tsquery); SELECT * FROM test_invrum WHERE q @@ ''::tsvector; q ---------------- @@ -36,12 +37,13 @@ SELECT * FROM test_invrum WHERE q @@ 'b'::tsvector; (4 rows) SELECT * FROM test_invrum WHERE q @@ 'a b'::tsvector; - q ------------- + q +-------------------------------------------------------------------------------------------------------------------------------- 'a' | 'b' 'a' & 'b' !'a' | 'b' -(3 rows) + ( 'a' | 'a1' | 'a2' | 'a3' | 'a4' | 'a5' ) & ( 'b' | 'b1' | 'b2' | 'b3' | 'b4' | 'b5' | 'b6' ) & !( 'c' | 'c1' | 'c2' | 'c3' ) +(4 rows) SELECT * FROM test_invrum WHERE q @@ 'c'::tsvector; q @@ -113,13 +115,14 @@ SELECT * FROM test_invrum WHERE q @@ 'b d'::tsvector; (5 rows) SELECT * FROM test_invrum WHERE q @@ 'a b d'::tsvector; - q -------------------------------- + q +-------------------------------------------------------------------------------------------------------------------------------- 'a' | 'b' 'a' & 'b' !'a' | 'b' ( 'a' | 'b' ) & ( 'c' | 'd' ) -(4 rows) + ( 'a' | 'a1' | 'a2' | 'a3' | 'a4' | 'a5' ) & ( 'b' | 'b1' | 'b2' | 'b3' | 'b4' | 'b5' | 'b6' ) & !( 'c' | 'c1' | 'c2' | 'c3' ) +(5 rows) SELECT * FROM test_invrum WHERE q @@ 'c d'::tsvector; q @@ -166,12 +169,13 @@ SELECT * FROM test_invrum WHERE q @@ 'b'::tsvector; (4 rows) SELECT * FROM test_invrum WHERE q @@ 'a b'::tsvector; - q ------------- + q 
+-------------------------------------------------------------------------------------------------------------------------------- 'a' | 'b' 'a' & 'b' !'a' | 'b' -(3 rows) + ( 'a' | 'a1' | 'a2' | 'a3' | 'a4' | 'a5' ) & ( 'b' | 'b1' | 'b2' | 'b3' | 'b4' | 'b5' | 'b6' ) & !( 'c' | 'c1' | 'c2' | 'c3' ) +(4 rows) SELECT * FROM test_invrum WHERE q @@ 'c'::tsvector; q @@ -243,13 +247,14 @@ SELECT * FROM test_invrum WHERE q @@ 'b d'::tsvector; (5 rows) SELECT * FROM test_invrum WHERE q @@ 'a b d'::tsvector; - q -------------------------------- + q +-------------------------------------------------------------------------------------------------------------------------------- 'a' | 'b' 'a' & 'b' !'a' | 'b' ( 'a' | 'b' ) & ( 'c' | 'd' ) -(4 rows) + ( 'a' | 'a1' | 'a2' | 'a3' | 'a4' | 'a5' ) & ( 'b' | 'b1' | 'b2' | 'b3' | 'b4' | 'b5' | 'b6' ) & !( 'c' | 'c1' | 'c2' | 'c3' ) +(5 rows) SELECT * FROM test_invrum WHERE q @@ 'c d'::tsvector; q diff --git a/expected/security.out b/expected/security.out new file mode 100644 index 0000000000..86fcbf81da --- /dev/null +++ b/expected/security.out @@ -0,0 +1,5 @@ +-- Check security CVE-2020-14350 +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) RETURNS bool AS $$ SELECT false $$ LANGUAGE SQL; +CREATE EXTENSION rum; +ERROR: function "rum_anyarray_similar" already exists with same argument types +DROP FUNCTION rum_anyarray_similar(anyarray,anyarray); diff --git a/expected/text.out b/expected/text.out new file mode 100644 index 0000000000..9cf9310a77 --- /dev/null +++ b/expected/text.out @@ -0,0 +1,313 @@ +set enable_seqscan=off; +CREATE TABLE test_text ( + i text +); +INSERT INTO test_text VALUES ('a'),('ab'),('abc'),('abb'),('axy'),('xyz'); +CREATE INDEX idx_text ON test_text USING rum (i); +SELECT * FROM test_text WHERE i<'abc' ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_text WHERE i<='abc' ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_text WHERE i='abc' ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_text WHERE i>='abc' ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_text WHERE i>'abc' ORDER BY i; + i +----- + axy + xyz +(2 rows) + +CREATE TABLE test_text_o AS SELECT id::text, t FROM tsts; +SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + id +----- + 135 + 16 + 168 + 232 + 252 + 354 + 355 + 371 + 39 +(9 rows) + +SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 + 71 +(8 rows) + +CREATE INDEX test_text_o_idx ON test_text_o USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't'); +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_bitmapscan=OFF; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_text_o_idx on test_text_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) +(4 rows) + +SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + id +----- + 135 + 16 + 168 + 232 + 252 + 354 + 355 + 371 + 39 +(9 rows) + +EXPLAIN (costs off) +SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_text_o_idx on 
test_text_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) +(4 rows) + +SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 + 71 +(8 rows) + +CREATE TABLE test_text_a AS SELECT id::text, t FROM tsts; +-- Should fail, temporarily it isn't allowed to order an index over pass-by-reference column +CREATE INDEX test_text_a_idx ON test_text_a USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +ERROR: doesn't support order index over pass-by-reference column +EXPLAIN (costs off) +SELECT count(*) FROM test_text_a WHERE id < '400'; + QUERY PLAN +------------------------------------ + Aggregate + -> Seq Scan on test_text_a + Filter: (id < '400'::text) +(3 rows) + +SELECT count(*) FROM test_text_a WHERE id < '400'; + count +------- + 337 +(1 row) + +EXPLAIN (costs off) +SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_text_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) +(4 rows) + +SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + id +----- + 135 + 16 + 168 + 232 + 252 + 354 + 355 + 371 + 39 +(9 rows) + +EXPLAIN (costs off) +SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_text_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) +(4 rows) + +SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 + 71 +(8 rows) + +CREATE TABLE test_text_h_o AS SELECT id::text, t FROM tsts; +CREATE INDEX test_text_h_o_idx ON test_text_h_o USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't'); +EXPLAIN (costs off) +SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_text_h_o_idx on test_text_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) +(4 rows) + +SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + id +----- + 135 + 16 + 168 + 232 + 252 + 354 + 355 + 371 + 39 +(9 rows) + +EXPLAIN (costs off) +SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: id + -> Index Scan using test_text_h_o_idx on test_text_h_o + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) +(4 rows) + +SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 + 71 +(8 rows) + +CREATE TABLE test_text_h_a AS SELECT id::text, t FROM tsts; +-- Should fail, temporarily it isn't allowed to order an index over pass-by-reference column +CREATE INDEX test_text_h_a_idx ON test_text_h_a USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); +ERROR: doesn't support order index over pass-by-reference column +EXPLAIN (costs off) +SELECT count(*) FROM test_text_h_a WHERE id < '400'; + QUERY PLAN +------------------------------------ 
+ Aggregate + -> Seq Scan on test_text_h_a + Filter: (id < '400'::text) +(3 rows) + +SELECT count(*) FROM test_text_h_a WHERE id < '400'; + count +------- + 337 +(1 row) + +EXPLAIN (costs off) +SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_text_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) +(4 rows) + +SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; + id +----- + 135 + 16 + 168 + 232 + 252 + 354 + 355 + 371 + 39 +(9 rows) + +EXPLAIN (costs off) +SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: id + -> Seq Scan on test_text_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) +(4 rows) + +SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; + id +----- + 406 + 415 + 428 + 457 + 458 + 484 + 496 + 71 +(8 rows) + diff --git a/expected/time.out b/expected/time.out new file mode 100644 index 0000000000..c805fb51e0 --- /dev/null +++ b/expected/time.out @@ -0,0 +1,51 @@ +set enable_seqscan=off; +CREATE TABLE test_time ( + i time +); +INSERT INTO test_time VALUES + ( '03:55:08' ), + ( '04:55:08' ), + ( '05:55:08' ), + ( '08:55:08' ), + ( '09:55:08' ), + ( '10:55:08' ) +; +CREATE INDEX idx_time ON test_time USING rum (i); +SELECT * FROM test_time WHERE i<'08:55:08'::time ORDER BY i; + i +---------- + 03:55:08 + 04:55:08 + 05:55:08 +(3 rows) + +SELECT * FROM test_time WHERE i<='08:55:08'::time ORDER BY i; + i +---------- + 03:55:08 + 04:55:08 + 05:55:08 + 08:55:08 +(4 rows) + +SELECT * FROM test_time WHERE i='08:55:08'::time ORDER BY i; + i +---------- + 08:55:08 +(1 row) + +SELECT * FROM test_time WHERE i>='08:55:08'::time ORDER BY i; + i +---------- + 08:55:08 + 09:55:08 + 10:55:08 +(3 rows) + +SELECT * FROM test_time WHERE i>'08:55:08'::time ORDER BY i; + i +---------- + 09:55:08 + 10:55:08 +(2 rows) + diff --git a/expected/timestamp.out b/expected/timestamp.out index 797333f7f3..00969a7534 100644 --- a/expected/timestamp.out +++ b/expected/timestamp.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * timestamp.out - test output for 64-bit systems and + * timestamp_1.out - test output for 32-bit systems. + * + */ CREATE TABLE test_timestamp ( i timestamp ); @@ -9,7 +18,6 @@ INSERT INTO test_timestamp VALUES ( '2004-10-26 09:55:08' ), ( '2004-10-26 10:55:08' ) ; -SELECT i::timestamptz AS i INTO test_timestamptz FROM test_timestamp; SELECT i <=> '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; ?column? 
| i ----------+-------------------------- @@ -44,7 +52,6 @@ SELECT i |=> '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; (6 rows) CREATE INDEX idx_timestamp ON test_timestamp USING rum (i); -CREATE INDEX idx_timestamptz ON test_timestamptz USING rum (i); set enable_seqscan=off; explain (costs off) SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; @@ -132,6 +139,50 @@ SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i Tue Oct 26 10:55:08 2004 (2 rows) +explain (costs off) +SELECT *, i <=> '2004-10-26 08:55:08'::timestamp FROM test_timestamp + ORDER BY i <=> '2004-10-26 08:55:08'::timestamp; + QUERY PLAN +----------------------------------------------------------------------------- + Index Scan using idx_timestamp on test_timestamp + Order By: (i <=> 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(2 rows) + +SELECT *, i <=> '2004-10-26 08:55:08'::timestamp FROM test_timestamp + ORDER BY i <=> '2004-10-26 08:55:08'::timestamp; + i | ?column? +--------------------------+---------- + Tue Oct 26 08:55:08 2004 | 0 + Tue Oct 26 09:55:08 2004 | 3600 + Tue Oct 26 10:55:08 2004 | 7200 + Tue Oct 26 05:55:08 2004 | 10800 + Tue Oct 26 04:55:08 2004 | 14400 + Tue Oct 26 03:55:08 2004 | 18000 +(6 rows) + +explain (costs off) +SELECT *, i <=> '2004-10-26 05:00:00'::timestamp FROM test_timestamp + WHERE i>'2004-10-26 05:00:00'::timestamp ORDER BY i <=> '2004-10-26 05:00:00'::timestamp; + QUERY PLAN +----------------------------------------------------------------------------- + Index Scan using idx_timestamp on test_timestamp + Index Cond: (i > 'Tue Oct 26 05:00:00 2004'::timestamp without time zone) + Order By: (i <=> 'Tue Oct 26 05:00:00 2004'::timestamp without time zone) +(3 rows) + +SELECT *, i <=> '2004-10-26 05:00:00'::timestamp FROM test_timestamp + WHERE i>'2004-10-26 05:00:00'::timestamp ORDER BY i <=> '2004-10-26 05:00:00'::timestamp; + i | ?column? +--------------------------+---------- + Tue Oct 26 05:55:08 2004 | 3308 + Tue Oct 26 08:55:08 2004 | 14108 + Tue Oct 26 09:55:08 2004 | 17708 + Tue Oct 26 10:55:08 2004 | 21308 +(4 rows) + +-- Tests for timestamptz +SELECT i::timestamptz AS i INTO test_timestamptz FROM test_timestamp; +CREATE INDEX idx_timestamptz ON test_timestamptz USING rum (i); explain (costs off) SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; QUERY PLAN @@ -149,3 +200,44 @@ SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER Tue Oct 26 10:55:08 2004 PDT (2 rows) +explain (costs off) +SELECT *, i <=> '2004-10-26 08:55:08'::timestamptz FROM test_timestamptz + ORDER BY i <=> '2004-10-26 08:55:08'::timestamptz; + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using idx_timestamptz on test_timestamptz + Order By: (i <=> 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) +(2 rows) + +SELECT *, i <=> '2004-10-26 08:55:08'::timestamptz FROM test_timestamptz + ORDER BY i <=> '2004-10-26 08:55:08'::timestamptz; + i | ?column? 
+------------------------------+---------- + Tue Oct 26 08:55:08 2004 PDT | 0 + Tue Oct 26 09:55:08 2004 PDT | 3600 + Tue Oct 26 10:55:08 2004 PDT | 7200 + Tue Oct 26 05:55:08 2004 PDT | 10800 + Tue Oct 26 04:55:08 2004 PDT | 14400 + Tue Oct 26 03:55:08 2004 PDT | 18000 +(6 rows) + +explain (costs off) +SELECT *, i <=> '2004-10-26 05:00:00'::timestamptz FROM test_timestamptz + WHERE i>'2004-10-26 05:00:00'::timestamptz ORDER BY i <=> '2004-10-26 05:00:00'::timestamptz; + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using idx_timestamptz on test_timestamptz + Index Cond: (i > 'Tue Oct 26 05:00:00 2004 PDT'::timestamp with time zone) + Order By: (i <=> 'Tue Oct 26 05:00:00 2004 PDT'::timestamp with time zone) +(3 rows) + +SELECT *, i <=> '2004-10-26 05:00:00'::timestamptz FROM test_timestamptz + WHERE i>'2004-10-26 05:00:00'::timestamptz ORDER BY i <=> '2004-10-26 05:00:00'::timestamptz; + i | ?column? +------------------------------+---------- + Tue Oct 26 05:55:08 2004 PDT | 3308 + Tue Oct 26 08:55:08 2004 PDT | 14108 + Tue Oct 26 09:55:08 2004 PDT | 17708 + Tue Oct 26 10:55:08 2004 PDT | 21308 +(4 rows) + diff --git a/expected/timestamp_1.out b/expected/timestamp_1.out new file mode 100644 index 0000000000..a8641a3232 --- /dev/null +++ b/expected/timestamp_1.out @@ -0,0 +1,211 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * timestamp.out - test output for 64-bit systems and + * timestamp_1.out - test output for 32-bit systems. + * + */ +CREATE TABLE test_timestamp ( + i timestamp +); +INSERT INTO test_timestamp VALUES + ( '2004-10-26 03:55:08' ), + ( '2004-10-26 04:55:08' ), + ( '2004-10-26 05:55:08' ), + ( '2004-10-26 08:55:08' ), + ( '2004-10-26 09:55:08' ), + ( '2004-10-26 10:55:08' ) +; +SELECT i <=> '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; + ?column? | i +----------+-------------------------- + 1740 | Tue Oct 26 05:55:08 2004 + 5340 | Tue Oct 26 04:55:08 2004 + 8940 | Tue Oct 26 03:55:08 2004 + 9060 | Tue Oct 26 08:55:08 2004 + 12660 | Tue Oct 26 09:55:08 2004 + 16260 | Tue Oct 26 10:55:08 2004 +(6 rows) + +SELECT i <=| '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; + ?column? | i +----------+-------------------------- + 1740 | Tue Oct 26 05:55:08 2004 + 5340 | Tue Oct 26 04:55:08 2004 + 8940 | Tue Oct 26 03:55:08 2004 + Infinity | Tue Oct 26 08:55:08 2004 + Infinity | Tue Oct 26 09:55:08 2004 + Infinity | Tue Oct 26 10:55:08 2004 +(6 rows) + +SELECT i |=> '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; + ?column? 
| i +----------+-------------------------- + 9060 | Tue Oct 26 08:55:08 2004 + 12660 | Tue Oct 26 09:55:08 2004 + 16260 | Tue Oct 26 10:55:08 2004 + Infinity | Tue Oct 26 03:55:08 2004 + Infinity | Tue Oct 26 04:55:08 2004 + Infinity | Tue Oct 26 05:55:08 2004 +(6 rows) + +CREATE INDEX idx_timestamp ON test_timestamp USING rum (i); +set enable_seqscan=off; +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: i + -> Index Scan using idx_timestamp on test_timestamp + Index Cond: (i < 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(4 rows) + +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 +(3 rows) + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Index Scan using idx_timestamp on test_timestamp + Index Cond: (i <= 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(4 rows) + +SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +explain (costs off) +SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------- + Index Scan using idx_timestamp on test_timestamp + Index Cond: (i = 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(2 rows) + +SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +-------------------------- + Tue Oct 26 08:55:08 2004 +(1 row) + +explain (costs off) +SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Index Scan using idx_timestamp on test_timestamp + Index Cond: (i >= 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(4 rows) + +SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +-------------------------- + Tue Oct 26 08:55:08 2004 + Tue Oct 26 09:55:08 2004 + Tue Oct 26 10:55:08 2004 +(3 rows) + +explain (costs off) +SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: i + -> Index Scan using idx_timestamp on test_timestamp + Index Cond: (i > 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(4 rows) + +SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; + i +-------------------------- + Tue Oct 26 09:55:08 2004 + Tue Oct 26 10:55:08 2004 +(2 rows) + +explain (costs off) +SELECT *, i <=> '2004-10-26 08:55:08'::timestamp FROM test_timestamp + ORDER BY i <=> '2004-10-26 08:55:08'::timestamp; + QUERY PLAN +----------------------------------------------------------------------------- + Index Scan using idx_timestamp on test_timestamp + Order By: (i <=> 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(2 rows) + +SELECT *, i <=> '2004-10-26 
08:55:08'::timestamp FROM test_timestamp + ORDER BY i <=> '2004-10-26 08:55:08'::timestamp; +ERROR: doesn't support order by over pass-by-reference column +explain (costs off) +SELECT *, i <=> '2004-10-26 05:00:00'::timestamp FROM test_timestamp + WHERE i>'2004-10-26 05:00:00'::timestamp ORDER BY i <=> '2004-10-26 05:00:00'::timestamp; + QUERY PLAN +----------------------------------------------------------------------------- + Index Scan using idx_timestamp on test_timestamp + Index Cond: (i > 'Tue Oct 26 05:00:00 2004'::timestamp without time zone) + Order By: (i <=> 'Tue Oct 26 05:00:00 2004'::timestamp without time zone) +(3 rows) + +SELECT *, i <=> '2004-10-26 05:00:00'::timestamp FROM test_timestamp + WHERE i>'2004-10-26 05:00:00'::timestamp ORDER BY i <=> '2004-10-26 05:00:00'::timestamp; +ERROR: doesn't support order by over pass-by-reference column +-- Tests for timestamptz +SELECT i::timestamptz AS i INTO test_timestamptz FROM test_timestamp; +CREATE INDEX idx_timestamptz ON test_timestamptz USING rum (i); +explain (costs off) +SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Index Scan using idx_timestamptz on test_timestamptz + Index Cond: (i > 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) +(4 rows) + +SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +------------------------------ + Tue Oct 26 09:55:08 2004 PDT + Tue Oct 26 10:55:08 2004 PDT +(2 rows) + +explain (costs off) +SELECT *, i <=> '2004-10-26 08:55:08'::timestamptz FROM test_timestamptz + ORDER BY i <=> '2004-10-26 08:55:08'::timestamptz; + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using idx_timestamptz on test_timestamptz + Order By: (i <=> 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) +(2 rows) + +SELECT *, i <=> '2004-10-26 08:55:08'::timestamptz FROM test_timestamptz + ORDER BY i <=> '2004-10-26 08:55:08'::timestamptz; +ERROR: doesn't support order by over pass-by-reference column +explain (costs off) +SELECT *, i <=> '2004-10-26 05:00:00'::timestamptz FROM test_timestamptz + WHERE i>'2004-10-26 05:00:00'::timestamptz ORDER BY i <=> '2004-10-26 05:00:00'::timestamptz; + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using idx_timestamptz on test_timestamptz + Index Cond: (i > 'Tue Oct 26 05:00:00 2004 PDT'::timestamp with time zone) + Order By: (i <=> 'Tue Oct 26 05:00:00 2004 PDT'::timestamp with time zone) +(3 rows) + +SELECT *, i <=> '2004-10-26 05:00:00'::timestamptz FROM test_timestamptz + WHERE i>'2004-10-26 05:00:00'::timestamptz ORDER BY i <=> '2004-10-26 05:00:00'::timestamptz; +ERROR: doesn't support order by over pass-by-reference column diff --git a/expected/timetz.out b/expected/timetz.out new file mode 100644 index 0000000000..ea194ecbfe --- /dev/null +++ b/expected/timetz.out @@ -0,0 +1,51 @@ +set enable_seqscan=off; +CREATE TABLE test_timetz ( + i timetz +); +INSERT INTO test_timetz VALUES + ( '03:55:08 GMT+2' ), + ( '04:55:08 GMT+2' ), + ( '05:55:08 GMT+2' ), + ( '08:55:08 GMT+2' ), + ( '09:55:08 GMT+2' ), + ( '10:55:08 GMT+2' ) +; +CREATE INDEX idx_timetz ON test_timetz USING rum (i); +SELECT * FROM test_timetz WHERE i<'08:55:08 GMT+2'::timetz ORDER BY i; + i +------------- + 03:55:08-02 + 04:55:08-02 + 05:55:08-02 +(3 rows) + +SELECT * FROM test_timetz 
WHERE i<='08:55:08 GMT+2'::timetz ORDER BY i; + i +------------- + 03:55:08-02 + 04:55:08-02 + 05:55:08-02 + 08:55:08-02 +(4 rows) + +SELECT * FROM test_timetz WHERE i='08:55:08 GMT+2'::timetz ORDER BY i; + i +------------- + 08:55:08-02 +(1 row) + +SELECT * FROM test_timetz WHERE i>='08:55:08 GMT+2'::timetz ORDER BY i; + i +------------- + 08:55:08-02 + 09:55:08-02 + 10:55:08-02 +(3 rows) + +SELECT * FROM test_timetz WHERE i>'08:55:08 GMT+2'::timetz ORDER BY i; + i +------------- + 09:55:08-02 + 10:55:08-02 +(2 rows) + diff --git a/expected/varbit.out b/expected/varbit.out new file mode 100644 index 0000000000..60b9224a01 --- /dev/null +++ b/expected/varbit.out @@ -0,0 +1,44 @@ +set enable_seqscan=off; +CREATE TABLE test_varbit ( + i varbit +); +INSERT INTO test_varbit VALUES ('001'),('010'),('011'),('100'),('101'),('110'); +CREATE INDEX idx_varbit ON test_varbit USING rum (i); +SELECT * FROM test_varbit WHERE i<'100'::varbit ORDER BY i; + i +----- + 001 + 010 + 011 +(3 rows) + +SELECT * FROM test_varbit WHERE i<='100'::varbit ORDER BY i; + i +----- + 001 + 010 + 011 + 100 +(4 rows) + +SELECT * FROM test_varbit WHERE i='100'::varbit ORDER BY i; + i +----- + 100 +(1 row) + +SELECT * FROM test_varbit WHERE i>='100'::varbit ORDER BY i; + i +----- + 100 + 101 + 110 +(3 rows) + +SELECT * FROM test_varbit WHERE i>'100'::varbit ORDER BY i; + i +----- + 101 + 110 +(2 rows) + diff --git a/expected/varchar.out b/expected/varchar.out new file mode 100644 index 0000000000..4ab7937f06 --- /dev/null +++ b/expected/varchar.out @@ -0,0 +1,44 @@ +set enable_seqscan=off; +CREATE TABLE test_varchar ( + i varchar +); +INSERT INTO test_varchar VALUES ('a'),('ab'),('abc'),('abb'),('axy'),('xyz'); +CREATE INDEX idx_varchar ON test_varchar USING rum (i); +SELECT * FROM test_varchar WHERE i<'abc'::varchar ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_varchar WHERE i<='abc'::varchar ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_varchar WHERE i='abc'::varchar ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_varchar WHERE i>='abc'::varchar ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_varchar WHERE i>'abc'::varchar ORDER BY i; + i +----- + axy + xyz +(2 rows) + diff --git a/logical.conf b/logical.conf new file mode 100644 index 0000000000..367f706651 --- /dev/null +++ b/logical.conf @@ -0,0 +1,2 @@ +wal_level = logical +max_replication_slots = 4 diff --git a/meson.build b/meson.build new file mode 100644 index 0000000000..b4336f0668 --- /dev/null +++ b/meson.build @@ -0,0 +1,118 @@ +# Copyright (c) 2025, Postgres Professional + +# Does not support the PGXS infrastructure at this time. Please compile as part +# of the contrib source tree. 
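+# For example, one way to set up an in-tree build (the clone destination and
+# the contrib/meson.build edit below are illustrative assumptions; adjust to
+# your PostgreSQL checkout):
+#
+#   git clone https://github.com/postgrespro/rum contrib/rum
+#   echo "subdir('rum')" >> contrib/meson.build
+#   meson setup build && ninja -C build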
+ +extension = 'rum' +extversion = '1.3' + +rum_sources = files( + 'src/btree_rum.c', + 'src/rum_arr_utils.c', + 'src/rum_ts_utils.c', + 'src/rumbtree.c', + 'src/rumbulk.c', + 'src/rumdatapage.c', + 'src/rumentrypage.c', + 'src/rumget.c', + 'src/ruminsert.c', + 'src/rumscan.c', + 'src/rumsort.c', + 'src/rumtsquery.c', + 'src/rumutil.c', + 'src/rumvacuum.c', + 'src/rumvalidate.c', +) + +if host_system == 'windows' + rum_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'rum', + '--FILEDESC', 'RUM index access method',]) +endif + +rum = shared_module('rum', + rum_sources, + kwargs: contrib_mod_args, +) +contrib_targets += rum + +configure_file( + input: 'rum_init.sql', + output: extension + '--' + extversion + '.sql', + copy: true, + install: true, + install_dir: contrib_data_args['install_dir'], +) + +install_data( + 'rum.control', + 'rum--1.0--1.1.sql', + 'rum--1.1--1.2.sql', + 'rum--1.2--1.3.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'rum', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'security', + 'rum', + 'rum_validate', + 'rum_hash', + 'ruminv', + 'timestamp', + 'orderby', + 'orderby_hash', + 'altorder', + 'altorder_hash', + 'limits', + 'int2', + 'int4', + 'int8', + 'float4', + 'float8', + 'money', + 'oid', + 'time', + 'timetz', + 'date', + 'interval', + 'macaddr', + 'inet', + 'cidr', + 'text', + 'varchar', + 'char', + 'bytea', + 'bit', + 'varbit', + 'numeric', + 'rum_weight', + 'expr', + 'array', + ], + 'regress_args': [ + '--temp-config', files('logical.conf') + ], + }, + 'tap': { + 'tests': [ + 't/001_wal.pl', + 't/002_pglist.pl', + ], + 'test_kwargs': {'timeout': 3000}, + }, + 'isolation': { + 'specs': [ + 'predicate-rum', + 'predicate-rum-2', + ], + 'regress_args': [ + '--temp-config', files('logical.conf'), + '--load-extension=rum', + ], + }, +} diff --git a/rum--1.0--1.1.sql b/rum--1.0--1.1.sql new file mode 100644 index 0000000000..dcb838f7dd --- /dev/null +++ b/rum--1.0--1.1.sql @@ -0,0 +1,1106 @@ +/* + * RUM version 1.1 + */ + +CREATE FUNCTION rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +ALTER FUNCTION + rum_tsquery_timestamp_consistent (internal,smallint,tsvector,int,internal,internal,internal,internal) + RENAME TO rum_tsquery_addon_consistent; + +CREATE FUNCTION rum_numeric_cmp(numeric, numeric) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE OPERATOR CLASS rum_tsvector_addon_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE text; + +CREATE OPERATOR CLASS rum_tsvector_hash_addon_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 btint4cmp(integer, integer), + FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), + 
FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE integer; + +/*--------------------int2-----------------------*/ + +CREATE FUNCTION rum_int2_extract_value(int2, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int2_compare_prefix(int2, int2, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int2_extract_query(int2, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_int2_distance(int2, int2) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_int2_distance, + LEFTARG = int2, + RIGHTARG = int2, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_int2_left_distance(int2, int2) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_int2_left_distance, + LEFTARG = int2, + RIGHTARG = int2, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_int2_right_distance(int2, int2) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_int2_right_distance, + LEFTARG = int2, + RIGHTARG = int2, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_int2_outer_distance(int2, int2, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int2_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_int2_ops +DEFAULT FOR TYPE int2 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (int2,int2) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint2cmp(int2,int2), + FUNCTION 2 rum_int2_extract_value(int2, internal), + FUNCTION 3 rum_int2_extract_query(int2, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_int2_compare_prefix(int2,int2,int2, internal), + -- support to int2 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_int2_config(internal), + FUNCTION 9 rum_int2_outer_distance(int2, int2, smallint), +STORAGE int2; + +/*--------------------int4-----------------------*/ + +CREATE FUNCTION rum_int4_extract_value(int4, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int4_compare_prefix(int4, int4, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int4_extract_query(int4, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_int4_distance(int4, int4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_int4_distance, + LEFTARG = int4, + RIGHTARG = int4, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_int4_left_distance(int4, int4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_int4_left_distance, + LEFTARG = int4, + RIGHTARG = int4, + COMMUTATOR = |=> +); + +CREATE FUNCTION 
rum_int4_right_distance(int4, int4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_int4_right_distance, + LEFTARG = int4, + RIGHTARG = int4, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_int4_outer_distance(int4, int4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int4_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_int4_ops +DEFAULT FOR TYPE int4 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (int4,int4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint4cmp(int4,int4), + FUNCTION 2 rum_int4_extract_value(int4, internal), + FUNCTION 3 rum_int4_extract_query(int4, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_int4_compare_prefix(int4,int4,int2, internal), + -- support to int4 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_int4_config(internal), + FUNCTION 9 rum_int4_outer_distance(int4, int4, smallint), +STORAGE int4; + +/*--------------------int8-----------------------*/ + +CREATE FUNCTION rum_int8_extract_value(int8, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int8_compare_prefix(int8, int8, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int8_extract_query(int8, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_int8_distance(int8, int8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_int8_distance, + LEFTARG = int8, + RIGHTARG = int8, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_int8_left_distance(int8, int8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_int8_left_distance, + LEFTARG = int8, + RIGHTARG = int8, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_int8_right_distance(int8, int8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_int8_right_distance, + LEFTARG = int8, + RIGHTARG = int8, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_int8_outer_distance(int8, int8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int8_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_int8_ops +DEFAULT FOR TYPE int8 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (int8,int8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint8cmp(int8,int8), + FUNCTION 2 rum_int8_extract_value(int8, internal), + FUNCTION 3 rum_int8_extract_query(int8, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_int8_compare_prefix(int8,int8,int2, internal), + -- support to int8 distance in rum_tsvector_addon_ops + FUNCTION 6 
rum_int8_config(internal), + FUNCTION 9 rum_int8_outer_distance(int8, int8, smallint), +STORAGE int8; + +/*--------------------float4-----------------------*/ + +CREATE FUNCTION rum_float4_extract_value(float4, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float4_compare_prefix(float4, float4, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float4_extract_query(float4, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_float4_distance(float4, float4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_float4_distance, + LEFTARG = float4, + RIGHTARG = float4, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_float4_left_distance(float4, float4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_float4_left_distance, + LEFTARG = float4, + RIGHTARG = float4, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_float4_right_distance(float4, float4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_float4_right_distance, + LEFTARG = float4, + RIGHTARG = float4, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_float4_outer_distance(float4, float4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float4_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_float4_ops +DEFAULT FOR TYPE float4 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (float4,float4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btfloat4cmp(float4,float4), + FUNCTION 2 rum_float4_extract_value(float4, internal), + FUNCTION 3 rum_float4_extract_query(float4, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_float4_compare_prefix(float4,float4,int2, internal), + -- support to float4 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_float4_config(internal), + FUNCTION 9 rum_float4_outer_distance(float4, float4, smallint), +STORAGE float4; + +/*--------------------float8-----------------------*/ + +CREATE FUNCTION rum_float8_extract_value(float8, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float8_compare_prefix(float8, float8, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float8_extract_query(float8, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_float8_distance(float8, float8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_float8_distance, + LEFTARG = float8, + RIGHTARG = float8, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_float8_left_distance(float8, float8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_float8_left_distance, + LEFTARG = float8, + RIGHTARG = float8, + COMMUTATOR = |=> +); + +CREATE FUNCTION 
rum_float8_right_distance(float8, float8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_float8_right_distance, + LEFTARG = float8, + RIGHTARG = float8, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_float8_outer_distance(float8, float8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float8_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_float8_ops +DEFAULT FOR TYPE float8 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (float8,float8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btfloat8cmp(float8,float8), + FUNCTION 2 rum_float8_extract_value(float8, internal), + FUNCTION 3 rum_float8_extract_query(float8, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_float8_compare_prefix(float8,float8,int2, internal), + -- support to float8 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_float8_config(internal), + FUNCTION 9 rum_float8_outer_distance(float8, float8, smallint), +STORAGE float8; + +/*--------------------money-----------------------*/ + +CREATE FUNCTION rum_money_extract_value(money, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_money_compare_prefix(money, money, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_money_extract_query(money, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_money_distance(money, money) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_money_distance, + LEFTARG = money, + RIGHTARG = money, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_money_left_distance(money, money) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_money_left_distance, + LEFTARG = money, + RIGHTARG = money, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_money_right_distance(money, money) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_money_right_distance, + LEFTARG = money, + RIGHTARG = money, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_money_outer_distance(money, money, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_money_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_money_ops +DEFAULT FOR TYPE money USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (money,money) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (money,money) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (money,money) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 cash_cmp(money,money), + FUNCTION 2 rum_money_extract_value(money, internal), + FUNCTION 3 rum_money_extract_query(money, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 
rum_money_compare_prefix(money,money,int2, internal), + -- support to money distance in rum_tsvector_addon_ops + FUNCTION 6 rum_money_config(internal), + FUNCTION 9 rum_money_outer_distance(money, money, smallint), +STORAGE money; + +/*--------------------oid-----------------------*/ + +CREATE FUNCTION rum_oid_extract_value(oid, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_oid_compare_prefix(oid, oid, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_oid_extract_query(oid, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_oid_distance(oid, oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_oid_distance, + LEFTARG = oid, + RIGHTARG = oid, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_oid_left_distance(oid, oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_oid_left_distance, + LEFTARG = oid, + RIGHTARG = oid, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_oid_right_distance(oid, oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_oid_right_distance, + LEFTARG = oid, + RIGHTARG = oid, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_oid_outer_distance(oid, oid, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_oid_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_oid_ops +DEFAULT FOR TYPE oid USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (oid,oid) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btoidcmp(oid,oid), + FUNCTION 2 rum_oid_extract_value(oid, internal), + FUNCTION 3 rum_oid_extract_query(oid, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_oid_compare_prefix(oid,oid,int2, internal), + -- support to oid distance in rum_tsvector_addon_ops + FUNCTION 6 rum_oid_config(internal), + FUNCTION 9 rum_oid_outer_distance(oid, oid, smallint), +STORAGE oid; + +/*--------------------time-----------------------*/ + +CREATE FUNCTION rum_time_extract_value(time, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_time_compare_prefix(time, time, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_time_extract_query(time, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_time_ops +DEFAULT FOR TYPE time USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 time_cmp(time,time), + FUNCTION 2 rum_time_extract_value(time, internal), + FUNCTION 3 rum_time_extract_query(time, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_time_compare_prefix(time,time,int2, internal), +STORAGE time; + +/*--------------------timetz-----------------------*/ + +CREATE FUNCTION 
rum_timetz_extract_value(timetz, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timetz_compare_prefix(timetz, timetz, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timetz_extract_query(timetz, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_timetz_ops +DEFAULT FOR TYPE timetz USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 timetz_cmp(timetz,timetz), + FUNCTION 2 rum_timetz_extract_value(timetz, internal), + FUNCTION 3 rum_timetz_extract_query(timetz, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_timetz_compare_prefix(timetz,timetz,int2, internal), +STORAGE timetz; + +/*--------------------date-----------------------*/ + +CREATE FUNCTION rum_date_extract_value(date, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_date_compare_prefix(date, date, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_date_extract_query(date, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_date_ops +DEFAULT FOR TYPE date USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 date_cmp(date,date), + FUNCTION 2 rum_date_extract_value(date, internal), + FUNCTION 3 rum_date_extract_query(date, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_date_compare_prefix(date,date,int2, internal), +STORAGE date; + +/*--------------------interval-----------------------*/ + +CREATE FUNCTION rum_interval_extract_value(interval, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_interval_compare_prefix(interval, interval, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_interval_extract_query(interval, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_interval_ops +DEFAULT FOR TYPE interval USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 interval_cmp(interval,interval), + FUNCTION 2 rum_interval_extract_value(interval, internal), + FUNCTION 3 rum_interval_extract_query(interval, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_interval_compare_prefix(interval,interval,int2, internal), +STORAGE interval; + +/*--------------------macaddr-----------------------*/ + +CREATE FUNCTION rum_macaddr_extract_value(macaddr, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_macaddr_compare_prefix(macaddr, macaddr, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_macaddr_extract_query(macaddr, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_macaddr_ops 
+DEFAULT FOR TYPE macaddr USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 macaddr_cmp(macaddr,macaddr), + FUNCTION 2 rum_macaddr_extract_value(macaddr, internal), + FUNCTION 3 rum_macaddr_extract_query(macaddr, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_macaddr_compare_prefix(macaddr,macaddr,int2, internal), +STORAGE macaddr; + +/*--------------------inet-----------------------*/ + +CREATE FUNCTION rum_inet_extract_value(inet, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_inet_compare_prefix(inet, inet, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_inet_extract_query(inet, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_inet_ops +DEFAULT FOR TYPE inet USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 network_cmp(inet,inet), + FUNCTION 2 rum_inet_extract_value(inet, internal), + FUNCTION 3 rum_inet_extract_query(inet, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_inet_compare_prefix(inet,inet,int2, internal), +STORAGE inet; + +/*--------------------cidr-----------------------*/ + +CREATE FUNCTION rum_cidr_extract_value(cidr, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_cidr_compare_prefix(cidr, cidr, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_cidr_extract_query(cidr, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_cidr_ops +DEFAULT FOR TYPE cidr USING rum +AS + OPERATOR 1 < (inet, inet), + OPERATOR 2 <= (inet, inet), + OPERATOR 3 = (inet, inet), + OPERATOR 4 >= (inet, inet), + OPERATOR 5 > (inet, inet), + FUNCTION 1 network_cmp(inet,inet), + FUNCTION 2 rum_cidr_extract_value(cidr, internal), + FUNCTION 3 rum_cidr_extract_query(cidr, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_cidr_compare_prefix(cidr,cidr,int2, internal), +STORAGE cidr; + +/*--------------------text-----------------------*/ + +CREATE FUNCTION rum_text_extract_value(text, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_text_compare_prefix(text, text, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_text_extract_query(text, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_text_ops +DEFAULT FOR TYPE text USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 bttextcmp(text,text), + FUNCTION 2 rum_text_extract_value(text, internal), + FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), +STORAGE text; + 
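A brief usage sketch may help here (the table and column names are invented for illustration and are not part of this patch): the btree-emulating operator classes above (time, date, inet, text, and the rest) register the ordinary comparison operators 1..5 and route them through rum_btree_consistent, so a plain RUM index can serve range predicates on these types.

-- Illustrative only; assumes a hypothetical table "docs".
CREATE TABLE docs (body text);
-- rum_text_ops is picked implicitly, being the DEFAULT opclass for text:
CREATE INDEX docs_body_idx ON docs USING rum (body);
-- Operators <, <=, =, >=, > are indexable, so a range predicate
-- can be answered from the RUM index:
SELECT count(*) FROM docs WHERE body >= 'alpha' AND body < 'omega';

The same pattern applies to the other scalar opclasses in this section; only the opclass and comparison support function differ per type.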
+/*--------------------varchar-----------------------*/ + + +CREATE OPERATOR CLASS rum_varchar_ops +DEFAULT FOR TYPE varchar USING rum +AS + OPERATOR 1 < (text, text), + OPERATOR 2 <= (text, text), + OPERATOR 3 = (text, text), + OPERATOR 4 >= (text, text), + OPERATOR 5 > (text, text), + FUNCTION 1 bttextcmp(text,text), + FUNCTION 2 rum_text_extract_value(text, internal), + FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), +STORAGE varchar; + +/*--------------------"char"-----------------------*/ + +CREATE FUNCTION rum_char_extract_value("char", internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_char_compare_prefix("char", "char", int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_char_extract_query("char", internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_char_ops +DEFAULT FOR TYPE "char" USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 btcharcmp("char","char"), + FUNCTION 2 rum_char_extract_value("char", internal), + FUNCTION 3 rum_char_extract_query("char", internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_char_compare_prefix("char","char",int2, internal), +STORAGE "char"; + +/*--------------------bytea-----------------------*/ + +CREATE FUNCTION rum_bytea_extract_value(bytea, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bytea_compare_prefix(bytea, bytea, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bytea_extract_query(bytea, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_bytea_ops +DEFAULT FOR TYPE bytea USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 byteacmp(bytea,bytea), + FUNCTION 2 rum_bytea_extract_value(bytea, internal), + FUNCTION 3 rum_bytea_extract_query(bytea, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_bytea_compare_prefix(bytea,bytea,int2, internal), +STORAGE bytea; + +/*--------------------bit-----------------------*/ + +CREATE FUNCTION rum_bit_extract_value(bit, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bit_compare_prefix(bit, bit, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bit_extract_query(bit, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_bit_ops +DEFAULT FOR TYPE bit USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 bitcmp(bit,bit), + FUNCTION 2 rum_bit_extract_value(bit, internal), + FUNCTION 3 rum_bit_extract_query(bit, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + 
FUNCTION 5 rum_bit_compare_prefix(bit,bit,int2, internal), +STORAGE bit; + +/*--------------------varbit-----------------------*/ + +CREATE FUNCTION rum_varbit_extract_value(varbit, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_varbit_compare_prefix(varbit, varbit, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_varbit_extract_query(varbit, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_varbit_ops +DEFAULT FOR TYPE varbit USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 varbitcmp(varbit,varbit), + FUNCTION 2 rum_varbit_extract_value(varbit, internal), + FUNCTION 3 rum_varbit_extract_query(varbit, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_varbit_compare_prefix(varbit,varbit,int2, internal), +STORAGE varbit; + +/*--------------------numeric-----------------------*/ + +CREATE FUNCTION rum_numeric_extract_value(numeric, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_numeric_compare_prefix(numeric, numeric, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_numeric_extract_query(numeric, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_numeric_ops +DEFAULT FOR TYPE numeric USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 rum_numeric_cmp(numeric,numeric), + FUNCTION 2 rum_numeric_extract_value(numeric, internal), + FUNCTION 3 rum_numeric_extract_query(numeric, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_numeric_compare_prefix(numeric,numeric,int2, internal), +STORAGE numeric; + diff --git a/rum--1.0.sql b/rum--1.0.sql deleted file mode 100644 index c8dbfcbd96..0000000000 --- a/rum--1.0.sql +++ /dev/null @@ -1,407 +0,0 @@ -CREATE OR REPLACE FUNCTION rumhandler(internal) -RETURNS index_am_handler -AS 'MODULE_PATHNAME' -LANGUAGE C; - -/* - * RUM access method - */ - -CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler; - -/* - * RUM built-in types, operators and functions - */ - --- Type used in distance calculations with normalization argument -CREATE TYPE rum_distance_query AS (query tsquery, method int); - -CREATE FUNCTION tsquery_to_distance_query(tsquery) -RETURNS rum_distance_query -AS 'MODULE_PATHNAME', 'tsquery_to_distance_query' -LANGUAGE C IMMUTABLE STRICT; - -CREATE CAST (tsquery AS rum_distance_query) - WITH FUNCTION tsquery_to_distance_query(tsquery) AS IMPLICIT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_tt' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery,int) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_ttf' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,rum_distance_query) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_td' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = tsquery, - PROCEDURE = rum_ts_distance -); - -CREATE OPERATOR <=> ( - LEFTARG = 
tsvector, - RIGHTARG = rum_distance_query, - PROCEDURE = rum_ts_distance -); - -CREATE FUNCTION rum_timestamp_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamp_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamp_left_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamp_left_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamp_right_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamp_right_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=| -); - -/* - * rum_tsvector_ops operator class - */ - -CREATE FUNCTION rum_extract_tsvector(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsvector_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - --- To prevent calling from SQL -CREATE FUNCTION rum_ts_join_pos(internal, internal) -RETURNS bytea -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_ops -DEFAULT FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE text; - -/* - * rum_tsvector_hash_ops operator class. - * - * Stores hash of entries as keys in index. 
- */ - -CREATE FUNCTION rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_hash_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE integer; - -/* - * rum_timestamp_ops operator class - */ - --- timestamp operator class - -CREATE FUNCTION rum_timestamp_extract_value(timestamp,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_outer_distance(timestamp, timestamp, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE OPERATOR CLASS rum_timestamp_ops -DEFAULT FOR TYPE timestamp USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamp_cmp(timestamp,timestamp), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamp disttance in rum_tsvector_timestamp_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamp; - -/* - * rum_tsvector_timestamp_ops operator class. - * - * Stores timestamp with tsvector. 
- */ - -CREATE FUNCTION rum_tsquery_timestamp_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamp_ops operator class - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_timestamptz_ops operator class - */ - -CREATE FUNCTION rum_timestamptz_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamptz_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamptz_left_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_left_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamptz_left_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamptz_right_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_right_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamptz_right_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=| -); - -CREATE OPERATOR CLASS rum_timestamptz_ops -DEFAULT FOR TYPE timestamptz USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamptz_cmp(timestamptz,timestamptz), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamptz distance in rum_tsvector_timestamptz_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamptz,timestamptz) 
FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamptz; - -/* - * rum_tsvector_timestamptz_ops operator class. - * - * Stores tsvector with timestamptz. - */ - -CREATE OPERATOR CLASS rum_tsvector_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamptz_ops operator class - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_tsquery_ops operator class. - * - * Used for inversed text search. - */ - -CREATE FUNCTION ruminv_extract_tsquery(tsquery,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsvector_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsquery_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsquery_ops -DEFAULT FOR TYPE tsquery USING rum -AS - OPERATOR 1 @@ (tsquery, tsvector), - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal), - FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 ruminv_tsquery_config(internal), - STORAGE text; diff --git a/rum--1.1--1.2.sql b/rum--1.1--1.2.sql new file mode 100644 index 0000000000..f1ea81bc1f --- /dev/null +++ b/rum--1.1--1.2.sql @@ -0,0 +1,190 @@ +/* + * RUM version 1.2 + */ + +/*--------------------anyarray-----------------------*/ + +CREATE FUNCTION rum_anyarray_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE OPERATOR % ( + PROCEDURE = rum_anyarray_similar, + LEFTARG = anyarray, + RIGHTARG = anyarray, + COMMUTATOR = '%', + RESTRICT = contsel, + JOIN = contjoinsel +); + + +CREATE FUNCTION rum_anyarray_distance(anyarray,anyarray) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE OPERATOR 
<=> ( + PROCEDURE = rum_anyarray_distance, + LEFTARG = anyarray, + RIGHTARG = anyarray, + COMMUTATOR = '<=>' +); + + +CREATE FUNCTION rum_extract_anyarray(anyarray,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_anyarray_consistent(internal, smallint, anyarray, integer, internal, internal, internal, internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +CREATE OPERATOR CLASS rum_anyarray_ops +DEFAULT FOR TYPE anyarray USING rum +AS + OPERATOR 1 && (anyarray, anyarray), + OPERATOR 2 @> (anyarray, anyarray), + OPERATOR 3 <@ (anyarray, anyarray), + OPERATOR 4 = (anyarray, anyarray), + OPERATOR 5 % (anyarray, anyarray), + OPERATOR 20 <=> (anyarray, anyarray) FOR ORDER BY pg_catalog.float_ops, + --dispatch function 1 for concrete type + FUNCTION 2 rum_extract_anyarray(anyarray,internal,internal,internal,internal), + FUNCTION 3 rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_anyarray_consistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), + FUNCTION 6 rum_anyarray_config(internal), + FUNCTION 8 rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal), + STORAGE anyelement; + +CREATE OPERATOR CLASS rum_anyarray_addon_ops +FOR TYPE anyarray USING rum +AS + OPERATOR 1 && (anyarray, anyarray), + OPERATOR 2 @> (anyarray, anyarray), + OPERATOR 3 <@ (anyarray, anyarray), + OPERATOR 4 = (anyarray, anyarray), + --dispatch function 1 for concrete type + FUNCTION 2 ginarrayextract(anyarray,internal,internal), + FUNCTION 3 ginqueryarrayextract(anyarray,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 ginarrayconsistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), + STORAGE anyelement; + +/*--------------------int2-----------------------*/ + +CREATE FUNCTION rum_int2_key_distance(int2, int2, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_int2_ops USING rum ADD + FUNCTION 8 (int2,int2) rum_int2_key_distance(int2, int2, smallint); + +/*--------------------int4-----------------------*/ + +CREATE FUNCTION rum_int4_key_distance(int4, int4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_int4_ops USING rum ADD + FUNCTION 8 (int4,int4) rum_int4_key_distance(int4, int4, smallint); + +/*--------------------int8-----------------------*/ + +CREATE FUNCTION rum_int8_key_distance(int8, int8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_int8_ops USING rum ADD + FUNCTION 8 (int8,int8) rum_int8_key_distance(int8, int8, smallint); + +/*--------------------float4-----------------------*/ + +CREATE FUNCTION rum_float4_key_distance(float4, float4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_float4_ops USING rum ADD + FUNCTION 8 (float4,float4) rum_float4_key_distance(float4, float4, smallint); + 
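To make the 1.2 additions concrete, here is a minimal sketch under assumed table names (only the opclasses, operators, and support functions come from this patch; everything else is illustrative): rum_anyarray_ops indexes array elements and exposes <=> (backed by rum_anyarray_distance) as ordering operator 20, while the key-distance support functions added above let the scalar opclasses produce distance-ordered scans.

-- Array overlap with similarity-ordered output (rum_anyarray_ops):
CREATE TABLE test_array (i int2[]);
CREATE INDEX idx_test_array ON test_array USING rum (i rum_anyarray_ops);
SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' LIMIT 10;

-- Scalar nearest-neighbor ordering (e.g. rum_float4_ops; <=> is
-- registered as ordering operator 20 in that opclass):
CREATE TABLE readings (v float4);
CREATE INDEX idx_readings ON readings USING rum (v);
SELECT v FROM readings ORDER BY v <=> 1.5::float4 LIMIT 10;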
+/*--------------------float8-----------------------*/ + +CREATE FUNCTION rum_float8_key_distance(float8, float8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_float8_ops USING rum ADD + FUNCTION 8 (float8,float8) rum_float8_key_distance(float8, float8, smallint); + +/*--------------------money-----------------------*/ + +CREATE FUNCTION rum_money_key_distance(money, money, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_money_ops USING rum ADD + FUNCTION 8 (money,money) rum_money_key_distance(money, money, smallint); + +/*--------------------oid-----------------------*/ + +CREATE FUNCTION rum_oid_key_distance(oid, oid, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_oid_ops USING rum ADD + FUNCTION 8 (oid,oid) rum_oid_key_distance(oid, oid, smallint); + +/*--------------------timestamp-----------------------*/ + +CREATE FUNCTION rum_timestamp_key_distance(timestamp, timestamp, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_timestamp_ops USING rum ADD + FUNCTION 8 (timestamp,timestamp) rum_timestamp_key_distance(timestamp, timestamp, smallint); + +/*--------------------timestamptz-----------------------*/ + +CREATE FUNCTION rum_timestamptz_key_distance(timestamptz, timestamptz, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_timestamptz_ops USING rum ADD + FUNCTION 8 (timestamptz,timestamptz) rum_timestamptz_key_distance(timestamptz, timestamptz, smallint); + diff --git a/rum--1.2--1.3.sql b/rum--1.2--1.3.sql new file mode 100644 index 0000000000..649b3524db --- /dev/null +++ b/rum--1.2--1.3.sql @@ -0,0 +1,19 @@ +/* + * RUM version 1.3 + */ + +CREATE FUNCTION rum_ts_score(tsvector,tsquery) +RETURNS float4 +AS 'MODULE_PATHNAME', 'rum_ts_score_tt' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_ts_score(tsvector,tsquery,int) +RETURNS float4 +AS 'MODULE_PATHNAME', 'rum_ts_score_ttf' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_ts_score(tsvector,rum_distance_query) +RETURNS float4 +AS 'MODULE_PATHNAME', 'rum_ts_score_td' +LANGUAGE C IMMUTABLE STRICT; + diff --git a/rum.control b/rum.control index b5d28f2586..30a00ccf67 100644 --- a/rum.control +++ b/rum.control @@ -1,5 +1,5 @@ # RUM extension comment = 'RUM index access method' -default_version = '1.0' +default_version = '1.3' module_pathname = '$libdir/rum' relocatable = true diff --git a/rum_init.sql b/rum_init.sql new file mode 100644 index 0000000000..621c4d2b9f --- /dev/null +++ b/rum_init.sql @@ -0,0 +1,1726 @@ +CREATE FUNCTION rumhandler(internal) +RETURNS index_am_handler +AS 'MODULE_PATHNAME' +LANGUAGE C; + +/* + * RUM access method + */ + +CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler; + +/* + * RUM built-in types, operators and functions + */ + +-- Type used in distance calculations with normalization argument +CREATE TYPE rum_distance_query AS (query tsquery, method int); + +CREATE FUNCTION tsquery_to_distance_query(tsquery) +RETURNS rum_distance_query +AS 'MODULE_PATHNAME', 'tsquery_to_distance_query' +LANGUAGE C IMMUTABLE STRICT; + +CREATE CAST (tsquery AS rum_distance_query) + WITH FUNCTION tsquery_to_distance_query(tsquery) AS IMPLICIT; + +CREATE FUNCTION rum_ts_distance(tsvector,tsquery) +RETURNS float4 +AS 'MODULE_PATHNAME', 'rum_ts_distance_tt' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION 
rum_ts_distance(tsvector,tsquery,int) +RETURNS float4 +AS 'MODULE_PATHNAME', 'rum_ts_distance_ttf' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_ts_distance(tsvector,rum_distance_query) +RETURNS float4 +AS 'MODULE_PATHNAME', 'rum_ts_distance_td' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + LEFTARG = tsvector, + RIGHTARG = tsquery, + PROCEDURE = rum_ts_distance +); + +CREATE OPERATOR <=> ( + LEFTARG = tsvector, + RIGHTARG = rum_distance_query, + PROCEDURE = rum_ts_distance +); + +CREATE FUNCTION rum_timestamp_distance(timestamp, timestamp) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_timestamp_distance, + LEFTARG = timestamp, + RIGHTARG = timestamp, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_timestamp_left_distance(timestamp, timestamp) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_timestamp_left_distance, + LEFTARG = timestamp, + RIGHTARG = timestamp, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_timestamp_right_distance(timestamp, timestamp) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_timestamp_right_distance, + LEFTARG = timestamp, + RIGHTARG = timestamp, + COMMUTATOR = <=| +); + +/* + * rum_tsvector_ops operator class + */ + +CREATE FUNCTION rum_extract_tsvector(tsvector,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_tsvector_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_tsquery_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +-- To prevent calling from SQL +CREATE FUNCTION rum_ts_join_pos(internal, internal) +RETURNS bytea +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR CLASS rum_tsvector_ops +DEFAULT FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 6 rum_tsvector_config(internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), + FUNCTION 10 rum_ts_join_pos(internal, internal), + STORAGE text; + +/* + * rum_tsvector_hash_ops operator class. + * + * Stores hash of entries as keys in index. 
+ */ + +CREATE FUNCTION rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR CLASS rum_tsvector_hash_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint4cmp(integer, integer), + FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 6 rum_tsvector_config(internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), + FUNCTION 10 rum_ts_join_pos(internal, internal), + STORAGE integer; + +/* + * rum_timestamp_ops operator class + */ + +-- timestamp operator class + +CREATE FUNCTION rum_timestamp_extract_value(timestamp,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timestamp_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timestamp_outer_distance(timestamp, timestamp, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE OPERATOR CLASS rum_timestamp_ops +DEFAULT FOR TYPE timestamp USING rum +AS + OPERATOR 1 <, + OPERATOR 2 <=, + OPERATOR 3 =, + OPERATOR 4 >=, + OPERATOR 5 >, + --support + FUNCTION 1 timestamp_cmp(timestamp,timestamp), + FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), + FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), + FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), + FUNCTION 6 rum_timestamp_config(internal), + -- support to timestamp distance in rum_tsvector_timestamp_ops + FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), + OPERATOR 20 <=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, +STORAGE timestamp; + +/* + * rum_tsvector_timestamp_ops operator class. + * + * Stores timestamp with tsvector. 
+ */ + +CREATE FUNCTION rum_tsquery_timestamp_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +/* + * !!!deprecated, use rum_tsvector_addon_ops!!! + */ +CREATE OPERATOR CLASS rum_tsvector_timestamp_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE text; + +/* + * rum_tsvector_hash_timestamp_ops operator class + * !!!deprecated, use rum_tsvector_hash_addon_ops!!! + */ + +CREATE OPERATOR CLASS rum_tsvector_hash_timestamp_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 btint4cmp(integer, integer), + FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE integer; + +/* + * rum_timestamptz_ops operator class + */ + +CREATE FUNCTION rum_timestamptz_distance(timestamptz, timestamptz) +RETURNS float8 +AS 'MODULE_PATHNAME', 'rum_timestamp_distance' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_timestamptz_distance, + LEFTARG = timestamptz, + RIGHTARG = timestamptz, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_timestamptz_left_distance(timestamptz, timestamptz) +RETURNS float8 +AS 'MODULE_PATHNAME', 'rum_timestamp_left_distance' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_timestamptz_left_distance, + LEFTARG = timestamptz, + RIGHTARG = timestamptz, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_timestamptz_right_distance(timestamptz, timestamptz) +RETURNS float8 +AS 'MODULE_PATHNAME', 'rum_timestamp_right_distance' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_timestamptz_right_distance, + LEFTARG = timestamptz, + RIGHTARG = timestamptz, + COMMUTATOR = <=| +); + +CREATE OPERATOR CLASS rum_timestamptz_ops +DEFAULT FOR TYPE timestamptz USING rum +AS + OPERATOR 1 <, + OPERATOR 2 <=, + OPERATOR 3 =, + OPERATOR 4 >=, + OPERATOR 5 >, + --support + FUNCTION 1 timestamptz_cmp(timestamptz,timestamptz), + FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), + FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), + FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), + FUNCTION 6 rum_timestamp_config(internal), + -- support to timestamptz distance in rum_tsvector_timestamptz_ops + FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), + OPERATOR 20 <=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 
21 <=| (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, +STORAGE timestamptz; + +/* + * rum_tsvector_timestamptz_ops operator class. + * + * Stores tsvector with timestamptz. + */ + +CREATE OPERATOR CLASS rum_tsvector_timestamptz_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE text; + +/* + * rum_tsvector_hash_timestamptz_ops operator class + */ + +CREATE OPERATOR CLASS rum_tsvector_hash_timestamptz_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 btint4cmp(integer, integer), + FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE integer; + +/* + * rum_tsquery_ops operator class. + * + * Used for inversed text search. + */ + +CREATE FUNCTION ruminv_extract_tsquery(tsquery,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION ruminv_tsvector_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION ruminv_tsquery_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR CLASS rum_tsquery_ops +DEFAULT FOR TYPE tsquery USING rum +AS + OPERATOR 1 @@ (tsquery, tsvector), + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal), + FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 6 ruminv_tsquery_config(internal), + STORAGE text; +/* + * RUM version 1.1 + */ + +CREATE FUNCTION rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +ALTER FUNCTION + rum_tsquery_timestamp_consistent (internal,smallint,tsvector,int,internal,internal,internal,internal) + RENAME TO rum_tsquery_addon_consistent; + +CREATE FUNCTION rum_numeric_cmp(numeric, numeric) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE OPERATOR CLASS rum_tsvector_addon_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 
rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE text; + +CREATE OPERATOR CLASS rum_tsvector_hash_addon_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + --support function + FUNCTION 1 btint4cmp(integer, integer), + FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + STORAGE integer; + +/*--------------------int2-----------------------*/ + +CREATE FUNCTION rum_int2_extract_value(int2, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int2_compare_prefix(int2, int2, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int2_extract_query(int2, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_int2_distance(int2, int2) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_int2_distance, + LEFTARG = int2, + RIGHTARG = int2, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_int2_left_distance(int2, int2) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_int2_left_distance, + LEFTARG = int2, + RIGHTARG = int2, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_int2_right_distance(int2, int2) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_int2_right_distance, + LEFTARG = int2, + RIGHTARG = int2, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_int2_outer_distance(int2, int2, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int2_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_int2_ops +DEFAULT FOR TYPE int2 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (int2,int2) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint2cmp(int2,int2), + FUNCTION 2 rum_int2_extract_value(int2, internal), + FUNCTION 3 rum_int2_extract_query(int2, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_int2_compare_prefix(int2,int2,int2, internal), + -- support to int2 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_int2_config(internal), + FUNCTION 9 rum_int2_outer_distance(int2, int2, smallint), +STORAGE int2; + +/*--------------------int4-----------------------*/ + +CREATE FUNCTION rum_int4_extract_value(int4, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C 
STRICT IMMUTABLE; + +CREATE FUNCTION rum_int4_compare_prefix(int4, int4, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int4_extract_query(int4, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_int4_distance(int4, int4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_int4_distance, + LEFTARG = int4, + RIGHTARG = int4, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_int4_left_distance(int4, int4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_int4_left_distance, + LEFTARG = int4, + RIGHTARG = int4, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_int4_right_distance(int4, int4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_int4_right_distance, + LEFTARG = int4, + RIGHTARG = int4, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_int4_outer_distance(int4, int4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int4_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_int4_ops +DEFAULT FOR TYPE int4 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (int4,int4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint4cmp(int4,int4), + FUNCTION 2 rum_int4_extract_value(int4, internal), + FUNCTION 3 rum_int4_extract_query(int4, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_int4_compare_prefix(int4,int4,int2, internal), + -- support to int4 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_int4_config(internal), + FUNCTION 9 rum_int4_outer_distance(int4, int4, smallint), +STORAGE int4; + +/*--------------------int8-----------------------*/ + +CREATE FUNCTION rum_int8_extract_value(int8, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int8_compare_prefix(int8, int8, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int8_extract_query(int8, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_int8_distance(int8, int8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_int8_distance, + LEFTARG = int8, + RIGHTARG = int8, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_int8_left_distance(int8, int8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_int8_left_distance, + LEFTARG = int8, + RIGHTARG = int8, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_int8_right_distance(int8, int8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_int8_right_distance, + LEFTARG = int8, + RIGHTARG = int8, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_int8_outer_distance(int8, int8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_int8_config(internal) +RETURNS void +AS 
'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_int8_ops +DEFAULT FOR TYPE int8 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (int8,int8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btint8cmp(int8,int8), + FUNCTION 2 rum_int8_extract_value(int8, internal), + FUNCTION 3 rum_int8_extract_query(int8, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_int8_compare_prefix(int8,int8,int2, internal), + -- support to int8 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_int8_config(internal), + FUNCTION 9 rum_int8_outer_distance(int8, int8, smallint), +STORAGE int8; + +/*--------------------float4-----------------------*/ + +CREATE FUNCTION rum_float4_extract_value(float4, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float4_compare_prefix(float4, float4, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float4_extract_query(float4, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_float4_distance(float4, float4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_float4_distance, + LEFTARG = float4, + RIGHTARG = float4, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_float4_left_distance(float4, float4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_float4_left_distance, + LEFTARG = float4, + RIGHTARG = float4, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_float4_right_distance(float4, float4) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_float4_right_distance, + LEFTARG = float4, + RIGHTARG = float4, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_float4_outer_distance(float4, float4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float4_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_float4_ops +DEFAULT FOR TYPE float4 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (float4,float4) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btfloat4cmp(float4,float4), + FUNCTION 2 rum_float4_extract_value(float4, internal), + FUNCTION 3 rum_float4_extract_query(float4, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_float4_compare_prefix(float4,float4,int2, internal), + -- support to float4 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_float4_config(internal), + FUNCTION 9 rum_float4_outer_distance(float4, float4, smallint), +STORAGE float4; + +/*--------------------float8-----------------------*/ + +CREATE FUNCTION rum_float8_extract_value(float8, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION 
rum_float8_compare_prefix(float8, float8, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float8_extract_query(float8, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_float8_distance(float8, float8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_float8_distance, + LEFTARG = float8, + RIGHTARG = float8, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_float8_left_distance(float8, float8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_float8_left_distance, + LEFTARG = float8, + RIGHTARG = float8, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_float8_right_distance(float8, float8) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_float8_right_distance, + LEFTARG = float8, + RIGHTARG = float8, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_float8_outer_distance(float8, float8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_float8_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_float8_ops +DEFAULT FOR TYPE float8 USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (float8,float8) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btfloat8cmp(float8,float8), + FUNCTION 2 rum_float8_extract_value(float8, internal), + FUNCTION 3 rum_float8_extract_query(float8, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_float8_compare_prefix(float8,float8,int2, internal), + -- support to float8 distance in rum_tsvector_addon_ops + FUNCTION 6 rum_float8_config(internal), + FUNCTION 9 rum_float8_outer_distance(float8, float8, smallint), +STORAGE float8; + +/*--------------------money-----------------------*/ + +CREATE FUNCTION rum_money_extract_value(money, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_money_compare_prefix(money, money, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_money_extract_query(money, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_money_distance(money, money) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_money_distance, + LEFTARG = money, + RIGHTARG = money, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_money_left_distance(money, money) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_money_left_distance, + LEFTARG = money, + RIGHTARG = money, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_money_right_distance(money, money) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_money_right_distance, + LEFTARG = money, + RIGHTARG = money, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_money_outer_distance(money, money, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' 
+LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_money_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_money_ops +DEFAULT FOR TYPE money USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (money,money) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (money,money) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (money,money) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 cash_cmp(money,money), + FUNCTION 2 rum_money_extract_value(money, internal), + FUNCTION 3 rum_money_extract_query(money, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_money_compare_prefix(money,money,int2, internal), + -- support to money distance in rum_tsvector_addon_ops + FUNCTION 6 rum_money_config(internal), + FUNCTION 9 rum_money_outer_distance(money, money, smallint), +STORAGE money; + +/*--------------------oid-----------------------*/ + +CREATE FUNCTION rum_oid_extract_value(oid, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_oid_compare_prefix(oid, oid, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_oid_extract_query(oid, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + + +CREATE FUNCTION rum_oid_distance(oid, oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_oid_distance, + LEFTARG = oid, + RIGHTARG = oid, + COMMUTATOR = <=> +); + +CREATE FUNCTION rum_oid_left_distance(oid, oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR <=| ( + PROCEDURE = rum_oid_left_distance, + LEFTARG = oid, + RIGHTARG = oid, + COMMUTATOR = |=> +); + +CREATE FUNCTION rum_oid_right_distance(oid, oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR |=> ( + PROCEDURE = rum_oid_right_distance, + LEFTARG = oid, + RIGHTARG = oid, + COMMUTATOR = <=| +); + +CREATE FUNCTION rum_oid_outer_distance(oid, oid, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_oid_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + + +CREATE OPERATOR CLASS rum_oid_ops +DEFAULT FOR TYPE oid USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + OPERATOR 20 <=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 21 <=| (oid,oid) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 22 |=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 btoidcmp(oid,oid), + FUNCTION 2 rum_oid_extract_value(oid, internal), + FUNCTION 3 rum_oid_extract_query(oid, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_oid_compare_prefix(oid,oid,int2, internal), + -- support to oid distance in rum_tsvector_addon_ops + FUNCTION 6 rum_oid_config(internal), + FUNCTION 9 rum_oid_outer_distance(oid, oid, smallint), +STORAGE oid; + +/*--------------------time-----------------------*/ + +CREATE FUNCTION rum_time_extract_value(time, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_time_compare_prefix(time, time, int2, internal) +RETURNS 
int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_time_extract_query(time, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_time_ops +DEFAULT FOR TYPE time USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 time_cmp(time,time), + FUNCTION 2 rum_time_extract_value(time, internal), + FUNCTION 3 rum_time_extract_query(time, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_time_compare_prefix(time,time,int2, internal), +STORAGE time; + +/*--------------------timetz-----------------------*/ + +CREATE FUNCTION rum_timetz_extract_value(timetz, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timetz_compare_prefix(timetz, timetz, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_timetz_extract_query(timetz, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_timetz_ops +DEFAULT FOR TYPE timetz USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 timetz_cmp(timetz,timetz), + FUNCTION 2 rum_timetz_extract_value(timetz, internal), + FUNCTION 3 rum_timetz_extract_query(timetz, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_timetz_compare_prefix(timetz,timetz,int2, internal), +STORAGE timetz; + +/*--------------------date-----------------------*/ + +CREATE FUNCTION rum_date_extract_value(date, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_date_compare_prefix(date, date, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_date_extract_query(date, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_date_ops +DEFAULT FOR TYPE date USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 date_cmp(date,date), + FUNCTION 2 rum_date_extract_value(date, internal), + FUNCTION 3 rum_date_extract_query(date, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_date_compare_prefix(date,date,int2, internal), +STORAGE date; + +/*--------------------interval-----------------------*/ + +CREATE FUNCTION rum_interval_extract_value(interval, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_interval_compare_prefix(interval, interval, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_interval_extract_query(interval, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_interval_ops +DEFAULT FOR TYPE interval USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 interval_cmp(interval,interval), + FUNCTION 2 rum_interval_extract_value(interval, internal), + FUNCTION 3 
rum_interval_extract_query(interval, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_interval_compare_prefix(interval,interval,int2, internal), +STORAGE interval; + +/*--------------------macaddr-----------------------*/ + +CREATE FUNCTION rum_macaddr_extract_value(macaddr, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_macaddr_compare_prefix(macaddr, macaddr, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_macaddr_extract_query(macaddr, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_macaddr_ops +DEFAULT FOR TYPE macaddr USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 macaddr_cmp(macaddr,macaddr), + FUNCTION 2 rum_macaddr_extract_value(macaddr, internal), + FUNCTION 3 rum_macaddr_extract_query(macaddr, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_macaddr_compare_prefix(macaddr,macaddr,int2, internal), +STORAGE macaddr; + +/*--------------------inet-----------------------*/ + +CREATE FUNCTION rum_inet_extract_value(inet, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_inet_compare_prefix(inet, inet, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_inet_extract_query(inet, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_inet_ops +DEFAULT FOR TYPE inet USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 network_cmp(inet,inet), + FUNCTION 2 rum_inet_extract_value(inet, internal), + FUNCTION 3 rum_inet_extract_query(inet, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_inet_compare_prefix(inet,inet,int2, internal), +STORAGE inet; + +/*--------------------cidr-----------------------*/ + +CREATE FUNCTION rum_cidr_extract_value(cidr, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_cidr_compare_prefix(cidr, cidr, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_cidr_extract_query(cidr, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_cidr_ops +DEFAULT FOR TYPE cidr USING rum +AS + OPERATOR 1 < (inet, inet), + OPERATOR 2 <= (inet, inet), + OPERATOR 3 = (inet, inet), + OPERATOR 4 >= (inet, inet), + OPERATOR 5 > (inet, inet), + FUNCTION 1 network_cmp(inet,inet), + FUNCTION 2 rum_cidr_extract_value(cidr, internal), + FUNCTION 3 rum_cidr_extract_query(cidr, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_cidr_compare_prefix(cidr,cidr,int2, internal), +STORAGE cidr; + +/*--------------------text-----------------------*/ + +CREATE FUNCTION rum_text_extract_value(text, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE 
FUNCTION rum_text_compare_prefix(text, text, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_text_extract_query(text, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_text_ops +DEFAULT FOR TYPE text USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 bttextcmp(text,text), + FUNCTION 2 rum_text_extract_value(text, internal), + FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), +STORAGE text; + +/*--------------------varchar-----------------------*/ + + +CREATE OPERATOR CLASS rum_varchar_ops +DEFAULT FOR TYPE varchar USING rum +AS + OPERATOR 1 < (text, text), + OPERATOR 2 <= (text, text), + OPERATOR 3 = (text, text), + OPERATOR 4 >= (text, text), + OPERATOR 5 > (text, text), + FUNCTION 1 bttextcmp(text,text), + FUNCTION 2 rum_text_extract_value(text, internal), + FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), +STORAGE varchar; + +/*--------------------"char"-----------------------*/ + +CREATE FUNCTION rum_char_extract_value("char", internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_char_compare_prefix("char", "char", int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_char_extract_query("char", internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_char_ops +DEFAULT FOR TYPE "char" USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 btcharcmp("char","char"), + FUNCTION 2 rum_char_extract_value("char", internal), + FUNCTION 3 rum_char_extract_query("char", internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_char_compare_prefix("char","char",int2, internal), +STORAGE "char"; + +/*--------------------bytea-----------------------*/ + +CREATE FUNCTION rum_bytea_extract_value(bytea, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bytea_compare_prefix(bytea, bytea, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bytea_extract_query(bytea, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_bytea_ops +DEFAULT FOR TYPE bytea USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 byteacmp(bytea,bytea), + FUNCTION 2 rum_bytea_extract_value(bytea, internal), + FUNCTION 3 rum_bytea_extract_query(bytea, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_bytea_compare_prefix(bytea,bytea,int2, internal), +STORAGE bytea; + +/*--------------------bit-----------------------*/ + +CREATE FUNCTION rum_bit_extract_value(bit, 
internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bit_compare_prefix(bit, bit, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_bit_extract_query(bit, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_bit_ops +DEFAULT FOR TYPE bit USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 bitcmp(bit,bit), + FUNCTION 2 rum_bit_extract_value(bit, internal), + FUNCTION 3 rum_bit_extract_query(bit, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_bit_compare_prefix(bit,bit,int2, internal), +STORAGE bit; + +/*--------------------varbit-----------------------*/ + +CREATE FUNCTION rum_varbit_extract_value(varbit, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_varbit_compare_prefix(varbit, varbit, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_varbit_extract_query(varbit, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_varbit_ops +DEFAULT FOR TYPE varbit USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 varbitcmp(varbit,varbit), + FUNCTION 2 rum_varbit_extract_value(varbit, internal), + FUNCTION 3 rum_varbit_extract_query(varbit, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_varbit_compare_prefix(varbit,varbit,int2, internal), +STORAGE varbit; + +/*--------------------numeric-----------------------*/ + +CREATE FUNCTION rum_numeric_extract_value(numeric, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_numeric_compare_prefix(numeric, numeric, int2, internal) +RETURNS int4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE FUNCTION rum_numeric_extract_query(numeric, internal, int2, internal, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + + +CREATE OPERATOR CLASS rum_numeric_ops +DEFAULT FOR TYPE numeric USING rum +AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 rum_numeric_cmp(numeric,numeric), + FUNCTION 2 rum_numeric_extract_value(numeric, internal), + FUNCTION 3 rum_numeric_extract_query(numeric, internal, int2, internal, internal), + FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), + FUNCTION 5 rum_numeric_compare_prefix(numeric,numeric,int2, internal), +STORAGE numeric; + +/* + * RUM version 1.2 + */ + +/*--------------------anyarray-----------------------*/ + +CREATE FUNCTION rum_anyarray_config(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE OPERATOR % ( + PROCEDURE = rum_anyarray_similar, + LEFTARG = anyarray, + RIGHTARG = anyarray, + COMMUTATOR = '%', + RESTRICT = contsel, + JOIN = contjoinsel +); + + +CREATE FUNCTION rum_anyarray_distance(anyarray,anyarray) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE 
C STRICT STABLE; + +CREATE OPERATOR <=> ( + PROCEDURE = rum_anyarray_distance, + LEFTARG = anyarray, + RIGHTARG = anyarray, + COMMUTATOR = '<=>' +); + + +CREATE FUNCTION rum_extract_anyarray(anyarray,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_anyarray_consistent(internal, smallint, anyarray, integer, internal, internal, internal, internal) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +CREATE OPERATOR CLASS rum_anyarray_ops +DEFAULT FOR TYPE anyarray USING rum +AS + OPERATOR 1 && (anyarray, anyarray), + OPERATOR 2 @> (anyarray, anyarray), + OPERATOR 3 <@ (anyarray, anyarray), + OPERATOR 4 = (anyarray, anyarray), + OPERATOR 5 % (anyarray, anyarray), + OPERATOR 20 <=> (anyarray, anyarray) FOR ORDER BY pg_catalog.float_ops, + --dispatch function 1 for concrete type + FUNCTION 2 rum_extract_anyarray(anyarray,internal,internal,internal,internal), + FUNCTION 3 rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_anyarray_consistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), + FUNCTION 6 rum_anyarray_config(internal), + FUNCTION 8 rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal), + STORAGE anyelement; + +CREATE OPERATOR CLASS rum_anyarray_addon_ops +FOR TYPE anyarray USING rum +AS + OPERATOR 1 && (anyarray, anyarray), + OPERATOR 2 @> (anyarray, anyarray), + OPERATOR 3 <@ (anyarray, anyarray), + OPERATOR 4 = (anyarray, anyarray), + --dispatch function 1 for concrete type + FUNCTION 2 ginarrayextract(anyarray,internal,internal), + FUNCTION 3 ginqueryarrayextract(anyarray,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 ginarrayconsistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), + STORAGE anyelement; + +/*--------------------int2-----------------------*/ + +CREATE FUNCTION rum_int2_key_distance(int2, int2, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_int2_ops USING rum ADD + FUNCTION 8 (int2,int2) rum_int2_key_distance(int2, int2, smallint); + +/*--------------------int4-----------------------*/ + +CREATE FUNCTION rum_int4_key_distance(int4, int4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_int4_ops USING rum ADD + FUNCTION 8 (int4,int4) rum_int4_key_distance(int4, int4, smallint); + +/*--------------------int8-----------------------*/ + +CREATE FUNCTION rum_int8_key_distance(int8, int8, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_int8_ops USING rum ADD + FUNCTION 8 (int8,int8) rum_int8_key_distance(int8, int8, smallint); + +/*--------------------float4-----------------------*/ + +CREATE FUNCTION rum_float4_key_distance(float4, float4, smallint) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + + +ALTER OPERATOR FAMILY rum_float4_ops USING rum ADD + FUNCTION 8 (float4,float4) rum_float4_key_distance(float4, float4, smallint); + 
+/*--------------------float8-----------------------*/
+
+CREATE FUNCTION rum_float8_key_distance(float8, float8, smallint)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+
+ALTER OPERATOR FAMILY rum_float8_ops USING rum ADD
+ FUNCTION 8 (float8,float8) rum_float8_key_distance(float8, float8, smallint);
+
+/*--------------------money-----------------------*/
+
+CREATE FUNCTION rum_money_key_distance(money, money, smallint)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+
+ALTER OPERATOR FAMILY rum_money_ops USING rum ADD
+ FUNCTION 8 (money,money) rum_money_key_distance(money, money, smallint);
+
+/*--------------------oid-----------------------*/
+
+CREATE FUNCTION rum_oid_key_distance(oid, oid, smallint)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+
+ALTER OPERATOR FAMILY rum_oid_ops USING rum ADD
+ FUNCTION 8 (oid,oid) rum_oid_key_distance(oid, oid, smallint);
+
+/*--------------------timestamp-----------------------*/
+
+CREATE FUNCTION rum_timestamp_key_distance(timestamp, timestamp, smallint)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+
+ALTER OPERATOR FAMILY rum_timestamp_ops USING rum ADD
+ FUNCTION 8 (timestamp,timestamp) rum_timestamp_key_distance(timestamp, timestamp, smallint);
+
+/*--------------------timestamptz-----------------------*/
+
+CREATE FUNCTION rum_timestamptz_key_distance(timestamptz, timestamptz, smallint)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+
+ALTER OPERATOR FAMILY rum_timestamptz_ops USING rum ADD
+ FUNCTION 8 (timestamptz,timestamptz) rum_timestamptz_key_distance(timestamptz, timestamptz, smallint);
+
+/*
+ * RUM version 1.3
+ */
+
+CREATE FUNCTION rum_ts_score(tsvector,tsquery)
+RETURNS float4
+AS 'MODULE_PATHNAME', 'rum_ts_score_tt'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION rum_ts_score(tsvector,tsquery,int)
+RETURNS float4
+AS 'MODULE_PATHNAME', 'rum_ts_score_ttf'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION rum_ts_score(tsvector,rum_distance_query)
+RETURNS float4
+AS 'MODULE_PATHNAME', 'rum_ts_score_td'
+LANGUAGE C IMMUTABLE STRICT;
+
diff --git a/specs/predicate-rum-2.spec b/specs/predicate-rum-2.spec
new file mode 100644
index 0000000000..c88383caee
--- /dev/null
+++ b/specs/predicate-rum-2.spec
@@ -0,0 +1,63 @@
+# Test for page level predicate locking in rum
+#
+# Test to check reduced false positives
+#
+# Queries are written in such a way that an index scan (from one transaction) and an index insert (from another transaction) will try to access different parts (sub-trees) of the index.
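+#
+# For illustration only (this note is not part of the original test design):
+# because the scan of one session and the insert of the other land in
+# disjoint sub-trees, an interleaving such as the commented permutation below
+# is expected to let both transactions commit without a serialization failure:
+#
+#   permutation "rxy1" "wx1" "rxy2" "c1" "wy2" "c2"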
+
+setup
+{
+ CREATE TABLE rum_tbl (id serial, tsv tsvector);
+
+ CREATE TABLE text_table (id1 serial, t text[]);
+
+ INSERT INTO text_table(t) SELECT array[chr(i) || chr(j)] FROM generate_series(65,90) i,
+ generate_series(65,90) j;
+
+ -- We need pseudorandom values to populate the test table.
+ -- Here we use a linear congruential generator, because the random()
+ -- function may produce different output on different systems.
+ DO $$
+ DECLARE
+ c integer := 17;
+ a integer := 261;
+ m integer := 6760;
+ Xi integer := 228;
+ BEGIN
+ FOR i in 1..338 LOOP
+ INSERT INTO rum_tbl(tsv) VALUES ('');
+ FOR j in 1..10 LOOP
+ UPDATE rum_tbl SET tsv = tsv || (SELECT to_tsvector('simple', t[1]) FROM text_table WHERE id1 = Xi % 676 + 1) WHERE id = i;
+ Xi = (a * Xi + c) % m;
+ END LOOP;
+ END LOOP;
+ END;
+ $$;
+
+ CREATE INDEX rum_tbl_idx ON rum_tbl USING rum (tsv rum_tsvector_ops);
+}
+
+teardown
+{
+ DROP TABLE text_table;
+ DROP TABLE rum_tbl;
+}
+
+session "s1"
+setup {
+ BEGIN ISOLATION LEVEL SERIALIZABLE;
+ set enable_seqscan=off;
+ }
+step "rxy1" { SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; }
+step "wx1" { INSERT INTO rum_tbl(tsv) values('ab'); }
+step "c1" { COMMIT; }
+
+session "s2"
+setup {
+ BEGIN ISOLATION LEVEL SERIALIZABLE;
+ set enable_seqscan=off;
+ }
+
+step "rxy2" { SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; }
+step "wy2" { INSERT INTO rum_tbl(tsv) values('xz'); }
+step "c2" { COMMIT; }
+
diff --git a/specs/predicate-rum.spec b/specs/predicate-rum.spec
new file mode 100644
index 0000000000..4d324b9ef2
--- /dev/null
+++ b/specs/predicate-rum.spec
@@ -0,0 +1,62 @@
+# Test for page level predicate locking in rum
+#
+# Test to verify serialization failures
+#
+# Queries are written in such a way that an index scan (from one transaction) and an index insert (from another transaction) will try to access the same part (sub-tree) of the index.
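+#
+# For illustration only (this note is not part of the original test design):
+# here each session scans a part of the index that the other session inserts
+# into, so in an interleaving such as the commented permutation below the
+# serializable checks are expected to abort one of the two transactions with
+# a serialization failure (SQLSTATE 40001):
+#
+#   permutation "rxy1" "wx1" "rxy2" "c1" "wy2" "c2"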
+
+setup
+{
+ CREATE TABLE rum_tbl (id serial, tsv tsvector);
+
+ CREATE TABLE text_table (id1 serial, t text[]);
+
+ INSERT INTO text_table(t) SELECT array[chr(i) || chr(j)] FROM generate_series(65,90) i,
+ generate_series(65,90) j;
+
+ -- We need pseudorandom values to populate the test table.
+ -- Here we use a linear congruential generator, because the random()
+ -- function may produce different output on different systems.
+ DO $$
+ DECLARE
+ c integer := 17;
+ a integer := 261;
+ m integer := 6760;
+ Xi integer := 228;
+ BEGIN
+ FOR i in 1..338 LOOP
+ INSERT INTO rum_tbl(tsv) VALUES ('');
+ FOR j in 1..10 LOOP
+ UPDATE rum_tbl SET tsv = tsv || (SELECT to_tsvector('simple', t[1]) FROM text_table WHERE id1 = Xi % 676 + 1) WHERE id = i;
+ Xi = (a * Xi + c) % m;
+ END LOOP;
+ END LOOP;
+ END;
+ $$;
+
+ CREATE INDEX rum_tbl_idx ON rum_tbl USING rum (tsv rum_tsvector_ops);
+}
+
+teardown
+{
+ DROP TABLE text_table;
+ DROP TABLE rum_tbl;
+}
+
+session "s1"
+setup {
+ BEGIN ISOLATION LEVEL SERIALIZABLE;
+ set enable_seqscan=off;
+ }
+step "rxy1" { SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; }
+step "wx1" { INSERT INTO rum_tbl(tsv) values('qh'); }
+step "c1" { COMMIT; }
+
+session "s2"
+setup {
+ BEGIN ISOLATION LEVEL SERIALIZABLE;
+ set enable_seqscan=off;
+ }
+
+step "rxy2" { SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; }
+step "wy2" { INSERT INTO rum_tbl(tsv) values('hx'); }
+step "c2" { COMMIT; }
diff --git a/sql/altorder.sql b/sql/altorder.sql
index 12c23fb442..850e252325 100644
--- a/sql/altorder.sql
+++ b/sql/altorder.sql
@@ -1,13 +1,28 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * altorder.out - test output for 64-bit systems and
+ * altorder_1.out - test output for 32-bit systems.
+ *
+ */
+
+
 CREATE TABLE atsts (id int, t tsvector, d timestamp);
+\copy atsts from 'data/tsts.data'
+-- PGPRO-2537: We need more data to test rumsort.c with logtape.c
+\copy atsts from 'data/tsts.data'
+\copy atsts from 'data/tsts.data'
 \copy atsts from 'data/tsts.data'
-CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_timestamp_ops, d)
+CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_addon_ops, d)
 WITH (attach = 'd', to = 't', order_by_attach='t');
-INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724');
-INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724');
+INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724');
+INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724');
 
 SELECT count(*) FROM atsts WHERE t @@ 'wr|qh';
@@ -30,9 +45,8 @@ SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25';
 SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d;
 SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d;
 
-RESET enable_indexscan;
-RESET enable_indexonlyscan;
-RESET enable_bitmapscan;
+-- Test bitmap index scan
+SET enable_bitmapscan=on;
 SET enable_seqscan = off;
 
 EXPLAIN (costs off)
@@ -52,6 +66,11 @@ EXPLAIN (costs off)
 SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25';
 SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25';
 
+-- Test index scan
+SET enable_indexscan=on;
+SET enable_indexonlyscan=on;
+SET enable_bitmapscan=off;
+
 EXPLAIN (costs off)
 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
@@ -73,4 +92,6 @@ EXPLAIN (costs off)
 SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d;
 SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d;
 
-DROP TABLE atsts CASCADE;
+EXPLAIN (costs off)
+SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d;
+SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d;
diff --git a/sql/altorder_hash.sql b/sql/altorder_hash.sql
index cfe0740fa7..148407c661 100644
--- a/sql/altorder_hash.sql
+++ b/sql/altorder_hash.sql
@@ -1,76 +1,90 @@
-CREATE TABLE atsts (id int, t tsvector, d timestamp);
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * altorder_hash.out - test output for 64-bit systems and
+ * altorder_hash_1.out - test output for 32-bit systems.
+ * + */ -\copy atsts from 'data/tsts.data' -CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_hash_timestamp_ops, d) +CREATE TABLE atstsh (id int, t tsvector, d timestamp); + +\copy atstsh from 'data/tsts.data' + +CREATE INDEX atstsh_idx ON atstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; -SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; -SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; -SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr&qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq&yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq|yt'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq|yt)&(wr|qh)'; SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; -SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; -SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; -SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'wr&qh'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq&yt'; +SELECT count(*) FROM atstsh WHERE t @@ 'eq|yt'; +SELECT count(*) FROM atstsh WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM atstsh WHERE t @@ 
'(eq|yt)&(wr|qh)'; EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; -SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; EXPLAIN (costs off) -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; -SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; +SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; + +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; EXPLAIN (costs off) -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -DROP TABLE atsts CASCADE; diff --git a/sql/array.sql b/sql/array.sql new file mode 100644 index 0000000000..9eba800bcf --- /dev/null +++ b/sql/array.sql @@ -0,0 +1,260 @@ +/* + * ------------------------------------ + * NOTE: This test 
behaves differently
+ * ------------------------------------
+ *
+ * array.out - test output for 64-bit systems and
+ * array_1.out - test output for 32-bit systems.
+ *
+ */
+
+
+set enable_seqscan=off;
+set enable_sort=off;
+
+/*
+ * Complete checks for int2[].
+ */
+
+CREATE TABLE test_array (
+ i int2[]
+);
+INSERT INTO test_array VALUES ('{}'), ('{0}'), ('{1,2,3,4}'), ('{1,2,3}'), ('{1,2}'), ('{1}');
+
+CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops);
+
+
+SELECT NULL::int[] = '{1}';
+SELECT NULL::int[] && '{1}';
+SELECT NULL::int[] @> '{1}';
+SELECT NULL::int[] <@ '{1}';
+SELECT NULL::int[] % '{1}';
+SELECT NULL::int[] <=> '{1}';
+
+INSERT INTO test_array VALUES (NULL);
+SELECT * FROM test_array WHERE i = '{1}';
+DELETE FROM test_array WHERE i IS NULL;
+
+SELECT * FROM test_array WHERE i = '{NULL}';
+SELECT * FROM test_array WHERE i = '{1,2,3,NULL}';
+SELECT * FROM test_array WHERE i = '{{1,2},{3,4}}';
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}';
+SELECT * FROM test_array WHERE i = '{}';
+SELECT * FROM test_array WHERE i = '{0}';
+SELECT * FROM test_array WHERE i = '{1}';
+SELECT * FROM test_array WHERE i = '{1,2}';
+SELECT * FROM test_array WHERE i = '{2,1}';
+SELECT * FROM test_array WHERE i = '{1,2,3,3}';
+SELECT * FROM test_array WHERE i = '{0,0}';
+SELECT * FROM test_array WHERE i = '{100}';
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}';
+SELECT * FROM test_array WHERE i && '{}';
+SELECT * FROM test_array WHERE i && '{1}';
+SELECT * FROM test_array WHERE i && '{2}';
+SELECT * FROM test_array WHERE i && '{3}';
+SELECT * FROM test_array WHERE i && '{4}';
+SELECT * FROM test_array WHERE i && '{1,2}';
+SELECT * FROM test_array WHERE i && '{1,2,3}';
+SELECT * FROM test_array WHERE i && '{1,2,3,4}';
+SELECT * FROM test_array WHERE i && '{4,3,2,1}';
+SELECT * FROM test_array WHERE i && '{0,0}';
+SELECT * FROM test_array WHERE i && '{100}';
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}';
+SELECT * FROM test_array WHERE i @> '{}';
+SELECT * FROM test_array WHERE i @> '{1}';
+SELECT * FROM test_array WHERE i @> '{2}';
+SELECT * FROM test_array WHERE i @> '{3}';
+SELECT * FROM test_array WHERE i @> '{4}';
+SELECT * FROM test_array WHERE i @> '{1,2,4}';
+SELECT * FROM test_array WHERE i @> '{1,2,3,4}';
+SELECT * FROM test_array WHERE i @> '{4,3,2,1}';
+SELECT * FROM test_array WHERE i @> '{0,0}';
+SELECT * FROM test_array WHERE i @> '{100}';
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}';
+SELECT * FROM test_array WHERE i <@ '{}';
+SELECT * FROM test_array WHERE i <@ '{1}';
+SELECT * FROM test_array WHERE i <@ '{2}';
+SELECT * FROM test_array WHERE i <@ '{1,2,4}';
+SELECT * FROM test_array WHERE i <@ '{1,2,3,4}';
+SELECT * FROM test_array WHERE i <@ '{4,3,2,1}';
+SELECT * FROM test_array WHERE i <@ '{0,0}';
+SELECT * FROM test_array WHERE i <@ '{100}';
+
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}';
+SELECT * FROM test_array WHERE i % '{}';
+SELECT * FROM test_array WHERE i % '{1}';
+SELECT * FROM test_array WHERE i % '{2}';
+SELECT * FROM test_array WHERE i % '{1,2}';
+SELECT * FROM test_array WHERE i % '{1,2,4}';
+SELECT * FROM test_array WHERE i % '{1,2,3,4}';
+SELECT * FROM test_array WHERE i % '{4,3,2,1}';
+SELECT * FROM test_array WHERE i % '{1,2,3,4,5}';
+SELECT * FROM test_array WHERE i % '{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}';
+SELECT * FROM test_array WHERE i % '{1,10,20,30,40,50}';
+SELECT * FROM test_array WHERE i % '{1,10,20,30}';
+SELECT * FROM test_array WHERE i %
'{1,1,1,1,1}'; +SELECT * FROM test_array WHERE i % '{0,0}'; +SELECT * FROM test_array WHERE i % '{100}'; + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; +SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; + +DROP INDEX idx_array; + + +ALTER TABLE test_array ADD COLUMN add_info timestamp; + +CREATE INDEX idx_array ON test_array +USING rum (i rum_anyarray_addon_ops, add_info) +WITH (attach = 'add_info', to = 'i'); + +WITH q as ( + SELECT row_number() OVER (ORDER BY i) idx, ctid FROM test_array +) +UPDATE test_array SET add_info = '2016-05-16 14:21:25'::timestamp + + format('%s days', q.idx)::interval +FROM q WHERE test_array.ctid = q.ctid; + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; + +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{1}' ORDER BY add_info <=> '2016-05-16 14:21:25' LIMIT 10; +SELECT * FROM test_array WHERE i && '{1}' ORDER BY add_info <=> '2016-05-16 14:21:25' LIMIT 10; + +DROP INDEX idx_array; + + +/* + * Sanity checks for popular array types. + */ + +ALTER TABLE test_array ALTER COLUMN i TYPE int4[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; +DROP INDEX idx_array; + +ALTER TABLE test_array ALTER COLUMN i TYPE int8[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; +DROP INDEX idx_array; + +ALTER TABLE test_array ALTER COLUMN i TYPE text[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; +DROP INDEX idx_array; + +ALTER TABLE test_array ALTER COLUMN i TYPE varchar[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; +DROP INDEX idx_array; + +ALTER TABLE test_array ALTER COLUMN i TYPE char[]; +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops); +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}'; +EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}'; +EXPLAIN (COSTS 
OFF) SELECT * FROM test_array WHERE i % '{}';
+DROP INDEX idx_array;
+
+ALTER TABLE test_array ALTER COLUMN i TYPE numeric[] USING i::numeric[];
+CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops);
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}';
+DROP INDEX idx_array;
+
+ALTER TABLE test_array ALTER COLUMN i TYPE float4[] USING i::float4[];
+CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops);
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}';
+DROP INDEX idx_array;
+
+ALTER TABLE test_array ALTER COLUMN i TYPE float8[] USING i::float8[];
+CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops);
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i = '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i && '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i @> '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i <@ '{}';
+EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}';
+DROP INDEX idx_array;
+
+/*
+ * Check ordering using distance operator
+ *
+ * We want to check that an index scan provides correct ordering by the
+ * distance operator. File 'data/rum_array.data' contains two arrays that
+ * satisfy i @> '{23,20}' and have finite distance i <=> '{51}', and a bunch
+ * of arrays that satisfy i @> '{23,20}' and have infinite distance
+ * i <=> '{51}'.
+ *
+ * When ordering by distance, the order of this bunch of arrays with infinite
+ * distance is not determined and may depend on the PostgreSQL version and
+ * system. We don't add another sort expression to ORDER BY because that
+ * might cause the planner to avoid using the index. Instead, we replace
+ * arrays that have infinite distance with {-1} to unambiguously determine
+ * the test output.
+ *
+ * 'Infinity' is printed differently in the output in different PostgreSQL
+ * versions, so we replace it with -1.
+ */ + +CREATE TABLE test_array_order ( + i int2[] +); +\copy test_array_order(i) from 'data/rum_array.data'; + +CREATE INDEX idx_array_order ON test_array_order USING rum (i rum_anyarray_ops); + +/* + * Check that plan of the query uses ordering provided by index scan + */ + +EXPLAIN (COSTS OFF) +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; diff --git a/sql/bit.sql b/sql/bit.sql new file mode 100644 index 0000000000..ab31916ab8 --- /dev/null +++ b/sql/bit.sql @@ -0,0 +1,15 @@ +set enable_seqscan=off; + +CREATE TABLE test_bit ( + i bit(3) +); + +INSERT INTO test_bit VALUES ('001'),('010'),('011'),('100'),('101'),('110'); + +CREATE INDEX idx_bit ON test_bit USING rum (i); + +SELECT * FROM test_bit WHERE i<'100'::bit(3) ORDER BY i; +SELECT * FROM test_bit WHERE i<='100'::bit(3) ORDER BY i; +SELECT * FROM test_bit WHERE i='100'::bit(3) ORDER BY i; +SELECT * FROM test_bit WHERE i>='100'::bit(3) ORDER BY i; +SELECT * FROM test_bit WHERE i>'100'::bit(3) ORDER BY i; diff --git a/sql/bytea.sql b/sql/bytea.sql new file mode 100644 index 0000000000..b186abe0b8 --- /dev/null +++ b/sql/bytea.sql @@ -0,0 +1,17 @@ +set enable_seqscan=off; +-- ensure consistent test output regardless of the default bytea format +SET bytea_output TO escape; + +CREATE TABLE test_bytea ( + i bytea +); + +INSERT INTO test_bytea VALUES ('a'),('ab'),('abc'),('abb'),('axy'),('xyz'); + +CREATE INDEX idx_bytea ON test_bytea USING rum (i); + +SELECT * FROM test_bytea WHERE i<'abc'::bytea ORDER BY i; +SELECT * FROM test_bytea WHERE i<='abc'::bytea ORDER BY i; +SELECT * FROM test_bytea WHERE i='abc'::bytea ORDER BY i; +SELECT * FROM test_bytea WHERE i>='abc'::bytea ORDER BY i; +SELECT * FROM test_bytea WHERE i>'abc'::bytea ORDER BY i; diff --git a/sql/char.sql b/sql/char.sql new file mode 100644 index 0000000000..ea65b9fefc --- /dev/null +++ b/sql/char.sql @@ -0,0 +1,15 @@ +set enable_seqscan=off; + +CREATE TABLE test_char ( + i "char" +); + +INSERT INTO test_char VALUES ('a'),('b'),('c'),('d'),('e'),('f'); + +CREATE INDEX idx_char ON test_char USING rum (i); + +SELECT * FROM test_char WHERE i<'d'::"char" ORDER BY i; +SELECT * FROM test_char WHERE i<='d'::"char" ORDER BY i; +SELECT * FROM test_char WHERE i='d'::"char" ORDER BY i; +SELECT * FROM test_char WHERE i>='d'::"char" ORDER BY i; +SELECT * FROM test_char WHERE i>'d'::"char" ORDER BY i; diff --git a/sql/cidr.sql b/sql/cidr.sql new file mode 100644 index 0000000000..fb3c9012e0 --- /dev/null +++ b/sql/cidr.sql @@ -0,0 +1,22 @@ +set enable_seqscan=off; + +CREATE TABLE test_cidr ( + i cidr +); + +INSERT INTO test_cidr VALUES + ( '1.2.3.4' ), + ( '1.2.4.4' ), + ( '1.2.5.4' ), + ( '1.2.6.4' ), + ( '1.2.7.4' ), + ( '1.2.8.4' ) +; + +CREATE INDEX idx_cidr ON test_cidr USING rum (i); + +SELECT * FROM test_cidr WHERE i<'1.2.6.4'::cidr ORDER BY i; +SELECT * FROM test_cidr WHERE i<='1.2.6.4'::cidr ORDER BY i; +SELECT * FROM test_cidr WHERE i='1.2.6.4'::cidr ORDER BY i; +SELECT * FROM test_cidr WHERE i>='1.2.6.4'::cidr ORDER BY i; +SELECT * FROM test_cidr WHERE i>'1.2.6.4'::cidr ORDER BY i; 
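+
+-- Illustrative only (this check is not part of the original test): with
+-- enable_seqscan off as above, the planner is expected to answer these
+-- predicates through idx_cidr, which can be verified with a plan such as
+--   EXPLAIN (COSTS OFF) SELECT * FROM test_cidr WHERE i > '1.2.6.4'::cidr;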
diff --git a/sql/date.sql b/sql/date.sql new file mode 100644 index 0000000000..263fa46dc9 --- /dev/null +++ b/sql/date.sql @@ -0,0 +1,22 @@ +set enable_seqscan=off; + +CREATE TABLE test_date ( + i date +); + +INSERT INTO test_date VALUES + ( '2004-10-23' ), + ( '2004-10-24' ), + ( '2004-10-25' ), + ( '2004-10-26' ), + ( '2004-10-27' ), + ( '2004-10-28' ) +; + +CREATE INDEX idx_date ON test_date USING rum (i); + +SELECT * FROM test_date WHERE i<'2004-10-26'::date ORDER BY i; +SELECT * FROM test_date WHERE i<='2004-10-26'::date ORDER BY i; +SELECT * FROM test_date WHERE i='2004-10-26'::date ORDER BY i; +SELECT * FROM test_date WHERE i>='2004-10-26'::date ORDER BY i; +SELECT * FROM test_date WHERE i>'2004-10-26'::date ORDER BY i; diff --git a/sql/expr.sql b/sql/expr.sql new file mode 100644 index 0000000000..d7b7ee3d24 --- /dev/null +++ b/sql/expr.sql @@ -0,0 +1,21 @@ +CREATE TABLE documents ( + en text not null, + score float not null, + textsearch_index_en_col tsvector +); + +INSERT INTO documents VALUES ('the pet cat is in the shed', 56, to_tsvector('english', 'the pet cat is in the shed')); + +CREATE INDEX textsearch_index_en ON documents + USING rum (textsearch_index_en_col rum_tsvector_addon_ops, score) + WITH (attach = 'score', to = 'textsearch_index_en_col'); + +SET enable_seqscan=off; +-- should be 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + +SET enable_seqscan=on; +-- should be 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + +DROP TABLE documents; diff --git a/sql/float4.sql b/sql/float4.sql new file mode 100644 index 0000000000..bb2d428d61 --- /dev/null +++ b/sql/float4.sql @@ -0,0 +1,23 @@ +set enable_seqscan=off; + +CREATE TABLE test_float4 ( + i float4 +); + +INSERT INTO test_float4 VALUES (-2),(-1),(0),(1),(2),(3); + +CREATE INDEX idx_float4 ON test_float4 USING rum (i); + +SELECT * FROM test_float4 WHERE i<1::float4 ORDER BY i; +SELECT * FROM test_float4 WHERE i<=1::float4 ORDER BY i; +SELECT * FROM test_float4 WHERE i=1::float4 ORDER BY i; +SELECT * FROM test_float4 WHERE i>=1::float4 ORDER BY i; +SELECT * FROM test_float4 WHERE i>1::float4 ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::float4 FROM test_float4 ORDER BY i <=> 0::float4; +SELECT *, i <=> 0::float4 FROM test_float4 ORDER BY i <=> 0::float4; + +EXPLAIN (costs off) +SELECT *, i <=> 1::float4 FROM test_float4 WHERE i<1::float4 ORDER BY i <=> 1::float4; +SELECT *, i <=> 1::float4 FROM test_float4 WHERE i<1::float4 ORDER BY i <=> 1::float4; diff --git a/sql/float8.sql b/sql/float8.sql new file mode 100644 index 0000000000..b61cbfb0da --- /dev/null +++ b/sql/float8.sql @@ -0,0 +1,34 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * float8.out - test output for 64-bit systems and + * float8_1.out - test output for 32-bit systems.
+ * + */ + + +set enable_seqscan=off; + +CREATE TABLE test_float8 ( + i float8 +); + +INSERT INTO test_float8 VALUES (-2),(-1),(0),(1),(2),(3); + +CREATE INDEX idx_float8 ON test_float8 USING rum (i); + +SELECT * FROM test_float8 WHERE i<1::float8 ORDER BY i; +SELECT * FROM test_float8 WHERE i<=1::float8 ORDER BY i; +SELECT * FROM test_float8 WHERE i=1::float8 ORDER BY i; +SELECT * FROM test_float8 WHERE i>=1::float8 ORDER BY i; +SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::float8 FROM test_float8 ORDER BY i <=> 0::float8; +SELECT *, i <=> 0::float8 FROM test_float8 ORDER BY i <=> 0::float8; + +EXPLAIN (costs off) +SELECT *, i <=> 1::float8 FROM test_float8 WHERE i<1::float8 ORDER BY i <=> 1::float8; +SELECT *, i <=> 1::float8 FROM test_float8 WHERE i<1::float8 ORDER BY i <=> 1::float8; diff --git a/sql/inet.sql b/sql/inet.sql new file mode 100644 index 0000000000..28facb712d --- /dev/null +++ b/sql/inet.sql @@ -0,0 +1,22 @@ +set enable_seqscan=off; + +CREATE TABLE test_inet ( + i inet +); + +INSERT INTO test_inet VALUES + ( '1.2.3.4/16' ), + ( '1.2.4.4/16' ), + ( '1.2.5.4/16' ), + ( '1.2.6.4/16' ), + ( '1.2.7.4/16' ), + ( '1.2.8.4/16' ) +; + +CREATE INDEX idx_inet ON test_inet USING rum (i); + +SELECT * FROM test_inet WHERE i<'1.2.6.4/16'::inet ORDER BY i; +SELECT * FROM test_inet WHERE i<='1.2.6.4/16'::inet ORDER BY i; +SELECT * FROM test_inet WHERE i='1.2.6.4/16'::inet ORDER BY i; +SELECT * FROM test_inet WHERE i>='1.2.6.4/16'::inet ORDER BY i; +SELECT * FROM test_inet WHERE i>'1.2.6.4/16'::inet ORDER BY i; diff --git a/sql/int2.sql b/sql/int2.sql new file mode 100644 index 0000000000..419f03e008 --- /dev/null +++ b/sql/int2.sql @@ -0,0 +1,23 @@ +set enable_seqscan=off; + +CREATE TABLE test_int2 ( + i int2 +); + +INSERT INTO test_int2 VALUES (-2),(-1),(0),(1),(2),(3); + +CREATE INDEX idx_int2 ON test_int2 USING rum (i); + +SELECT * FROM test_int2 WHERE i<1::int2 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=1::int2 ORDER BY i; +SELECT * FROM test_int2 WHERE i=1::int2 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=1::int2 ORDER BY i; +SELECT * FROM test_int2 WHERE i>1::int2 ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::int2 FROM test_int2 ORDER BY i <=> 0::int2; +SELECT *, i <=> 0::int2 FROM test_int2 ORDER BY i <=> 0::int2; + +EXPLAIN (costs off) +SELECT *, i <=> 1::int2 FROM test_int2 WHERE i<1::int2 ORDER BY i <=> 1::int2; +SELECT *, i <=> 1::int2 FROM test_int2 WHERE i<1::int2 ORDER BY i <=> 1::int2; diff --git a/sql/int4.sql b/sql/int4.sql new file mode 100644 index 0000000000..2fa0e8afec --- /dev/null +++ b/sql/int4.sql @@ -0,0 +1,160 @@ +set enable_seqscan=off; + +CREATE TABLE test_int4 ( + i int4 +); + +INSERT INTO test_int4 VALUES (-2),(-1),(0),(1),(2),(3); + +CREATE INDEX idx_int4 ON test_int4 USING rum (i); + +SELECT * FROM test_int4 WHERE i<1::int4 ORDER BY i; +SELECT * FROM test_int4 WHERE i<=1::int4 ORDER BY i; +SELECT * FROM test_int4 WHERE i=1::int4 ORDER BY i; +SELECT * FROM test_int4 WHERE i>=1::int4 ORDER BY i; +SELECT * FROM test_int4 WHERE i>1::int4 ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::int4 FROM test_int4 ORDER BY i <=> 0::int4; +SELECT *, i <=> 0::int4 FROM test_int4 ORDER BY i <=> 0::int4; + +EXPLAIN (costs off) +SELECT *, i <=> 1::int4 FROM test_int4 WHERE i<1::int4 ORDER BY i <=> 1::int4; +SELECT *, i <=> 1::int4 FROM test_int4 WHERE i<1::int4 ORDER BY i <=> 1::int4; + +CREATE TABLE test_int4_o AS SELECT id::int4, t FROM tsts; + +CREATE INDEX test_int4_o_idx ON test_int4_o USING rum + 
(t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't'); + +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; +SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + +CREATE TABLE test_int4_a AS SELECT id::int4, t FROM tsts; + +CREATE INDEX test_int4_a_idx ON test_int4_a USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); + +EXPLAIN (costs off) +SELECT count(*) FROM test_int4_a WHERE id < 400; +SELECT count(*) FROM test_int4_a WHERE id < 400; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int4_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; +SELECT id FROM test_int4_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + +CREATE TABLE test_int4_h_o AS SELECT id::int4, t FROM tsts; + +CREATE INDEX test_int4_h_o_idx ON test_int4_h_o USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't'); + +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +SELECT id FROM test_int4_h_o WHERE 
t @@ 'wr&qh' AND id >= 400 ORDER BY id; + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; +SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + +CREATE TABLE test_int4_h_a AS SELECT id::int4, t FROM tsts; + +CREATE INDEX test_int4_h_a_idx ON test_int4_h_a USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); + +EXPLAIN (costs off) +SELECT count(*) FROM test_int4_h_a WHERE id < 400; +SELECT count(*) FROM test_int4_h_a WHERE id < 400; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int4_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; +SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; + +CREATE TABLE test_int4_id_t AS SELECT id::int4, t FROM tsts; + +CREATE INDEX test_int4_id_t_idx ON test_int4_o USING rum + (t rum_tsvector_ops, id); + +EXPLAIN (costs off) +SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400::int4 ORDER BY id <=> 400::int4; +SELECT id FROM test_int4_h_a WHERE t @@ 'wr&qh' AND id <= 400::int4 ORDER BY id <=> 400::int4; diff --git a/sql/int8.sql b/sql/int8.sql new file mode 100644 index 0000000000..c51705e62b --- /dev/null +++ b/sql/int8.sql @@ -0,0 +1,172 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * int8.out - test output for 64-bit systems and + * int8_1.out - test output for 32-bit systems.
+ * + */ + + +set enable_seqscan=off; + +CREATE TABLE test_int8 ( + i int8 +); + +INSERT INTO test_int8 VALUES (-2),(-1),(0),(1),(2),(3); + +CREATE INDEX idx_int8 ON test_int8 USING rum (i); + +SELECT * FROM test_int8 WHERE i<1::int8 ORDER BY i; +SELECT * FROM test_int8 WHERE i<=1::int8 ORDER BY i; +SELECT * FROM test_int8 WHERE i=1::int8 ORDER BY i; +SELECT * FROM test_int8 WHERE i>=1::int8 ORDER BY i; +SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::int8 FROM test_int8 ORDER BY i <=> 0::int8; +SELECT *, i <=> 0::int8 FROM test_int8 ORDER BY i <=> 0::int8; + +EXPLAIN (costs off) +SELECT *, i <=> 1::int8 FROM test_int8 WHERE i<1::int8 ORDER BY i <=> 1::int8; +SELECT *, i <=> 1::int8 FROM test_int8 WHERE i<1::int8 ORDER BY i <=> 1::int8; + +CREATE TABLE test_int8_o AS SELECT id::int8, t FROM tsts; + +CREATE INDEX test_int8_o_idx ON test_int8_o USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't'); + +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; +SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + +CREATE TABLE test_int8_a AS SELECT id::int8, t FROM tsts; + +CREATE INDEX test_int8_a_idx ON test_int8_a USING rum + (t rum_tsvector_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); + +EXPLAIN (costs off) +SELECT count(*) FROM test_int8_a WHERE id < 400::int8; +SELECT count(*) FROM test_int8_a WHERE id < 400::int8; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int8_a WHERE t @@ 
'wr&qh' AND id <= 400::int8 ORDER BY id; +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; +SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + +CREATE TABLE test_int8_h_o AS SELECT id::int8, t FROM tsts; + +CREATE INDEX test_int8_h_o_idx ON test_int8_h_o USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't'); + +RESET enable_seqscan; +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + +RESET enable_indexscan; +RESET enable_indexonlyscan; +SET enable_seqscan = off; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; +SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + +CREATE TABLE test_int8_h_a AS SELECT id::int8, t FROM tsts; + +CREATE INDEX test_int8_h_a_idx ON test_int8_h_a USING rum + (t rum_tsvector_hash_addon_ops, id) + WITH (attach = 'id', to = 't', order_by_attach='t'); + +EXPLAIN (costs off) +SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; +SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; + +EXPLAIN (costs off) +SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; +EXPLAIN (costs off) +SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; +SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; +EXPLAIN (costs off) +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; + +CREATE TABLE test_int8_id_t AS SELECT id::int8, t FROM tsts; + +CREATE INDEX test_int8_id_t_idx ON test_int8_o 
USING rum + (t rum_tsvector_ops, id); + +EXPLAIN (costs off) +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8; +SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8; diff --git a/sql/interval.sql b/sql/interval.sql new file mode 100644 index 0000000000..87e1b9960b --- /dev/null +++ b/sql/interval.sql @@ -0,0 +1,22 @@ +set enable_seqscan=off; + +CREATE TABLE test_interval ( + i interval +); + +INSERT INTO test_interval VALUES + ( '03:55:08' ), + ( '04:55:08' ), + ( '05:55:08' ), + ( '08:55:08' ), + ( '09:55:08' ), + ( '10:55:08' ) +; + +CREATE INDEX idx_interval ON test_interval USING rum (i); + +SELECT * FROM test_interval WHERE i<'08:55:08'::interval ORDER BY i; +SELECT * FROM test_interval WHERE i<='08:55:08'::interval ORDER BY i; +SELECT * FROM test_interval WHERE i='08:55:08'::interval ORDER BY i; +SELECT * FROM test_interval WHERE i>='08:55:08'::interval ORDER BY i; +SELECT * FROM test_interval WHERE i>'08:55:08'::interval ORDER BY i; diff --git a/sql/macaddr.sql b/sql/macaddr.sql new file mode 100644 index 0000000000..64da5af1cb --- /dev/null +++ b/sql/macaddr.sql @@ -0,0 +1,22 @@ +set enable_seqscan=off; + +CREATE TABLE test_macaddr ( + i macaddr +); + +INSERT INTO test_macaddr VALUES + ( '22:00:5c:03:55:08' ), + ( '22:00:5c:04:55:08' ), + ( '22:00:5c:05:55:08' ), + ( '22:00:5c:08:55:08' ), + ( '22:00:5c:09:55:08' ), + ( '22:00:5c:10:55:08' ) +; + +CREATE INDEX idx_macaddr ON test_macaddr USING rum (i); + +SELECT * FROM test_macaddr WHERE i<'22:00:5c:08:55:08'::macaddr ORDER BY i; +SELECT * FROM test_macaddr WHERE i<='22:00:5c:08:55:08'::macaddr ORDER BY i; +SELECT * FROM test_macaddr WHERE i='22:00:5c:08:55:08'::macaddr ORDER BY i; +SELECT * FROM test_macaddr WHERE i>='22:00:5c:08:55:08'::macaddr ORDER BY i; +SELECT * FROM test_macaddr WHERE i>'22:00:5c:08:55:08'::macaddr ORDER BY i; diff --git a/sql/money.sql b/sql/money.sql new file mode 100644 index 0000000000..13df5ed260 --- /dev/null +++ b/sql/money.sql @@ -0,0 +1,34 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * money.out - test output for 64-bit systems and + * money_1.out - test output for 32-bit systems.
+ * + */ + + +set enable_seqscan=off; + +CREATE TABLE test_money ( + i money +); + +INSERT INTO test_money VALUES ('-2'),('-1'),('0'),('1'),('2'),('3'); + +CREATE INDEX idx_money ON test_money USING rum (i); + +SELECT * FROM test_money WHERE i<'1'::money ORDER BY i; +SELECT * FROM test_money WHERE i<='1'::money ORDER BY i; +SELECT * FROM test_money WHERE i='1'::money ORDER BY i; +SELECT * FROM test_money WHERE i>='1'::money ORDER BY i; +SELECT * FROM test_money WHERE i>'1'::money ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::money FROM test_money ORDER BY i <=> 0::money; +SELECT *, i <=> 0::money FROM test_money ORDER BY i <=> 0::money; + +EXPLAIN (costs off) +SELECT *, i <=> 1::money FROM test_money WHERE i<1::money ORDER BY i <=> 1::money; +SELECT *, i <=> 1::money FROM test_money WHERE i<1::money ORDER BY i <=> 1::money; diff --git a/sql/numeric.sql b/sql/numeric.sql new file mode 100644 index 0000000000..e3287a372b --- /dev/null +++ b/sql/numeric.sql @@ -0,0 +1,15 @@ +set enable_seqscan=off; + +CREATE TABLE test_numeric ( + i numeric +); + +INSERT INTO test_numeric VALUES (-2),(-1),(0),(1),(2),(3); + +CREATE INDEX idx_numeric ON test_numeric USING rum (i); + +SELECT * FROM test_numeric WHERE i<'1'::numeric ORDER BY i; +SELECT * FROM test_numeric WHERE i<='1'::numeric ORDER BY i; +SELECT * FROM test_numeric WHERE i='1'::numeric ORDER BY i; +SELECT * FROM test_numeric WHERE i>='1'::numeric ORDER BY i; +SELECT * FROM test_numeric WHERE i>'1'::numeric ORDER BY i; diff --git a/sql/oid.sql b/sql/oid.sql new file mode 100644 index 0000000000..3d6d86ff93 --- /dev/null +++ b/sql/oid.sql @@ -0,0 +1,23 @@ +set enable_seqscan=off; + +CREATE TABLE test_oid ( + i oid +); + +INSERT INTO test_oid VALUES (0),(1),(2),(3),(4),(5); + +CREATE INDEX idx_oid ON test_oid USING rum (i); + +SELECT * FROM test_oid WHERE i<3::oid ORDER BY i; +SELECT * FROM test_oid WHERE i<=3::oid ORDER BY i; +SELECT * FROM test_oid WHERE i=3::oid ORDER BY i; +SELECT * FROM test_oid WHERE i>=3::oid ORDER BY i; +SELECT * FROM test_oid WHERE i>3::oid ORDER BY i; + +EXPLAIN (costs off) +SELECT *, i <=> 0::oid FROM test_oid ORDER BY i <=> 0::oid; +SELECT *, i <=> 0::oid FROM test_oid ORDER BY i <=> 0::oid; + +EXPLAIN (costs off) +SELECT *, i <=> 1::oid FROM test_oid WHERE i<1::oid ORDER BY i <=> 1::oid; +SELECT *, i <=> 1::oid FROM test_oid WHERE i<1::oid ORDER BY i <=> 1::oid; diff --git a/sql/orderby.sql b/sql/orderby.sql index adb5101e13..a2bd227873 100644 --- a/sql/orderby.sql +++ b/sql/orderby.sql @@ -1,22 +1,26 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * orderby.out - test output for 64-bit systems and + * orderby_1.out - test output for 32-bit systems.
+ * + */ + + CREATE TABLE tsts (id int, t tsvector, d timestamp); \copy tsts from 'data/tsts.data' -CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_timestamp_ops, d) +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; - SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; @@ -27,8 +31,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; +-- Test bitmap index scan RESET enable_bitmapscan; SET enable_seqscan = off; @@ -62,8 +65,34 @@ EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +-- Test index scan +RESET enable_indexscan; +RESET enable_indexonlyscan; SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; @@ -77,4 +106,37 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; -DROP TABLE tsts CASCADE; +-- Test "ORDER BY" error message +DROP INDEX tsts_idx; + +CREATE INDEX tsts_idx ON tsts USING rum (t 
rum_tsvector_addon_ops, d); + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +-- Test multicolumn index + +RESET enable_indexscan; +RESET enable_indexonlyscan; +RESET enable_bitmapscan; +SET enable_seqscan = off; + +DROP INDEX tsts_idx; + +CREATE INDEX tsts_id_idx ON tsts USING rum (t rum_tsvector_addon_ops, id, d) + WITH (attach = 'd', to = 't'); + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; diff --git a/sql/orderby_hash.sql b/sql/orderby_hash.sql index bbe6d209a5..dba8f17ca1 100644 --- a/sql/orderby_hash.sql +++ b/sql/orderby_hash.sql @@ -1,80 +1,135 @@ -CREATE TABLE tsts (id int, t tsvector, d timestamp); +/* + * ------------------------------------ + * NOTE: This test behaves differently + * ------------------------------------ + * + * orderby_hash.out - test output for 64-bit systems and + * orderby_hash_1.out - test output for 32-bit systems.
+ * + */ -\copy tsts from 'data/tsts.data' -CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_hash_timestamp_ops, d) - WITH (attach = 'd', to = 't'); +CREATE TABLE tstsh (id int, t tsvector, d timestamp); + +\copy tstsh from 'data/tsts.data' +CREATE INDEX tstsh_idx ON tstsh USING rum (t rum_tsvector_hash_addon_ops, d) + WITH (attach = 'd', to = 't'); -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; +-- Test bitmap index scan RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| 
'2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +-- Test index scan +RESET enable_indexscan; +RESET enable_indexonlyscan; SET enable_bitmapscan=OFF; EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> 
'2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +-- Test multicolumn index -DROP TABLE tsts CASCADE; +RESET enable_indexscan; +RESET enable_indexonlyscan; +RESET enable_bitmapscan; +SET enable_seqscan = off; + +DROP INDEX tstsh_idx; + +CREATE INDEX tstsh_id_idx ON tsts USING rum (t rum_tsvector_addon_ops, id, d) + WITH (attach = 'd', to = 't'); + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 1::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND id = 355::int ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2016-05-11 11:21:22.326724'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d = '2000-05-01'::timestamp ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; diff --git a/sql/rum.sql b/sql/rum.sql index 404e444806..8414bb95c5 100644 --- a/sql/rum.sql +++ b/sql/rum.sql @@ -7,11 +7,22 @@ BEFORE UPDATE OR INSERT ON test_rum FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger('a', 'pg_catalog.english', 't'); CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_ops); +-- Check empty table using index scan +SELECT + a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + * + FROM test_rum + ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; + +-- Fill the table with data \copy test_rum(t) from 
'data/rum.data'; -CREATE INDEX failed_rumidx ON test_rum USING rum (a rum_tsvector_timestamp_ops); +CREATE INDEX failed_rumidx ON test_rum USING rum (a rum_tsvector_addon_ops); SET enable_seqscan=off; +SET enable_indexscan=off; explain (costs off) SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); @@ -36,27 +47,35 @@ SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'def <2> fgr'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), * +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), * +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), * FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), * +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), * +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), + * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); @@ -74,7 +93,13 @@ SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'rat') SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; CREATE TABLE tst (i int4, t tsvector); INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; @@ -101,19 +126,29 @@ VACUUM tst; INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM 
generate_series(14001,15000) i; set enable_bitmapscan=off; +SET enable_indexscan=on; explain (costs off) SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); -SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'w:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); -SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'b:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'b:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*'); -DROP TABLE test_rum CASCADE; -DROP TABLE tst CASCADE; +-- Test that the phrase operator works correctly when position information is not in the index. +create table test_rum_addon as table test_rum; +alter table test_rum_addon add column id serial; +create index on test_rum_addon using rum (a rum_tsvector_addon_ops, id) with (attach = 'id', to='a'); + +select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); +explain (costs off) select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); +-- + +select ('bjarn:6237 stroustrup:6238'::tsvector <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; +SELECT ('stroustrup:5508B,6233B,6238B bjarn:6235B,6237B' <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; diff --git a/sql/rum_hash.sql b/sql/rum_hash.sql index 8014cfbdec..a33b8fde31 100644 --- a/sql/rum_hash.sql +++ b/sql/rum_hash.sql @@ -1,113 +1,128 @@ -CREATE TABLE test_rum( t text, a tsvector ); +CREATE TABLE test_rum_hash( t text, a tsvector ); CREATE TRIGGER tsvectorupdate -BEFORE UPDATE OR INSERT ON test_rum +BEFORE UPDATE OR INSERT ON test_rum_hash FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger('a', 'pg_catalog.english', 't'); -CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_hash_ops); +CREATE INDEX rumhashidx ON test_rum_hash USING rum (a rum_tsvector_hash_ops); -\copy test_rum(t) from 'data/rum.data'; +\copy test_rum_hash(t) from 'data/rum.data'; -CREATE INDEX failed_rumidx ON test_rum USING rum (a rum_tsvector_timestamp_ops); +CREATE INDEX failed_rumidx ON test_rum_hash USING rum (a rum_tsvector_addon_ops); SET enable_seqscan=off; +SET enable_indexscan=off; explain (costs off) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); explain (costs off) -SELECT * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote') +SELECT * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote') ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); explain (costs off) -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have&wish'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'knew&brain'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'among'); -SELECT count(*) FROM test_rum WHERE a @@
to_tsquery('pg_catalog.english', 'structure&ancient'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary|sight)&(sending|heart)'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '(gave | half) <-> way'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '(gave | !half) <-> way'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave & way'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave & wooded & !look'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'have&wish'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'knew&brain'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'among'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'structure&ancient'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary|sight)&(sending|heart)'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '(gave | half) <-> way'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '(gave | !half) <-> way'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '!gave & way'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '!gave & wooded & !look'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <2> fgr'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), * - FROM test_rum +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), * - FROM test_rum +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * - FROM test_rum + FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), * - FROM test_rum +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 
0)::numeric(10,7), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), * - FROM test_rum +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), + * + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); -INSERT INTO test_rum (t) VALUES ('foo bar foo the over foo qq bar'); -INSERT INTO test_rum (t) VALUES ('345 qwerty copyright'); -INSERT INTO test_rum (t) VALUES ('345 qwerty'); -INSERT INTO test_rum (t) VALUES ('A fat cat has just eaten a rat.'); +INSERT INTO test_rum_hash (t) VALUES ('foo bar foo the over foo qq bar'); +INSERT INTO test_rum_hash (t) VALUES ('345 qwerty copyright'); +INSERT INTO test_rum_hash (t) VALUES ('345 qwerty'); +INSERT INTO test_rum_hash (t) VALUES ('A fat cat has just eaten a rat.'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'qwerty&345'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', '345'); -SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'rat'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'bar'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'qwerty&345'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', '345'); +SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'rat'); -SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; +SELECT a FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; -CREATE TABLE tst (i int4, t tsvector); -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; -CREATE INDEX tstidx ON tst USING rum (t rum_tsvector_hash_ops); +CREATE TABLE tst_hash (i int4, t tsvector); +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; +CREATE INDEX tst_hashidx ON tst_hash USING rum (t rum_tsvector_hash_ops); -DELETE FROM tst WHERE i = 1; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(10001,11000) i; +DELETE FROM tst_hash WHERE i = 1; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(10001,11000) i; -DELETE FROM tst WHERE i = 2; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(11001,12000) i; +DELETE FROM tst_hash WHERE i = 2; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT 
i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(11001,12000) i; -DELETE FROM tst WHERE i = 3; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(12001,13000) i; +DELETE FROM tst_hash WHERE i = 3; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(12001,13000) i; -DELETE FROM tst WHERE i = 4; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(13001,14000) i; +DELETE FROM tst_hash WHERE i = 4; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(13001,14000) i; -DELETE FROM tst WHERE i = 5; -VACUUM tst; -INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i; +DELETE FROM tst_hash WHERE i = 5; +VACUUM tst_hash; +INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i; set enable_bitmapscan=off; +SET enable_indexscan=on; explain (costs off) SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * - FROM test_rum + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * - FROM test_rum + FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); -DROP TABLE test_rum CASCADE; -DROP TABLE tst CASCADE; diff --git a/sql/rum_validate.sql b/sql/rum_validate.sql new file mode 100644 index 0000000000..455db5db56 --- /dev/null +++ b/sql/rum_validate.sql @@ -0,0 +1,67 @@ +-- +-- Various sanity tests +-- + +-- First validate operator classes +SELECT opcname, amvalidate(opc.oid) +FROM pg_opclass opc JOIN pg_am am ON am.oid = opcmethod +WHERE amname = 'rum' +ORDER BY opcname; + +-- +-- Test access method and 'rumidx' index properties +-- + +-- Access method properties +SELECT a.amname, p.name, pg_indexam_has_property(a.oid,p.name) +FROM pg_am a, unnest(array['can_order','can_unique','can_multi_col','can_exclude']) p(name) +WHERE a.amname = 'rum' ORDER BY a.amname; + +-- Index properties +SELECT p.name, pg_index_has_property('rumidx'::regclass,p.name) +FROM unnest(array['clusterable','index_scan','bitmap_scan','backward_scan']) p(name); + +-- Index column properties +SELECT p.name, pg_index_column_has_property('rumidx'::regclass,1,p.name) +FROM unnest(array['asc','desc','nulls_first','nulls_last','orderable','distance_orderable','returnable','search_array','search_nulls']) p(name); + +-- +-- Check an incorrect operator class +-- + +DROP INDEX rumidx; + +-- PGPRO-1175: Check an incorrect operator class:
it shouldn't work correctly +CREATE OPERATOR CLASS rum_tsvector_norm_ops +FOR TYPE tsvector USING rum +AS + OPERATOR 1 @@ (tsvector, tsquery), + OPERATOR 2 <=> (tsvector, rum_distance_query) FOR ORDER BY pg_catalog.float_ops, + FUNCTION 1 gin_cmp_tslexeme(text, text), + FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), + FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), + FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), + FUNCTION 6 rum_tsvector_config(internal), + FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), + FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), + FUNCTION 10 rum_ts_join_pos(internal, internal), + STORAGE text; + +CREATE INDEX rum_norm_idx ON test_rum USING rum(a rum_tsvector_norm_ops); + +SET enable_seqscan=off; +SET enable_bitmapscan=off; +SET enable_indexscan=on; + +-- PGPRO-1175: Select using incorrect operator class +SELECT a + FROM test_rum + WHERE a @@ to_tsquery('pg_catalog.english', 'bar') + ORDER BY a <=> (to_tsquery('pg_catalog.english', 'bar'),0); + +-- PGPRO-9026: column and attached column cannot be the same +CREATE TABLE test_array (i int2[]); +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_addon_ops) WITH (attach = 'i', to = 'i'); +SELECT * FROM test_array WHERE i && '{1}'; +DROP TABLE test_array; diff --git a/sql/rum_weight.sql b/sql/rum_weight.sql new file mode 100644 index 0000000000..3fcee8b06e --- /dev/null +++ b/sql/rum_weight.sql @@ -0,0 +1,44 @@ +CREATE TABLE testweight_rum( t text, a tsvector, r text ); + +CREATE FUNCTION fill_weight_trigger() RETURNS trigger AS $$ +begin + new.a := + setweight(to_tsvector('pg_catalog.english', coalesce(new.r,'')), 'A') || + setweight(to_tsvector('pg_catalog.english', coalesce(new.t,'')), 'D'); + return new; +end +$$ LANGUAGE plpgsql; + +CREATE TRIGGER tsvectorweightupdate +BEFORE INSERT OR UPDATE ON testweight_rum +FOR EACH ROW EXECUTE PROCEDURE fill_weight_trigger(); + +CREATE INDEX rumidx_weight ON testweight_rum USING rum (a rum_tsvector_ops); + +\copy testweight_rum(t,r) from 'data/rum_weight.data' DELIMITER '|' ; + +SET enable_seqscan=off; +SET enable_indexscan=off; + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever:A|wrote'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'among:ABC'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'structure:D&ancient:BCD'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary:DC|sight)&(sending:ABC|heart)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave:D & way'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<->go)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<2>go)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a | way<->reach)'); +SELECT count(*) FROM testweight_rum WHERE a @@ 
to_tsquery('pg_catalog.english', 'go & (!reach:a & way<->reach)');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go & !way:a');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'show:d & seem & !town:a');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!way:a');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & !way:a');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & !way:a');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'think<->go:d | go<->see');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d<->think');
+SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach<->think');
+
+
diff --git a/sql/ruminv.sql b/sql/ruminv.sql
index ec836fd165..b1c8eba709 100644
--- a/sql/ruminv.sql
+++ b/sql/ruminv.sql
@@ -10,6 +10,7 @@ INSERT INTO test_invrum VALUES ('(a|b)&c'::tsquery);
 INSERT INTO test_invrum VALUES ('(!(a|b))&c'::tsquery);
 INSERT INTO test_invrum VALUES ('(a|b)&(c|d)'::tsquery);
 INSERT INTO test_invrum VALUES ('!a'::tsquery);
+INSERT INTO test_invrum VALUES ('(a|a1|a2|a3|a4|a5)&(b|b1|b2|b3|b4|b5|b6)&!(c|c1|c2|c3)'::tsquery);
 
 SELECT * FROM test_invrum WHERE q @@ ''::tsvector;
 SELECT * FROM test_invrum WHERE q @@ 'a'::tsvector;
diff --git a/sql/security.sql b/sql/security.sql
new file mode 100644
index 0000000000..da7b83957b
--- /dev/null
+++ b/sql/security.sql
@@ -0,0 +1,5 @@
+-- Check security CVE-2020-14350
+CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) RETURNS bool AS $$ SELECT false $$ LANGUAGE SQL;
+CREATE EXTENSION rum;
+DROP FUNCTION rum_anyarray_similar(anyarray,anyarray);
+
diff --git a/sql/text.sql b/sql/text.sql
new file mode 100644
index 0000000000..1f340b7109
--- /dev/null
+++ b/sql/text.sql
@@ -0,0 +1,86 @@
+set enable_seqscan=off;
+
+CREATE TABLE test_text (
+	i text
+);
+
+INSERT INTO test_text VALUES ('a'),('ab'),('abc'),('abb'),('axy'),('xyz');
+
+CREATE INDEX idx_text ON test_text USING rum (i);
+
+SELECT * FROM test_text WHERE i<'abc' ORDER BY i;
+SELECT * FROM test_text WHERE i<='abc' ORDER BY i;
+SELECT * FROM test_text WHERE i='abc' ORDER BY i;
+SELECT * FROM test_text WHERE i>='abc' ORDER BY i;
+SELECT * FROM test_text WHERE i>'abc' ORDER BY i;
+
+CREATE TABLE test_text_o AS SELECT id::text, t FROM tsts;
+
+SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+
+CREATE INDEX test_text_o_idx ON test_text_o USING rum
+	(t rum_tsvector_addon_ops, id)
+	WITH (attach = 'id', to = 't');
+
+RESET enable_indexscan;
+RESET enable_indexonlyscan;
+SET enable_bitmapscan=OFF;
+SET enable_seqscan = off;
+
+EXPLAIN (costs off)
+SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+EXPLAIN (costs off)
+SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+
+CREATE TABLE test_text_a AS SELECT id::text, t FROM tsts;
+
+-- Should fail, temporarily it isn't allowed to order an index over a pass-by-reference column
+CREATE INDEX test_text_a_idx ON test_text_a USING rum
+	(t rum_tsvector_addon_ops, id)
+	WITH (attach = 'id', to = 't', order_by_attach='t');
+
+EXPLAIN (costs off)
+SELECT count(*) FROM test_text_a WHERE id < '400';
+SELECT count(*) FROM test_text_a WHERE id < '400';
+
+EXPLAIN (costs off)
+SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+EXPLAIN (costs off)
+SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+
+CREATE TABLE test_text_h_o AS SELECT id::text, t FROM tsts;
+
+CREATE INDEX test_text_h_o_idx ON test_text_h_o USING rum
+	(t rum_tsvector_hash_addon_ops, id)
+	WITH (attach = 'id', to = 't');
+
+EXPLAIN (costs off)
+SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+EXPLAIN (costs off)
+SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+
+CREATE TABLE test_text_h_a AS SELECT id::text, t FROM tsts;
+
+-- Should fail, temporarily it isn't allowed to order an index over a pass-by-reference column
+CREATE INDEX test_text_h_a_idx ON test_text_h_a USING rum
+	(t rum_tsvector_hash_addon_ops, id)
+	WITH (attach = 'id', to = 't', order_by_attach='t');
+
+EXPLAIN (costs off)
+SELECT count(*) FROM test_text_h_a WHERE id < '400';
+SELECT count(*) FROM test_text_h_a WHERE id < '400';
+
+EXPLAIN (costs off)
+SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id;
+EXPLAIN (costs off)
+SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id;
+
diff --git a/sql/time.sql b/sql/time.sql
new file mode 100644
index 0000000000..7a7a4e7dfd
--- /dev/null
+++ b/sql/time.sql
@@ -0,0 +1,22 @@
+set enable_seqscan=off;
+
+CREATE TABLE test_time (
+	i time
+);
+
+INSERT INTO test_time VALUES
+	( '03:55:08' ),
+	( '04:55:08' ),
+	( '05:55:08' ),
+	( '08:55:08' ),
+	( '09:55:08' ),
+	( '10:55:08' )
+;
+
+CREATE INDEX idx_time ON test_time USING rum (i);
+
+SELECT * FROM test_time WHERE i<'08:55:08'::time ORDER BY i;
+SELECT * FROM test_time WHERE i<='08:55:08'::time ORDER BY i;
+SELECT * FROM test_time WHERE i='08:55:08'::time ORDER BY i;
+SELECT * FROM test_time WHERE i>='08:55:08'::time ORDER BY i;
+SELECT * FROM test_time WHERE i>'08:55:08'::time ORDER BY i;
diff --git a/sql/timestamp.sql b/sql/timestamp.sql
index 4b20398e0c..3386229ddc 100644
--- a/sql/timestamp.sql
+++ b/sql/timestamp.sql
@@ -1,3 +1,13 @@
+/*
+ * ------------------------------------
+ * NOTE: This test behaves differently
+ * ------------------------------------
+ *
+ * timestamp.out - test output for 64-bit systems and
+ * timestamp_1.out - test output for 32-bit systems.
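+ *
+ * pg_regress treats a test as passed when its output matches either the
+ * base expected file or a numbered alternative (timestamp_1.out here),
+ * so both files must be kept in sync with this script.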
+ * + */ + CREATE TABLE test_timestamp ( i timestamp @@ -12,14 +22,11 @@ INSERT INTO test_timestamp VALUES ( '2004-10-26 10:55:08' ) ; -SELECT i::timestamptz AS i INTO test_timestamptz FROM test_timestamp; - SELECT i <=> '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; SELECT i <=| '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; SELECT i |=> '2004-10-26 06:24:08', i FROM test_timestamp ORDER BY 1, 2 ASC; CREATE INDEX idx_timestamp ON test_timestamp USING rum (i); -CREATE INDEX idx_timestamptz ON test_timestamptz USING rum (i); set enable_seqscan=off; @@ -43,7 +50,35 @@ explain (costs off) SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; +explain (costs off) +SELECT *, i <=> '2004-10-26 08:55:08'::timestamp FROM test_timestamp + ORDER BY i <=> '2004-10-26 08:55:08'::timestamp; +SELECT *, i <=> '2004-10-26 08:55:08'::timestamp FROM test_timestamp + ORDER BY i <=> '2004-10-26 08:55:08'::timestamp; + +explain (costs off) +SELECT *, i <=> '2004-10-26 05:00:00'::timestamp FROM test_timestamp + WHERE i>'2004-10-26 05:00:00'::timestamp ORDER BY i <=> '2004-10-26 05:00:00'::timestamp; +SELECT *, i <=> '2004-10-26 05:00:00'::timestamp FROM test_timestamp + WHERE i>'2004-10-26 05:00:00'::timestamp ORDER BY i <=> '2004-10-26 05:00:00'::timestamp; + +-- Tests for timestamptz + +SELECT i::timestamptz AS i INTO test_timestamptz FROM test_timestamp; +CREATE INDEX idx_timestamptz ON test_timestamptz USING rum (i); + explain (costs off) SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; +explain (costs off) +SELECT *, i <=> '2004-10-26 08:55:08'::timestamptz FROM test_timestamptz + ORDER BY i <=> '2004-10-26 08:55:08'::timestamptz; +SELECT *, i <=> '2004-10-26 08:55:08'::timestamptz FROM test_timestamptz + ORDER BY i <=> '2004-10-26 08:55:08'::timestamptz; + +explain (costs off) +SELECT *, i <=> '2004-10-26 05:00:00'::timestamptz FROM test_timestamptz + WHERE i>'2004-10-26 05:00:00'::timestamptz ORDER BY i <=> '2004-10-26 05:00:00'::timestamptz; +SELECT *, i <=> '2004-10-26 05:00:00'::timestamptz FROM test_timestamptz + WHERE i>'2004-10-26 05:00:00'::timestamptz ORDER BY i <=> '2004-10-26 05:00:00'::timestamptz; diff --git a/sql/timetz.sql b/sql/timetz.sql new file mode 100644 index 0000000000..ed4176a7a1 --- /dev/null +++ b/sql/timetz.sql @@ -0,0 +1,22 @@ +set enable_seqscan=off; + +CREATE TABLE test_timetz ( + i timetz +); + +INSERT INTO test_timetz VALUES + ( '03:55:08 GMT+2' ), + ( '04:55:08 GMT+2' ), + ( '05:55:08 GMT+2' ), + ( '08:55:08 GMT+2' ), + ( '09:55:08 GMT+2' ), + ( '10:55:08 GMT+2' ) +; + +CREATE INDEX idx_timetz ON test_timetz USING rum (i); + +SELECT * FROM test_timetz WHERE i<'08:55:08 GMT+2'::timetz ORDER BY i; +SELECT * FROM test_timetz WHERE i<='08:55:08 GMT+2'::timetz ORDER BY i; +SELECT * FROM test_timetz WHERE i='08:55:08 GMT+2'::timetz ORDER BY i; +SELECT * FROM test_timetz WHERE i>='08:55:08 GMT+2'::timetz ORDER BY i; +SELECT * FROM test_timetz WHERE i>'08:55:08 GMT+2'::timetz ORDER BY i; diff --git a/sql/varbit.sql b/sql/varbit.sql new file mode 100644 index 0000000000..0d8caa5058 --- /dev/null +++ b/sql/varbit.sql @@ -0,0 +1,15 @@ +set enable_seqscan=off; + +CREATE TABLE test_varbit ( + i varbit +); + +INSERT INTO test_varbit VALUES ('001'),('010'),('011'),('100'),('101'),('110'); + +CREATE INDEX idx_varbit ON 
test_varbit USING rum (i);
+
+SELECT * FROM test_varbit WHERE i<'100'::varbit ORDER BY i;
+SELECT * FROM test_varbit WHERE i<='100'::varbit ORDER BY i;
+SELECT * FROM test_varbit WHERE i='100'::varbit ORDER BY i;
+SELECT * FROM test_varbit WHERE i>='100'::varbit ORDER BY i;
+SELECT * FROM test_varbit WHERE i>'100'::varbit ORDER BY i;
diff --git a/sql/varchar.sql b/sql/varchar.sql
new file mode 100644
index 0000000000..15aa994a59
--- /dev/null
+++ b/sql/varchar.sql
@@ -0,0 +1,15 @@
+set enable_seqscan=off;
+
+CREATE TABLE test_varchar (
+	i varchar
+);
+
+INSERT INTO test_varchar VALUES ('a'),('ab'),('abc'),('abb'),('axy'),('xyz');
+
+CREATE INDEX idx_varchar ON test_varchar USING rum (i);
+
+SELECT * FROM test_varchar WHERE i<'abc'::varchar ORDER BY i;
+SELECT * FROM test_varchar WHERE i<='abc'::varchar ORDER BY i;
+SELECT * FROM test_varchar WHERE i='abc'::varchar ORDER BY i;
+SELECT * FROM test_varchar WHERE i>='abc'::varchar ORDER BY i;
+SELECT * FROM test_varchar WHERE i>'abc'::varchar ORDER BY i;
diff --git a/src/btree_rum.c b/src/btree_rum.c
new file mode 100644
index 0000000000..dd43a3c037
--- /dev/null
+++ b/src/btree_rum.c
@@ -0,0 +1,686 @@
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/stratnum.h"
+#include "utils/builtins.h"
+#include "utils/bytea.h"
+#include "utils/cash.h"
+#include "utils/date.h"
+#if PG_VERSION_NUM >= 120000
+#include "utils/float.h"
+#endif
+#include "utils/inet.h"
+#include "utils/numeric.h"
+#include "utils/timestamp.h"
+#include "utils/varbit.h"
+
+#include "rum.h"
+
+#if defined(_MSC_VER) && _MSC_VER >= 1200 && _MSC_VER < 1800 // Between VC++ 6.0 and VC++ 11.0
+#include <float.h>
+#define isfinite _finite
+#elif defined(__sun) && defined(__SVR4) //Solaris
+#if !defined(isfinite)
+#include <ieeefp.h>
+#define isfinite finite
+#endif
+#elif defined(_AIX) // AIX
+#if !defined(isfinite)
+#include <math.h>
+#define isfinite finite
+#endif
+#elif defined(__hpux) // HPUX
+#if !defined(isfinite)
+#if defined(__ia64) && !defined(finite)
+#define isfinite(x) ((sizeof(x) == sizeof(float) ? _Isfinitef(x) : _IsFinite(x)))
+#else
+#include <math.h>
+#define isfinite finite
+#endif
+#endif
+#endif
+
+typedef struct QueryInfo
+{
+	StrategyNumber strategy;
+	Datum		datum;
+	bool		is_varlena;
+	Datum		(*typecmp) (FunctionCallInfo);
+} QueryInfo;
+
+
+/*** RUM support functions shared by all datatypes ***/
+
+static Datum
+rum_btree_extract_value(FunctionCallInfo fcinfo, bool is_varlena)
+{
+	Datum		datum = PG_GETARG_DATUM(0);
+	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
+	Datum	   *entries = (Datum *) palloc(sizeof(Datum));
+
+	if (is_varlena)
+		datum = PointerGetDatum(PG_DETOAST_DATUM(datum));
+	entries[0] = datum;
+	*nentries = 1;
+
+	PG_RETURN_POINTER(entries);
+}
+
+/*
+ * For BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, and
+ * BTEqualStrategyNumber we want to start the index scan at the
+ * supplied query datum, and work forward. For BTLessStrategyNumber
+ * and BTLessEqualStrategyNumber, we need to start at the leftmost
+ * key, and work forward until the supplied query datum (which must be
+ * sent along inside the QueryInfo structure).
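+ *
+ * For example, for the qual "i > 42" the extracted scan key is 42 itself
+ * and the scan runs forward from it, while for "i < 42" the scan key is
+ * the type's leftmost value and the compare-prefix function (which finds
+ * the original 42 in QueryInfo) decides where the scan stops.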
+ */ +static Datum +rum_btree_extract_query(FunctionCallInfo fcinfo, + bool is_varlena, + Datum (*leftmostvalue) (void), + Datum (*typecmp) (FunctionCallInfo)) +{ + Datum datum = PG_GETARG_DATUM(0); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + bool **partialmatch = (bool **) PG_GETARG_POINTER(3); + Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4); + Datum *entries = (Datum *) palloc(sizeof(Datum)); + QueryInfo *data = (QueryInfo *) palloc(sizeof(QueryInfo)); + bool *ptr_partialmatch; + + *nentries = 1; + ptr_partialmatch = *partialmatch = (bool *) palloc(sizeof(bool)); + *ptr_partialmatch = false; + if (is_varlena) + datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); + data->strategy = strategy; + data->datum = datum; + data->is_varlena = is_varlena; + data->typecmp = typecmp; + *extra_data = (Pointer *) palloc(sizeof(Pointer)); + **extra_data = (Pointer) data; + + switch (strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + entries[0] = leftmostvalue(); + *ptr_partialmatch = true; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + *ptr_partialmatch = true; + /*FALLTHROUGH*/ + case BTEqualStrategyNumber: + case RUM_DISTANCE: + case RUM_LEFT_DISTANCE: + case RUM_RIGHT_DISTANCE: + entries[0] = datum; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + } + + PG_RETURN_POINTER(entries); +} + +/* + * Datum a is a value from extract_query method and for BTLess* + * strategy it is a left-most value. So, use original datum from QueryInfo + * to decide to stop scanning or not. Datum b is always from index. + */ +static Datum +rum_btree_compare_prefix(FunctionCallInfo fcinfo) +{ + Datum a = PG_GETARG_DATUM(0); + Datum b = PG_GETARG_DATUM(1); + QueryInfo *data = (QueryInfo *) PG_GETARG_POINTER(3); + int32 res, + cmp; + + cmp = DatumGetInt32(DirectFunctionCall2Coll( + data->typecmp, + PG_GET_COLLATION(), + (data->strategy == BTLessStrategyNumber || + data->strategy == BTLessEqualStrategyNumber) + ? 
data->datum : a, + b)); + + switch (data->strategy) + { + case BTLessStrategyNumber: + /* If original datum > indexed one then return match */ + if (cmp > 0) + res = 0; + else + res = 1; + break; + case BTLessEqualStrategyNumber: + /* The same except equality */ + if (cmp >= 0) + res = 0; + else + res = 1; + break; + case BTEqualStrategyNumber: + if (cmp != 0) + res = 1; + else + res = 0; + break; + case BTGreaterEqualStrategyNumber: + /* If original datum <= indexed one then return match */ + if (cmp <= 0) + res = 0; + else + res = 1; + break; + case BTGreaterStrategyNumber: + /* If original datum <= indexed one then return match */ + /* If original datum == indexed one then continue scan */ + if (cmp < 0) + res = 0; + else if (cmp == 0) + res = -1; + else + res = 1; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + data->strategy); + res = 0; + } + + PG_RETURN_INT32(res); +} + +PG_FUNCTION_INFO_V1(rum_btree_consistent); +Datum +rum_btree_consistent(PG_FUNCTION_ARGS) +{ + bool *recheck = (bool *) PG_GETARG_POINTER(5); + + *recheck = false; + PG_RETURN_BOOL(true); +} + + +/*** RUM_SUPPORT macro defines the datatype specific functions ***/ + +#define RUM_SUPPORT(type, is_varlena, leftmostvalue, typecmp) \ +PG_FUNCTION_INFO_V1(rum_##type##_extract_value); \ +Datum \ +rum_##type##_extract_value(PG_FUNCTION_ARGS) \ +{ \ + return rum_btree_extract_value(fcinfo, is_varlena); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_extract_query); \ +Datum \ +rum_##type##_extract_query(PG_FUNCTION_ARGS) \ +{ \ + return rum_btree_extract_query(fcinfo, \ + is_varlena, leftmostvalue, typecmp); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_compare_prefix); \ +Datum \ +rum_##type##_compare_prefix(PG_FUNCTION_ARGS) \ +{ \ + return rum_btree_compare_prefix(fcinfo); \ +} + +#define RUM_SUPPORT_DIST(type, is_varlena, leftmostvalue, typecmp, isinfinite, subtract) \ +RUM_SUPPORT(type, is_varlena, leftmostvalue, typecmp) \ +PG_FUNCTION_INFO_V1(rum_##type##_config); \ +Datum \ +rum_##type##_config(PG_FUNCTION_ARGS) \ +{ \ + RumConfig *config = (RumConfig *) PG_GETARG_POINTER(0); \ + \ + config->addInfoTypeOid = InvalidOid; \ + \ + config->strategyInfo[0].strategy = RUM_LEFT_DISTANCE; \ + config->strategyInfo[0].direction = BackwardScanDirection; \ + \ + config->strategyInfo[1].strategy = RUM_RIGHT_DISTANCE; \ + config->strategyInfo[1].direction = ForwardScanDirection; \ + \ + config->strategyInfo[2].strategy = InvalidStrategy; \ + \ + PG_RETURN_VOID(); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_distance); \ +Datum \ +rum_##type##_distance(PG_FUNCTION_ARGS) \ +{ \ + Datum a = PG_GETARG_DATUM(0); \ + Datum b = PG_GETARG_DATUM(1); \ + double diff; \ + \ + if (isinfinite(a) || isinfinite(b)) \ + { \ + if (isinfinite(a) && isinfinite(b)) \ + diff = 0; \ + else \ + diff = get_float8_infinity(); \ + } \ + else \ + { \ + int r = DatumGetInt32(DirectFunctionCall2Coll( \ + typecmp, PG_GET_COLLATION(), a, b)); \ + \ + diff = (r > 0) ? subtract(a, b) : subtract(b, a); \ + } \ + \ + PG_RETURN_FLOAT8(diff); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_left_distance); \ +Datum \ +rum_##type##_left_distance(PG_FUNCTION_ARGS) \ +{ \ + Datum a = PG_GETARG_DATUM(0); \ + Datum b = PG_GETARG_DATUM(1); \ + double diff; \ + \ + if (isinfinite(a) || isinfinite(b)) \ + { \ + if (isinfinite(a) && isinfinite(b)) \ + diff = 0; \ + else \ + diff = get_float8_infinity(); \ + } \ + else \ + { \ + int r = DatumGetInt32(DirectFunctionCall2Coll( \ + typecmp, PG_GET_COLLATION(), a, b)); \ + \ + diff = (r > 0) ? 
get_float8_infinity() : subtract(b, a); \ + } \ + \ + PG_RETURN_FLOAT8(diff); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_right_distance); \ +Datum \ +rum_##type##_right_distance(PG_FUNCTION_ARGS) \ +{ \ + Datum a = PG_GETARG_DATUM(0); \ + Datum b = PG_GETARG_DATUM(1); \ + double diff; \ + \ + if (isinfinite(a) || isinfinite(b)) \ + { \ + if (isinfinite(a) && isinfinite(b)) \ + diff = 0; \ + else \ + diff = get_float8_infinity(); \ + } \ + else \ + { \ + int r = DatumGetInt32(DirectFunctionCall2Coll( \ + typecmp, PG_GET_COLLATION(), a, b)); \ + \ + diff = (r > 0) ? subtract(a, b) : get_float8_infinity(); \ + } \ + \ + PG_RETURN_FLOAT8(diff); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_outer_distance); \ +Datum \ +rum_##type##_outer_distance(PG_FUNCTION_ARGS) \ +{ \ + StrategyNumber strategy = PG_GETARG_UINT16(2); \ + Datum diff; \ + \ + switch (strategy) \ + { \ + case RUM_DISTANCE: \ + diff = DirectFunctionCall2(rum_##type##_distance, \ + PG_GETARG_DATUM(0), \ + PG_GETARG_DATUM(1)); \ + break; \ + case RUM_LEFT_DISTANCE: \ + diff = DirectFunctionCall2(rum_##type##_left_distance, \ + PG_GETARG_DATUM(0), \ + PG_GETARG_DATUM(1)); \ + break; \ + case RUM_RIGHT_DISTANCE: \ + diff = DirectFunctionCall2(rum_##type##_right_distance, \ + PG_GETARG_DATUM(0), \ + PG_GETARG_DATUM(1)); \ + break; \ + default: \ + elog(ERROR, "rum_%s_outer_distance: unknown strategy %u", \ + #type, strategy); \ + } \ + \ + PG_RETURN_DATUM(diff); \ +} \ +PG_FUNCTION_INFO_V1(rum_##type##_key_distance); \ +Datum \ +rum_##type##_key_distance(PG_FUNCTION_ARGS) \ +{ \ + StrategyNumber strategy = PG_GETARG_UINT16(2); \ + Datum diff; \ + \ + switch (strategy) \ + { \ + case RUM_DISTANCE: \ + diff = DirectFunctionCall2(rum_##type##_distance, \ + PG_GETARG_DATUM(0), \ + PG_GETARG_DATUM(1)); \ + break; \ + case RUM_LEFT_DISTANCE: \ + diff = DirectFunctionCall2(rum_##type##_left_distance, \ + PG_GETARG_DATUM(0), \ + PG_GETARG_DATUM(1)); \ + break; \ + case RUM_RIGHT_DISTANCE: \ + diff = DirectFunctionCall2(rum_##type##_right_distance, \ + PG_GETARG_DATUM(0), \ + PG_GETARG_DATUM(1)); \ + break; \ + default: \ + elog(ERROR, "rum_%s_key_distance: unknown strategy %u", \ + #type, strategy); \ + } \ + \ + PG_RETURN_DATUM(diff); \ +} + +static bool +always_false(Datum a) +{ + return false; +} + +/*** Datatype specifications ***/ + +static Datum +leftmostvalue_int2(void) +{ + return Int16GetDatum(SHRT_MIN); +} + +static float8 +int2subtract(Datum a, Datum b) +{ + return ((float8)DatumGetInt16(a)) - ((float8)DatumGetInt16(b)); +} + +RUM_SUPPORT_DIST(int2, false, leftmostvalue_int2, btint2cmp, always_false, int2subtract) + +static Datum +leftmostvalue_int4(void) +{ + return Int32GetDatum(INT_MIN); +} + +static float8 +int4subtract(Datum a, Datum b) +{ + return ((float8)DatumGetInt32(a)) - ((float8)DatumGetInt32(b)); +} + +RUM_SUPPORT_DIST(int4, false, leftmostvalue_int4, btint4cmp, always_false, int4subtract) + +static Datum +leftmostvalue_int8(void) +{ + return Int64GetDatum(PG_INT64_MIN); +} + +static float8 +int8subtract(Datum a, Datum b) +{ + return ((float8)DatumGetInt64(a)) - ((float8)DatumGetInt64(b)); +} + +RUM_SUPPORT_DIST(int8, false, leftmostvalue_int8, btint8cmp, always_false, int8subtract) + +static Datum +leftmostvalue_float4(void) +{ + return Float4GetDatum(-get_float4_infinity()); +} + +static bool +float4_is_infinite(Datum a) +{ + return !isfinite(DatumGetFloat4(a)); +} + +static float8 +float4subtract(Datum a, Datum b) +{ + return ((float8)DatumGetFloat4(a)) - ((float8)DatumGetFloat4(b)); +} + +RUM_SUPPORT_DIST(float4, false, 
leftmostvalue_float4, btfloat4cmp, + float4_is_infinite, float4subtract) + +static Datum +leftmostvalue_float8(void) +{ + return Float8GetDatum(-get_float8_infinity()); +} + +static bool +float8_is_infinite(Datum a) +{ + return !isfinite(DatumGetFloat8(a)); +} + +static float8 +float8subtract(Datum a, Datum b) +{ + return DatumGetFloat8(a) - DatumGetFloat8(b); +} + +RUM_SUPPORT_DIST(float8, false, leftmostvalue_float8, btfloat8cmp, + float8_is_infinite, float8subtract) + +static Datum +leftmostvalue_money(void) +{ + return Int64GetDatum(PG_INT64_MIN); +} + +RUM_SUPPORT_DIST(money, false, leftmostvalue_money, cash_cmp, always_false, int8subtract) + +static Datum +leftmostvalue_oid(void) +{ + return ObjectIdGetDatum(0); +} + +static float8 +oidsubtract(Datum a, Datum b) +{ + return ((float8)DatumGetObjectId(a)) - ((float8)DatumGetObjectId(b)); +} + +RUM_SUPPORT_DIST(oid, false, leftmostvalue_oid, btoidcmp, always_false, oidsubtract) + +static Datum +leftmostvalue_timestamp(void) +{ + return TimestampGetDatum(DT_NOBEGIN); +} + +static bool +timestamp_is_infinite(Datum a) +{ + return TIMESTAMP_NOT_FINITE(DatumGetTimestamp(a)); +} + +static float8 +timestamp_subtract(Datum a, Datum b) +{ + return (DatumGetTimestamp(a) - DatumGetTimestamp(b)) / 1e6; +} + +RUM_SUPPORT_DIST(timestamp, false, leftmostvalue_timestamp, timestamp_cmp, + timestamp_is_infinite, timestamp_subtract) + +RUM_SUPPORT_DIST(timestamptz, false, leftmostvalue_timestamp, timestamp_cmp, + timestamp_is_infinite, timestamp_subtract) + + +static Datum +leftmostvalue_time(void) +{ + return TimeADTGetDatum(0); +} + +RUM_SUPPORT(time, false, leftmostvalue_time, time_cmp) + +static Datum +leftmostvalue_timetz(void) +{ + TimeTzADT *v = palloc(sizeof(TimeTzADT)); + + v->time = 0; + v->zone = -24 * 3600; /* XXX is that true? 
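It seems so: the backend limits time zone displacement to MAX_TZDISP_HOUR (15 hours), so -24 h sorts below any zone a valid timetz value can carry.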
*/ + + return TimeTzADTPGetDatum(v); +} + +RUM_SUPPORT(timetz, false, leftmostvalue_timetz, timetz_cmp) + +static Datum +leftmostvalue_date(void) +{ + return DateADTGetDatum(DATEVAL_NOBEGIN); +} + +RUM_SUPPORT(date, false, leftmostvalue_date, date_cmp) + +static Datum +leftmostvalue_interval(void) +{ + Interval *v = palloc(sizeof(Interval)); + + v->time = DT_NOBEGIN; + v->day = 0; + v->month = 0; + return IntervalPGetDatum(v); +} + +RUM_SUPPORT(interval, false, leftmostvalue_interval, interval_cmp) + +static Datum +leftmostvalue_macaddr(void) +{ + macaddr *v = palloc0(sizeof(macaddr)); + + return MacaddrPGetDatum(v); +} + +RUM_SUPPORT(macaddr, false, leftmostvalue_macaddr, macaddr_cmp) + +static Datum +leftmostvalue_inet(void) +{ + return DirectFunctionCall1(inet_in, CStringGetDatum("0.0.0.0/0")); +} + +RUM_SUPPORT(inet, true, leftmostvalue_inet, network_cmp) + +RUM_SUPPORT(cidr, true, leftmostvalue_inet, network_cmp) + +static Datum +leftmostvalue_text(void) +{ + return PointerGetDatum(cstring_to_text_with_len("", 0)); +} + +RUM_SUPPORT(text, true, leftmostvalue_text, bttextcmp) + +static Datum +leftmostvalue_char(void) +{ + return CharGetDatum(SCHAR_MIN); +} + +RUM_SUPPORT(char, false, leftmostvalue_char, btcharcmp) + +RUM_SUPPORT(bytea, true, leftmostvalue_text, byteacmp) + +static Datum +leftmostvalue_bit(void) +{ + return DirectFunctionCall3(bit_in, + CStringGetDatum(""), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); +} + +RUM_SUPPORT(bit, true, leftmostvalue_bit, bitcmp) + +static Datum +leftmostvalue_varbit(void) +{ + return DirectFunctionCall3(varbit_in, + CStringGetDatum(""), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); +} + +RUM_SUPPORT(varbit, true, leftmostvalue_varbit, bitcmp) + +/* + * Numeric type hasn't a real left-most value, so we use PointerGetDatum(NULL) + * (*not* a SQL NULL) to represent that. We can get away with that because + * the value returned by our leftmostvalue function will never be stored in + * the index nor passed to anything except our compare and prefix-comparison + * functions. The same trick could be used for other pass-by-reference types. + */ + +#define NUMERIC_IS_LEFTMOST(x) ((x) == NULL) + +PG_FUNCTION_INFO_V1(rum_numeric_cmp); + +Datum +rum_numeric_cmp(PG_FUNCTION_ARGS) +{ + Numeric a = (Numeric) PG_GETARG_POINTER(0); + Numeric b = (Numeric) PG_GETARG_POINTER(1); + int res = 0; + + if (NUMERIC_IS_LEFTMOST(a)) + { + res = (NUMERIC_IS_LEFTMOST(b)) ? 0 : -1; + } + else if (NUMERIC_IS_LEFTMOST(b)) + { + res = 1; + } + else + { + res = DatumGetInt32(DirectFunctionCall2(numeric_cmp, + NumericGetDatum(a), + NumericGetDatum(b))); + } + + PG_RETURN_INT32(res); +} + +static Datum +leftmostvalue_numeric(void) +{ + return PointerGetDatum(NULL); +} + +RUM_SUPPORT(numeric, true, leftmostvalue_numeric, rum_numeric_cmp) + +/* Compatibility with rum-1.0, but see gen_rum_sql--1.0--1.1.pl */ +PG_FUNCTION_INFO_V1(rum_timestamp_consistent); +Datum +rum_timestamp_consistent(PG_FUNCTION_ARGS) +{ + bool *recheck = (bool *) PG_GETARG_POINTER(5); + + *recheck = false; + PG_RETURN_BOOL(true); +} diff --git a/src/disable_core_macro.h b/src/disable_core_macro.h new file mode 100644 index 0000000000..0d6c4a8a3b --- /dev/null +++ b/src/disable_core_macro.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * disable_core_macro.h + * Support including tuplesort.c from postgresql core code. 
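+ *		  It undefines TRACE_SORT and stubs out the TRACE_POSTGRESQL_SORT_*
+ *		  probe macros as no-ops, so that the copied file builds outside
+ *		  the backend source tree.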
+ * + * Copyright (c) 2022-2024, Postgres Professional + * + *------------------------------------------------------------------------- + */ + +#ifndef __DISABLE_CORE_MACRO_H__ +#define __DISABLE_CORE_MACRO_H__ + +#undef TRACE_SORT +#undef DEBUG_BOUNDED_SORT +#undef TRACE_POSTGRESQL_SORT_START +#undef TRACE_POSTGRESQL_SORT_DONE + +#if PG_VERSION_NUM >= 110000 +#define TRACE_POSTGRESQL_SORT_START(arg1, arg2, arg3, arg4, arg5, arg6) \ + do {} while(0) +#else +#define TRACE_POSTGRESQL_SORT_START(arg1, arg2, arg3, arg4, arg5) \ + do {} while(0) +#endif + + +#define TRACE_POSTGRESQL_SORT_DONE(arg1, arg2) \ + do {} while(0) + + + +#endif /* __DISABLE_CORE_MACRO_H__ */ diff --git a/src/qsort_tuple.c b/src/qsort_tuple.c new file mode 100644 index 0000000000..0cb46e1416 --- /dev/null +++ b/src/qsort_tuple.c @@ -0,0 +1,332 @@ +/* + * autogenerated by src/backend/utils/sort/gen_qsort_tuple.pl, do not edit! + * + * This file is included by tuplesort.c, rather than compiled separately. + */ + +/* $NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $ */ + +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Qsort routine based on J. L. Bentley and M. D. McIlroy, + * "Engineering a sort function", + * Software--Practice and Experience 23 (1993) 1249-1265. + * + * We have modified their original by adding a check for already-sorted input, + * which seems to be a win per discussions on pgsql-hackers around 2006-03-21. + * + * Also, we recurse on the smaller partition and iterate on the larger one, + * which ensures we cannot recurse more than log(N) levels (since the + * partition recursed to is surely no more than half of the input). Bentley + * and McIlroy explicitly rejected doing this on the grounds that it's "not + * worth the effort", but we have seen crashes in the field due to stack + * overrun, so that judgment seems wrong. 
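+ *
+ * In the code below this iteration is the "goto loop" tail: the larger
+ * partition is handled by resetting a and n and jumping back to the top
+ * instead of making a second recursive call.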
+ */ + +static void +swapfunc(SortTuple *a, SortTuple *b, size_t n) +{ + do + { + SortTuple t = *a; + *a++ = *b; + *b++ = t; + } while (--n > 0); +} + +#define swap(a, b) \ + do { \ + SortTuple t = *(a); \ + *(a) = *(b); \ + *(b) = t; \ + } while (0) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n) + +static SortTuple * +med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, Tuplesortstate *state) +{ + return cmp_tuple(a, b, state) < 0 ? + (cmp_tuple(b, c, state) < 0 ? b : + (cmp_tuple(a, c, state) < 0 ? c : a)) + : (cmp_tuple(b, c, state) > 0 ? b : + (cmp_tuple(a, c, state) < 0 ? a : c)); +} + +static void +qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, Tuplesortstate *state) +{ + SortTuple *pa, + *pb, + *pc, + *pd, + *pl, + *pm, + *pn; + size_t d1, + d2; + int r, + presorted; + +loop: + CHECK_FOR_INTERRUPTS(); + if (n < 7) + { + for (pm = a + 1; pm < a + n; pm++) + for (pl = pm; pl > a && cmp_tuple(pl - 1, pl, state) > 0; pl--) + swap(pl, pl - 1); + return; + } + presorted = 1; + for (pm = a + 1; pm < a + n; pm++) + { + CHECK_FOR_INTERRUPTS(); + if (cmp_tuple(pm - 1, pm, state) > 0) + { + presorted = 0; + break; + } + } + if (presorted) + return; + pm = a + (n / 2); + if (n > 7) + { + pl = a; + pn = a + (n - 1); + if (n > 40) + { + size_t d = (n / 8); + + pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state); + pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state); + pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state); + } + pm = med3_tuple(pl, pm, pn, cmp_tuple, state); + } + swap(a, pm); + pa = pb = a + 1; + pc = pd = a + (n - 1); + for (;;) + { + while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0) + { + if (r == 0) + { + swap(pa, pb); + pa++; + } + pb++; + CHECK_FOR_INTERRUPTS(); + } + while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0) + { + if (r == 0) + { + swap(pc, pd); + pd--; + } + pc--; + CHECK_FOR_INTERRUPTS(); + } + if (pb > pc) + break; + swap(pb, pc); + pb++; + pc--; + } + pn = a + n; + d1 = Min(pa - a, pb - pa); + vecswap(a, pb - d1, d1); + d1 = Min(pd - pc, pn - pd - 1); + vecswap(pb, pn - d1, d1); + d1 = pb - pa; + d2 = pd - pc; + if (d1 <= d2) + { + /* Recurse on left partition, then iterate on right partition */ + if (d1 > 1) + qsort_tuple(a, d1, cmp_tuple, state); + if (d2 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_tuple(pn - d2, d2, cmp_tuple, state); */ + a = pn - d2; + n = d2; + goto loop; + } + } + else + { + /* Recurse on right partition, then iterate on left partition */ + if (d2 > 1) + qsort_tuple(pn - d2, d2, cmp_tuple, state); + if (d1 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_tuple(a, d1, cmp_tuple, state); */ + n = d1; + goto loop; + } + } +} + +#define cmp_ssup(a, b, ssup) \ + ApplySortComparator((a)->datum1, (a)->isnull1, \ + (b)->datum1, (b)->isnull1, ssup) + +static SortTuple * +med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup) +{ + return cmp_ssup(a, b, ssup) < 0 ? + (cmp_ssup(b, c, ssup) < 0 ? b : + (cmp_ssup(a, c, ssup) < 0 ? c : a)) + : (cmp_ssup(b, c, ssup) > 0 ? b : + (cmp_ssup(a, c, ssup) < 0 ? 
a : c)); +} + +static void +qsort_ssup(SortTuple *a, size_t n, SortSupport ssup) +{ + SortTuple *pa, + *pb, + *pc, + *pd, + *pl, + *pm, + *pn; + size_t d1, + d2; + int r, + presorted; + +loop: + CHECK_FOR_INTERRUPTS(); + if (n < 7) + { + for (pm = a + 1; pm < a + n; pm++) + for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--) + swap(pl, pl - 1); + return; + } + presorted = 1; + for (pm = a + 1; pm < a + n; pm++) + { + CHECK_FOR_INTERRUPTS(); + if (cmp_ssup(pm - 1, pm, ssup) > 0) + { + presorted = 0; + break; + } + } + if (presorted) + return; + pm = a + (n / 2); + if (n > 7) + { + pl = a; + pn = a + (n - 1); + if (n > 40) + { + size_t d = (n / 8); + + pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup); + pm = med3_ssup(pm - d, pm, pm + d, ssup); + pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup); + } + pm = med3_ssup(pl, pm, pn, ssup); + } + swap(a, pm); + pa = pb = a + 1; + pc = pd = a + (n - 1); + for (;;) + { + while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0) + { + if (r == 0) + { + swap(pa, pb); + pa++; + } + pb++; + CHECK_FOR_INTERRUPTS(); + } + while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0) + { + if (r == 0) + { + swap(pc, pd); + pd--; + } + pc--; + CHECK_FOR_INTERRUPTS(); + } + if (pb > pc) + break; + swap(pb, pc); + pb++; + pc--; + } + pn = a + n; + d1 = Min(pa - a, pb - pa); + vecswap(a, pb - d1, d1); + d1 = Min(pd - pc, pn - pd - 1); + vecswap(pb, pn - d1, d1); + d1 = pb - pa; + d2 = pd - pc; + if (d1 <= d2) + { + /* Recurse on left partition, then iterate on right partition */ + if (d1 > 1) + qsort_ssup(a, d1, ssup); + if (d2 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_ssup(pn - d2, d2, ssup); */ + a = pn - d2; + n = d2; + goto loop; + } + } + else + { + /* Recurse on right partition, then iterate on left partition */ + if (d2 > 1) + qsort_ssup(pn - d2, d2, ssup); + if (d1 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_ssup(a, d1, ssup); */ + n = d1; + goto loop; + } + } +} diff --git a/src/rum.h b/src/rum.h index 1cd4ab0426..2139774d08 100644 --- a/src/rum.h +++ b/src/rum.h @@ -3,8 +3,8 @@ * rum.h * Exported definitions for RUM index. * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 2006-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 2006-2022, PostgreSQL Global Development Group * *------------------------------------------------------------------------- */ @@ -19,9 +19,16 @@ #include "access/sdir.h" #include "lib/rbtree.h" #include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/memutils.h" #include "rumsort.h" +/* RUM distance strategies */ +#define RUM_DISTANCE 20 +#define RUM_LEFT_DISTANCE 21 +#define RUM_RIGHT_DISTANCE 22 + /* * Page opaque data in a inverted index page. 
 *
@@ -159,12 +166,12 @@ typedef struct RumMetaPageData
 	(RumItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \
 	 RumItemPointerGetBlockNumber(p) != InvalidBlockNumber)
 
-typedef struct RumKey
+typedef struct RumItem
 {
 	ItemPointerData iptr;
 	bool		addInfoIsNull;
 	Datum		addInfo;
-} RumKey;
+} RumItem;
 
 #define RumItemSetMin(item) \
 	do { \
@@ -180,7 +187,7 @@ typedef struct
 {
 	/* We use BlockIdData not BlockNumber to avoid padding space wastage */
 	BlockIdData child_blkno;
-	RumKey		key;
+	RumItem		item;
 } PostingItem;
 
 #define PostingItemGetBlockNumber(pointer) \
@@ -257,21 +264,31 @@ typedef signed char RumNullCategory;
 /*
  * Data (posting tree) pages
  */
-#define RumDataPageGetRightBound(page)	((RumKey*) PageGetContents(page))
+/*
+ * FIXME -- Currently RumItem is placed as a page's right bound and PostingItem
+ * is placed as a non-leaf page's item. Both RumItem and PostingItem store
+ * AddInfo as a raw Datum, which is bogus. It is fine for pass-by-value
+ * attributes, but not for pass-by-reference ones, whose data may have a
+ * variable length. This AddInfo is used only by order_by_attach indexes, so
+ * it isn't allowed to create an index ordered over a pass-by-reference
+ * AddInfo, see initRumState(). This can be solved by having a
+ * non-fixed-length right bound and a non-fixed-length non-leaf posting tree
+ * item.
+ */
+#define RumDataPageGetRightBound(page)	((RumItem*) PageGetContents(page))
 #define RumDataPageGetData(page)	\
-	(PageGetContents(page) + MAXALIGN(sizeof(RumKey)))
+	(PageGetContents(page) + MAXALIGN(sizeof(RumItem)))
 #define RumDataPageGetItem(page,i)	\
 	(RumDataPageGetData(page) + ((i)-1) * sizeof(PostingItem))
 
 #define RumDataPageGetFreeSpace(page)	\
 	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
-	 - MAXALIGN(sizeof(RumKey)) /* right bound */ \
+	 - MAXALIGN(sizeof(RumItem)) /* right bound */ \
 	 - RumPageGetOpaque(page)->maxoff * sizeof(PostingItem) \
 	 - MAXALIGN(sizeof(RumPageOpaqueData)))
 
 #define RumMaxLeafDataItems \
 	((BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
-	  MAXALIGN(sizeof(RumKey)) /* right bound */ - \
+	  MAXALIGN(sizeof(RumItem)) /* right bound */ - \
 	  MAXALIGN(sizeof(RumPageOpaqueData))) \
 	 / sizeof(ItemPointerData))
 
@@ -293,7 +310,7 @@ typedef struct
 #define RumDataPageSize	\
 	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
-	 - MAXALIGN(sizeof(RumKey)) /* right bound */ \
+	 - MAXALIGN(sizeof(RumItem)) /* right bound */ \
 	 - MAXALIGN(sizeof(RumPageOpaqueData)) \
 	 - MAXALIGN(sizeof(RumDataLeafItemIndex) * RumDataLeafIndexCount))
 
@@ -310,18 +327,12 @@ typedef struct
 typedef struct RumOptions
 {
 	int32		vl_len_;		/* varlena header (do not touch directly!) */
-	bool		useFastUpdate;	/* use fast updates? */
 	bool		useAlternativeOrder;
-	int			orderByColumn;
+	int			attachColumn;
 	int			addToColumn;
 } RumOptions;
 
 #define ALT_ADD_INFO_NULL_FLAG (0x8000)
 
-#define RUM_DEFAULT_USE_FASTUPDATE	false
-#define RumGetUseFastUpdate(relation) \
-	((relation)->rd_options ? \
-	 ((RumOptions *) (relation)->rd_options)->useFastUpdate : RUM_DEFAULT_USE_FASTUPDATE)
-
 /* Macros for buffer lock/unlock operations */
 #define RUM_UNLOCK	BUFFER_LOCK_UNLOCK
@@ -348,7 +359,7 @@ typedef struct RumState
 	bool		isBuild;
 	bool		oneCol;			/* true if single-column index */
 	bool		useAlternativeOrder;
-	AttrNumber	attrnOrderByColumn;
+	AttrNumber	attrnAttachColumn;
 	AttrNumber	attrnAddToColumn;
 
 	/*
@@ -392,9 +403,19 @@ typedef struct RumState
 	Oid			supportCollation[INDEX_MAX_KEYS];
 } RumState;
 
+/* Accessor for the i'th attribute of tupdesc.
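 * (PostgreSQL 10 replaced direct access to tupdesc->attrs[] with the
 * TupleDescAttr() macro, hence the version check below.)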
*/ +#if PG_VERSION_NUM > 100000 +#define RumTupleDescAttr(tupdesc, i) (TupleDescAttr(tupdesc, i)) +#else +#define RumTupleDescAttr(tupdesc, i) ((tupdesc)->attrs[(i)]) +#endif + /* rumutil.c */ extern bytea *rumoptions(Datum reloptions, bool validate); -extern Datum rumhandler(PG_FUNCTION_ARGS); +extern bool rumproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull); +extern PGDLLEXPORT Datum rumhandler(PG_FUNCTION_ARGS); extern void initRumState(RumState * state, Relation index); extern Buffer RumNewBuffer(Relation index); extern void RumInitBuffer(GenericXLogState *state, Buffer buffer, uint32 flags, @@ -428,13 +449,16 @@ extern void rumbuildempty(Relation index); extern bool ruminsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique +#if PG_VERSION_NUM >= 140000 + , bool indexUnchanged +#endif #if PG_VERSION_NUM >= 100000 , struct IndexInfo *indexInfo #endif ); extern void rumEntryInsert(RumState * rumstate, OffsetNumber attnum, Datum key, RumNullCategory category, - RumKey * items, uint32 nitem, GinStatsData *buildStats); + RumItem * items, uint32 nitem, GinStatsData *buildStats); /* rumbtree.c */ @@ -484,7 +508,7 @@ typedef struct RumBtreeData bool isDelete; /* Data (posting tree) options */ - RumKey *items; + RumItem *items; uint32 nitem; uint32 curitem; @@ -510,7 +534,7 @@ extern void rumEntryFillRoot(RumBtree btree, Buffer root, Buffer lbuf, Buffer rb Page page, Page lpage, Page rpage); extern IndexTuple rumPageGetLinkItup(RumBtree btree, Buffer buf, Page page); extern void rumReadTuple(RumState * rumstate, OffsetNumber attnum, - IndexTuple itup, RumKey * items); + IndexTuple itup, RumItem * items, bool copyAddInfo); extern void rumReadTuplePointers(RumState * rumstate, OffsetNumber attnum, IndexTuple itup, ItemPointerData *ipd); extern void updateItemIndexes(Page page, OffsetNumber attnum, RumState * rumstate); @@ -518,17 +542,17 @@ extern void checkLeafDataPage(RumState * rumstate, AttrNumber attrnum, Page page /* rumdatapage.c */ extern int rumCompareItemPointers(const ItemPointerData *a, const ItemPointerData *b); -extern int compareRumKey(RumState * state, const AttrNumber attno, - const RumKey * a, const RumKey * b); -extern void convertIndexToKey(RumDataLeafItemIndex *src, RumKey *dst); +extern int compareRumItem(RumState * state, const AttrNumber attno, + const RumItem * a, const RumItem * b); +extern void convertIndexToKey(RumDataLeafItemIndex *src, RumItem *dst); extern Pointer rumPlaceToDataPageLeaf(Pointer ptr, OffsetNumber attnum, - RumKey * item, ItemPointer prev, RumState * rumstate); + RumItem * item, ItemPointer prev, RumState * rumstate); extern Size rumCheckPlaceToDataPageLeaf(OffsetNumber attnum, - RumKey * item, ItemPointer prev, RumState * rumstate, Size size); -extern uint32 rumMergeItemPointers(RumState * rumstate, AttrNumber attno, - RumKey * dst, - RumKey * a, uint32 na, - RumKey * b, uint32 nb); + RumItem * item, ItemPointer prev, RumState * rumstate, Size size); +extern uint32 rumMergeRumItems(RumState * rumstate, AttrNumber attno, + RumItem * dst, + RumItem * a, uint32 na, + RumItem * b, uint32 nb); extern void RumDataPageAddItem(Page page, void *data, OffsetNumber offset); extern void RumPageDeletePostingItem(Page page, OffsetNumber offset); @@ -545,15 +569,22 @@ extern RumPostingTreeScan *rumPrepareScanPostingTree(Relation index, extern void rumInsertItemPointers(RumState * rumstate, OffsetNumber attnum, RumPostingTreeScan * gdi, - RumKey 
* items, uint32 nitem,
+						   RumItem * items, uint32 nitem,
 						   GinStatsData *buildStats);
-extern Buffer rumScanBeginPostingTree(RumPostingTreeScan * gdi, RumKey *key);
+extern Buffer rumScanBeginPostingTree(RumPostingTreeScan * gdi, RumItem *item);
 extern void rumDataFillRoot(RumBtree btree, Buffer root, Buffer lbuf, Buffer rbuf,
 				Page page, Page lpage, Page rpage);
 extern void rumPrepareDataScan(RumBtree btree, Relation index,
 				OffsetNumber attnum, RumState * rumstate);
 
 /* rumscan.c */
+typedef struct RumScanItem
+{
+	RumItem		item;
+	Datum		keyValue;
+	RumNullCategory keyCategory;
+} RumScanItem;
+
 /*
  * RumScanKeyData describes a single RUM index qualifier expression.
  *
@@ -585,12 +616,19 @@ typedef struct RumScanKeyData
 	/* array of check flags, reported to consistentFn */
 	bool	   *entryRes;
+	/* array of additional information, used in consistentFn and orderingFn */
 	Datum	   *addInfo;
 	bool	   *addInfoIsNull;
 
+	/* additional information, used in outerOrderingFn */
 	bool		useAddToColumn;
 	Datum		outerAddInfo;
 	bool		outerAddInfoIsNull;
 
+	/* Key information, used in orderingFn */
+	Datum		curKey;
+	RumNullCategory curKeyCategory;
+	bool		useCurKey;
+
 	/* other data needed for calling consistentFn */
 	Datum		query;
 	/* NB: these three arrays have only nuserentries elements! */
@@ -609,15 +647,17 @@ typedef struct RumScanKeyData
 	 * isFinished means that all the input entry streams are finished, so this
 	 * key cannot succeed for any later TIDs.
 	 */
-	RumKey		curItem;
+	RumItem		curItem;
 	bool		curItemMatches;
 	bool		recheckCurItem;
 	bool		isFinished;
 	bool		orderBy;
+	bool		willSort;	/* just a copy of RumScanOpaqueData.willSort */
 	ScanDirection scanDirection;
 
-	RumScanKey *addInfoKeys;
-	int			addInfoNKeys;
+	/* array of keys, used to scan using additional information as keys */
+	RumScanKey *addInfoKeys;
+	uint32		addInfoNKeys;
 } RumScanKeyData;
 
 typedef struct RumScanEntryData
@@ -636,34 +676,43 @@ typedef struct RumScanEntryData
 	Buffer		buffer;
 
 	/* current ItemPointer to heap */
-	RumKey		curRumKey;
+	RumItem		curItem;
+
+	/* Used for ordering using distance */
+	Datum		curKey;
+	RumNullCategory curKeyCategory;
+	bool		useCurKey;
 
-	/* for a partial-match or full-scan query, we accumulate all TIDs here */
-	bool		forceUseBitmap;
-	/* or here if we need to store addinfo */
-	Tuplesortstate *matchSortstate;
-	RumKey		collectRumKey;
+	/*
+	 * For a partial-match or full-scan query, we accumulate all TIDs and
+	 * additional information here
+	 */
+	RumTuplesortstate *matchSortstate;
+	RumScanItem collectRumItem;
 
 	/* for full-scan query with order-by */
 	RumBtreeStack *stack;
 	bool		scanWithAddInfo;
 
 	/* used for Posting list and one page in Posting tree */
-	RumKey	   *list;
-	MemoryContext context;
+	RumItem	   *list;
 	int16		nlist;
 	int16		offset;
 
-	ScanDirection scanDirection;
+	ScanDirection	scanDirection;
 	bool		isFinished;
 	bool		reduceResult;
-	bool		preValue;
 	uint32		predictNumberResult;
+
+	/* used to scan posting tree */
 	RumPostingTreeScan *gdi;
 
+	/* used in fast scan in addition to preConsistentFn */
+	bool		preValue;
+
 	/* Find by AddInfo */
 	bool		useMarkAddInfo;
-	RumKey		markAddInfo;
+	RumItem		markAddInfo;
 } RumScanEntryData;
 
 typedef struct
@@ -682,27 +731,32 @@ typedef enum
 typedef struct RumScanOpaqueData
 {
+	/* tempCtx is used to hold consistent and ordering functions data */
 	MemoryContext tempCtx;
-	MemoryContext keyCtx;		/* used to hold key and entry data */
+	/* keyCtx is used to hold key and entry data */
+	MemoryContext keyCtx;
 	RumState	rumstate;
-	RumScanKey *keys;			/* one per scan qualifier expr */
+	RumScanKey *keys;		/* one per scan qualifier expr */
uint32 nkeys; - int norderbys; - RumScanEntry *entries; /* one per index search condition */ - RumScanEntry *sortedEntries; /* one per index search condition */ - int entriesIncrIndex; + RumScanEntry *entries; /* one per index search condition */ + RumScanEntry *sortedEntries; /* Sorted entries. Used in fast scan */ + int entriesIncrIndex; /* used in fast scan */ uint32 totalentries; - uint32 allocentries; /* allocated length of entries[] */ + uint32 allocentries; /* allocated length of entries[] and + sortedEntries[] */ - Tuplesortstate *sortstate; + RumTuplesortstate *sortstate; + int norderbys; /* Number of columns in ordering. + Will be assigned to sortstate->nKeys */ - RumKey key; + RumItem item; /* current item used in index scan */ bool firstCall; + bool isVoidRes; /* true if query is unsatisfiable */ + bool willSort; /* is there any columns in ordering */ RumScanType scanType; - TIDBitmap *tbm; ScanDirection naturalOrder; bool secondPass; @@ -734,14 +788,18 @@ extern IndexBulkDeleteResult *rumvacuumcleanup(IndexVacuumInfo *info, extern bool rumvalidate(Oid opclassoid); /* rumbulk.c */ +#if PG_VERSION_NUM <= 100006 || PG_VERSION_NUM == 110000 +typedef RBNode RBTNode; +#endif + typedef struct RumEntryAccumulator { - RBNode rbnode; + RBTNode rbnode; Datum key; RumNullCategory category; OffsetNumber attnum; bool shouldSort; - RumKey *list; + RumItem *list; uint32 maxcount; /* allocated size of list[] */ uint32 count; /* current number of list[] entries */ } RumEntryAccumulator; @@ -756,7 +814,7 @@ typedef struct #if PG_VERSION_NUM >= 100000 RBTreeIterator tree_walk; #endif - RumKey *sortSpace; + RumItem *sortSpace; uint32 sortSpaceN; } BuildAccumulator; @@ -766,7 +824,7 @@ extern void rumInsertBAEntries(BuildAccumulator *accum, Datum *entries, Datum *addInfo, bool *addInfoIsNull, RumNullCategory * categories, int32 nentries); extern void rumBeginBAScan(BuildAccumulator *accum); -extern RumKey *rumGetBAEntry(BuildAccumulator *accum, +extern RumItem *rumGetBAEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *key, RumNullCategory * category, uint32 *n); @@ -778,20 +836,42 @@ extern RumKey *rumGetBAEntry(BuildAccumulator *accum, #define RUM_ADDINFO_JOIN 10 #define RUMNProcs 10 -extern Datum rum_extract_tsvector(PG_FUNCTION_ARGS); -extern Datum rum_extract_tsquery(PG_FUNCTION_ARGS); -extern Datum rum_tsvector_config(PG_FUNCTION_ARGS); -extern Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS); -extern Datum rum_tsquery_distance(PG_FUNCTION_ARGS); -extern Datum rum_ts_distance_tt(PG_FUNCTION_ARGS); -extern Datum rum_ts_distance_ttf(PG_FUNCTION_ARGS); -extern Datum rum_ts_distance_td(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_tsvector(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_tsquery(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_tsvector_config(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_tsquery_distance(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_ts_distance_tt(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_ts_distance_ttf(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_ts_distance_td(PG_FUNCTION_ARGS); + +extern PGDLLEXPORT Datum tsquery_to_distance_query(PG_FUNCTION_ARGS); + +/* rum_arr_utils.c */ +typedef enum SimilarityType +{ + SMT_COSINE = 1, + SMT_JACCARD = 2, + SMT_OVERLAP = 3 +} SimilarityType; + +#define RUM_SIMILARITY_FUNCTION_DEFAULT SMT_COSINE +#define RUM_SIMILARITY_THRESHOLD_DEFAULT 0.5 -extern Datum tsquery_to_distance_query(PG_FUNCTION_ARGS); +extern 
PGDLLEXPORT Datum rum_anyarray_config(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_anyarray(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_anyarray_query(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_consistent(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_ordering(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_similar(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_distance(PG_FUNCTION_ARGS); /* GUC parameters */ -extern PGDLLIMPORT int RumFuzzySearchLimit; +extern int RumFuzzySearchLimit; +extern float8 RumArraySimilarityThreshold; +extern int RumArraySimilarityFunction; + /* * Functions for reading ItemPointers with additional information. Used in @@ -866,10 +946,14 @@ rumDataPageLeafReadItemPointer(char *ptr, ItemPointer iptr, bool *addInfoIsNull) * Reads next item pointer and additional information from leaf data page. * Replaces current item pointer with the next one. Zero item pointer should be * passed in order to read the first item pointer. + * + * It is necessary to pass copyAddInfo=true if additional information is used + * when the data page is unlocked. If the additional information is used without + * locking one can get unexpected behaviour. */ static inline Pointer -rumDataPageLeafRead(Pointer ptr, OffsetNumber attnum, RumKey * item, - RumState * rumstate) +rumDataPageLeafRead(Pointer ptr, OffsetNumber attnum, RumItem * item, + bool copyAddInfo, RumState * rumstate) { Form_pg_attribute attr; @@ -934,8 +1018,13 @@ rumDataPageLeafRead(Pointer ptr, OffsetNumber attnum, RumKey * item, } else { - ptr = (Pointer) att_align_pointer(ptr, attr->attalign, attr->attlen, ptr); - item->addInfo = fetch_att(ptr, attr->attbyval, attr->attlen); + Datum addInfo; + + ptr = (Pointer) att_align_pointer(ptr, attr->attalign, attr->attlen, + ptr); + addInfo = fetch_att(ptr, attr->attbyval, attr->attlen); + item->addInfo = copyAddInfo ? + datumCopy(addInfo, attr->attbyval, attr->attlen) : addInfo; } ptr = (Pointer) att_addlength_pointer(ptr, attr->attlen, ptr); @@ -949,7 +1038,7 @@ rumDataPageLeafRead(Pointer ptr, OffsetNumber attnum, RumKey * item, * passed in order to read the first item pointer. 
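 * Unlike rumDataPageLeafRead() above, this variant returns just the item
 * pointer and skips over any additional information instead of copying it.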
*/ static inline Pointer -rumDataPageLeafReadPointer(Pointer ptr, OffsetNumber attnum, RumKey * item, +rumDataPageLeafReadPointer(Pointer ptr, OffsetNumber attnum, RumItem * item, RumState * rumstate) { Form_pg_attribute attr; @@ -996,4 +1085,22 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg6, Datum arg7, Datum arg8, Datum arg9, Datum arg10); +/* PostgreSQL version-agnostic creation of memory context */ +#if PG_VERSION_NUM >= 120000 +#define RumContextCreate(parent, name) \ + AllocSetContextCreate(parent, name, ALLOCSET_DEFAULT_SIZES) +#elif PG_VERSION_NUM >= 110000 + #define RumContextCreate(parent, name) \ + AllocSetContextCreateExtended(parent, name, \ + ALLOCSET_DEFAULT_MINSIZE, \ + ALLOCSET_DEFAULT_INITSIZE, \ + ALLOCSET_DEFAULT_MAXSIZE) +#else + #define RumContextCreate(parent, name) \ + AllocSetContextCreate(parent, name, \ + ALLOCSET_DEFAULT_MINSIZE, \ + ALLOCSET_DEFAULT_INITSIZE, \ + ALLOCSET_DEFAULT_MAXSIZE) +#endif + #endif /* __RUM_H__ */ diff --git a/src/rum_arr_utils.c b/src/rum_arr_utils.c new file mode 100644 index 0000000000..d8dc00699a --- /dev/null +++ b/src/rum_arr_utils.c @@ -0,0 +1,888 @@ +/*------------------------------------------------------------------------- + * + * rum_arr_utils.c + * various anyarray-search functions + * + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/pg_am.h" +#include "catalog/pg_cast.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_type.h" +#include "commands/defrem.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#if PG_VERSION_NUM >= 120000 +#include "utils/float.h" +#endif +#include "utils/lsyscache.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + +#include "rum.h" + +#include +#include + + +#define RUM_OVERLAP_STRATEGY 1 +#define RUM_CONTAINS_STRATEGY 2 +#define RUM_CONTAINED_STRATEGY 3 +#define RUM_EQUAL_STRATEGY 4 +#define RUM_SIMILAR_STRATEGY 5 + + +#define NDIM 1 + +#define ARR_NELEMS(x) ArrayGetNItems(ARR_NDIM(x), ARR_DIMS(x)) +#define ARR_ISVOID(x) ( (x) == NULL || ARR_NELEMS(x) == 0 ) + +#define CHECKARRVALID(x) \ + do { \ + if (x == NULL) \ + ereport(ERROR, \ + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), \ + errmsg("array must not be NULL"))); \ + else if (x) { \ + if (ARR_NDIM(x) != NDIM && ARR_NDIM(x) != 0) \ + ereport(ERROR, \ + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), \ + errmsg("array must have 1 dimension"))); \ + if (ARR_HASNULL(x)) \ + ereport(ERROR, \ + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), \ + errmsg("array must not contain nulls"))); \ + } \ + } while (0) + +#define INIT_DUMMY_SIMPLE_ARRAY(s, len) \ + do { \ + (s)->elems = NULL; \ + (s)->hashedElems = NULL; \ + (s)->nelems = (len); \ + (s)->nHashedElems = -1; \ + (s)->info = NULL; \ + } while (0) + +#define DIST_FROM_SML(sml) \ + ( (sml == 0.0) ? 
get_float8_infinity() : ((float8) 1) / ((float8) (sml)) ) + +#if PG_VERSION_NUM < 110000 +#define HASHSTANDARD_PROC HASHPROC +#endif + + +typedef struct AnyArrayTypeInfo +{ + Oid typid; + int16 typlen; + bool typbyval; + char typalign; + MemoryContext funcCtx; + Oid cmpFuncOid; + bool cmpFuncInited; + FmgrInfo cmpFunc; + bool hashFuncInited; + Oid hashFuncOid; + FmgrInfo hashFunc; +} AnyArrayTypeInfo; + +typedef struct SimpleArray +{ + Datum *elems; + int32 *hashedElems; + int32 nelems; + int32 nHashedElems; + AnyArrayTypeInfo *info; +} SimpleArray; + + +#if PG_VERSION_NUM < 110000 +#define SearchSysCacheList(A, B, C, D, E) \ +SearchSysCacheList(A, B, C, D, E, 0) +#endif + + +float8 RumArraySimilarityThreshold = RUM_SIMILARITY_THRESHOLD_DEFAULT; +int RumArraySimilarityFunction = RUM_SIMILARITY_FUNCTION_DEFAULT; + + +PG_FUNCTION_INFO_V1(rum_anyarray_config); + +PG_FUNCTION_INFO_V1(rum_extract_anyarray); +PG_FUNCTION_INFO_V1(rum_extract_anyarray_query); + +PG_FUNCTION_INFO_V1(rum_anyarray_consistent); + +PG_FUNCTION_INFO_V1(rum_anyarray_ordering); +PG_FUNCTION_INFO_V1(rum_anyarray_similar); +PG_FUNCTION_INFO_V1(rum_anyarray_distance); + + +static Oid getAMProc(Oid amOid, Oid typid); + +static AnyArrayTypeInfo *getAnyArrayTypeInfo(MemoryContext ctx, Oid typid); +static AnyArrayTypeInfo *getAnyArrayTypeInfoCached(FunctionCallInfo fcinfo, Oid typid); +static void freeAnyArrayTypeInfo(AnyArrayTypeInfo *info); +static void cmpFuncInit(AnyArrayTypeInfo *info); + +static SimpleArray *Array2SimpleArray(AnyArrayTypeInfo *info, ArrayType *a); +static void freeSimpleArray(SimpleArray *s); +static int cmpAscArrayElem(const void *a, const void *b, void *arg); +static int cmpDescArrayElem(const void *a, const void *b, void *arg); +static void sortSimpleArray(SimpleArray *s, int32 direction); +static void uniqSimpleArray(SimpleArray *s, bool onlyDuplicate); + +static int32 getNumOfIntersect(SimpleArray *sa, SimpleArray *sb); +static float8 getSimilarity(SimpleArray *sa, SimpleArray *sb, int32 intersection); + + +/* + * Specifies additional information type for operator class. 
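+ * + * For the anyarray operator classes the additional information stored with + * each key is the indexed array's element count (int4): rum_extract_anyarray() + * below fills it in, and the consistent/ordering functions read it back to + * compute similarity.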
+ */ +Datum +rum_anyarray_config(PG_FUNCTION_ARGS) +{ + RumConfig *config = (RumConfig *) PG_GETARG_POINTER(0); + + config->addInfoTypeOid = INT4OID; + config->strategyInfo[0].strategy = InvalidStrategy; + + PG_RETURN_VOID(); +} + + +/* + * Extract entries and queries + */ + +/* Enhanced version of ginarrayextract() */ +Datum +rum_extract_anyarray(PG_FUNCTION_ARGS) +{ + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + SimpleArray *sa; + AnyArrayTypeInfo *info; + + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + + Datum **addInfo = (Datum **) PG_GETARG_POINTER(3); + bool **addInfoIsNull = (bool **) PG_GETARG_POINTER(4); + + int i; + + CHECKARRVALID(array); + + info = getAnyArrayTypeInfoCached(fcinfo, ARR_ELEMTYPE(array)); + + sa = Array2SimpleArray(info, array); + sortSimpleArray(sa, 1); + uniqSimpleArray(sa, false); + + *nentries = sa->nelems; + *addInfo = (Datum *) palloc(*nentries * sizeof(Datum)); + *addInfoIsNull = (bool *) palloc(*nentries * sizeof(bool)); + + for (i = 0; i < *nentries; i++) + { + /* Use array's size as additional info */ + (*addInfo)[i] = Int32GetDatum(*nentries); + (*addInfoIsNull)[i] = BoolGetDatum(false); + } + + /* we should not free array, entries[i] points into it */ + PG_RETURN_POINTER(sa->elems); +} + +/* Enhanced version of ginqueryarrayextract() */ +Datum +rum_extract_anyarray_query(PG_FUNCTION_ARGS) +{ + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + SimpleArray *sa; + AnyArrayTypeInfo *info; + + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + + StrategyNumber strategy = PG_GETARG_UINT16(2); + int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); + + CHECKARRVALID(array); + + info = getAnyArrayTypeInfoCached(fcinfo, ARR_ELEMTYPE(array)); + + sa = Array2SimpleArray(info, array); + sortSimpleArray(sa, 1); + uniqSimpleArray(sa, false); + + *nentries = sa->nelems; + + switch (strategy) + { + case RUM_OVERLAP_STRATEGY: + *searchMode = GIN_SEARCH_MODE_DEFAULT; + break; + case RUM_CONTAINS_STRATEGY: + if (*nentries > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else /* everything contains the empty set */ + *searchMode = GIN_SEARCH_MODE_ALL; + break; + case RUM_CONTAINED_STRATEGY: + /* empty set is contained in everything */ + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + case RUM_EQUAL_STRATEGY: + if (*nentries > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + case RUM_SIMILAR_STRATEGY: + *searchMode = GIN_SEARCH_MODE_DEFAULT; + break; + /* Special case for distance */ + case RUM_DISTANCE: + *searchMode = GIN_SEARCH_MODE_DEFAULT; + break; + default: + elog(ERROR, "rum_extract_anyarray_query: unknown strategy number: %d", + strategy); + } + + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(sa->elems); +} + + +/* + * Consistency check + */ + +/* Enhanced version of ginarrayconsistent() */ +Datum +rum_anyarray_consistent(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + + StrategyNumber strategy = PG_GETARG_UINT16(1); + + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + bool *recheck = (bool *) PG_GETARG_POINTER(5); + + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(6); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(7); + + Datum *addInfo = (Datum *) PG_GETARG_POINTER(8); 
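+ /* Each non-null addInfo[i] carries the indexed array's element count, + * as stored by rum_extract_anyarray(). */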
+ bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9); + + bool res; + int32 i; + + switch (strategy) + { + case RUM_OVERLAP_STRATEGY: + /* result is not lossy */ + *recheck = false; + /* must have a match for at least one non-null element */ + res = false; + for (i = 0; i < nkeys; i++) + { + if (check[i] && !nullFlags[i]) + { + res = true; + break; + } + } + break; + case RUM_CONTAINS_STRATEGY: + /* result is not lossy */ + *recheck = false; + + /* must have all elements in check[] true, and no nulls */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i] || nullFlags[i]) + { + res = false; + break; + } + } + break; + case RUM_CONTAINED_STRATEGY: + /* we will need recheck */ + *recheck = true; + + /* the indexed array must not have more elements than the query */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!addInfoIsNull[i] && DatumGetInt32(addInfo[i]) > nkeys) + { + res = false; + break; + } + } + break; + case RUM_EQUAL_STRATEGY: + /* we will need recheck */ + *recheck = true; + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... + * + * Also, the query and the array must have equal numbers of elements. + */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i]) + { + res = false; + break; + } + + if (!addInfoIsNull[i] && DatumGetInt32(addInfo[i]) != nkeys) + { + res = false; + break; + } + } + break; + case RUM_SIMILAR_STRATEGY: + /* we won't need recheck */ + *recheck = false; + + { + int32 intersection = 0, + nentries = -1; + SimpleArray sa, sb; + + for (i = 0; i < nkeys; i++) + if (check[i]) + intersection++; + + if (intersection > 0) + { + float8 sml; + + /* extract array's length from addInfo */ + for (i = 0; i < nkeys; i++) + { + if (!addInfoIsNull[i]) + { + nentries = DatumGetInt32(addInfo[i]); + break; + } + } + + /* there must be addInfo */ + Assert(nentries >= 0); + + INIT_DUMMY_SIMPLE_ARRAY(&sa, nentries); + INIT_DUMMY_SIMPLE_ARRAY(&sb, nkeys); + sml = getSimilarity(&sa, &sb, intersection); + + res = (sml >= RumArraySimilarityThreshold); + } + else + res = false; + } + break; + default: + elog(ERROR, "rum_anyarray_consistent: unknown strategy number: %d", + strategy); + res = false; + } + + PG_RETURN_BOOL(res); +} + + +/* + * Similarity and distance + */ + +Datum +rum_anyarray_ordering(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + int nkeys = PG_GETARG_INT32(3); + Datum *addInfo = (Datum *) PG_GETARG_POINTER(8); + bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9); + + float8 sml; + int32 intersection = 0, + nentries = -1; + int i; + + SimpleArray sa, sb; + + for (i = 0; i < nkeys; i++) + if (check[i]) + intersection++; + + if (intersection > 0) + { + /* extract array's length from addInfo */ + for (i = 0; i < nkeys; i++) + { + if (!addInfoIsNull[i]) + { + nentries = DatumGetInt32(addInfo[i]); + break; + } + } + + /* there must be addInfo */ + Assert(nentries >= 0); + + INIT_DUMMY_SIMPLE_ARRAY(&sa, nentries); + INIT_DUMMY_SIMPLE_ARRAY(&sb, nkeys); + sml = getSimilarity(&sa, &sb, intersection); + + PG_RETURN_FLOAT8(DIST_FROM_SML(sml)); + } + + PG_RETURN_FLOAT8(DIST_FROM_SML(0.0)); +} + +Datum +rum_anyarray_similar(PG_FUNCTION_ARGS) +{ + ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); + AnyArrayTypeInfo *info; + SimpleArray *sa, + *sb; + float8 result = 0.0; + + CHECKARRVALID(a); + CHECKARRVALID(b); + + if (ARR_ELEMTYPE(a) != ARR_ELEMTYPE(b)) + ereport(ERROR, +
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("array types do not match"))); + + if (ARR_ISVOID(a) || ARR_ISVOID(b)) + PG_RETURN_BOOL(false); + + if (fcinfo->flinfo->fn_extra == NULL) + fcinfo->flinfo->fn_extra = getAnyArrayTypeInfo(fcinfo->flinfo->fn_mcxt, + ARR_ELEMTYPE(a)); + info = (AnyArrayTypeInfo *) fcinfo->flinfo->fn_extra; + + sa = Array2SimpleArray(info, a); + sb = Array2SimpleArray(info, b); + + result = getSimilarity(sa, sb, getNumOfIntersect(sa, sb)); + + freeSimpleArray(sb); + freeSimpleArray(sa); + + PG_FREE_IF_COPY(b, 1); + PG_FREE_IF_COPY(a, 0); + + PG_RETURN_BOOL(result >= RumArraySimilarityThreshold); +} + +Datum +rum_anyarray_distance(PG_FUNCTION_ARGS) +{ + ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); + AnyArrayTypeInfo *info; + SimpleArray *sa, + *sb; + float8 sml = 0.0; + + CHECKARRVALID(a); + CHECKARRVALID(b); + + if (ARR_ELEMTYPE(a) != ARR_ELEMTYPE(b)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("array types do not match"))); + + if (ARR_ISVOID(a) || ARR_ISVOID(b)) + PG_RETURN_FLOAT8(0.0); + + if (fcinfo->flinfo->fn_extra == NULL) + fcinfo->flinfo->fn_extra = getAnyArrayTypeInfo(fcinfo->flinfo->fn_mcxt, + ARR_ELEMTYPE(a)); + info = (AnyArrayTypeInfo *) fcinfo->flinfo->fn_extra; + + sa = Array2SimpleArray(info, a); + sb = Array2SimpleArray(info, b); + + sml = getSimilarity(sa, sb, getNumOfIntersect(sa, sb)); + + freeSimpleArray(sb); + freeSimpleArray(sa); + + PG_FREE_IF_COPY(b, 1); + PG_FREE_IF_COPY(a, 0); + + PG_RETURN_FLOAT8(DIST_FROM_SML(sml)); +} + + +/* + * Convenience routines + */ + +static Oid +getAMProc(Oid amOid, Oid typid) +{ + Oid opclassOid = GetDefaultOpClass(typid, amOid); + Oid procOid; + + Assert(amOid == BTREE_AM_OID || amOid == HASH_AM_OID); + + if (!OidIsValid(opclassOid)) + { + typid = getBaseType(typid); + opclassOid = GetDefaultOpClass(typid, amOid); + + + if (!OidIsValid(opclassOid)) + { + CatCList *catlist; + int i; + + /* + * Search binary-coercible type + */ + catlist = SearchSysCacheList(CASTSOURCETARGET, 1, + ObjectIdGetDatum(typid), + 0, 0); + for (i = 0; i < catlist->n_members; i++) + { + HeapTuple tuple = &catlist->members[i]->tuple; + Form_pg_cast castForm = (Form_pg_cast)GETSTRUCT(tuple); + + if (castForm->castmethod == COERCION_METHOD_BINARY) + { + typid = castForm->casttarget; + opclassOid = GetDefaultOpClass(typid, amOid); + if(OidIsValid(opclassOid)) + break; + } + } + + ReleaseSysCacheList(catlist); + } + } + + if (!OidIsValid(opclassOid)) + return InvalidOid; + + procOid = get_opfamily_proc(get_opclass_family(opclassOid), + typid, typid, + (amOid == BTREE_AM_OID) ? BTORDER_PROC : HASHSTANDARD_PROC); + + if (!OidIsValid(procOid)) + { + typid = get_opclass_input_type(opclassOid); + + procOid = get_opfamily_proc(get_opclass_family(opclassOid), + typid, typid, + (amOid == BTREE_AM_OID) ? 
BTORDER_PROC : HASHSTANDARD_PROC); + } + + return procOid; +} + + +/* + * AnyArrayTypeInfo functions + */ + +static AnyArrayTypeInfo * +getAnyArrayTypeInfo(MemoryContext ctx, Oid typid) +{ + AnyArrayTypeInfo *info; + + info = MemoryContextAlloc(ctx, sizeof(*info)); + + info->typid = typid; + info->cmpFuncOid = InvalidOid; + info->hashFuncOid = InvalidOid; + info->cmpFuncInited = false; + info->hashFuncInited = false; + info->funcCtx = ctx; + + get_typlenbyvalalign(typid, &info->typlen, &info->typbyval, &info->typalign); + + return info; +} + +static AnyArrayTypeInfo * +getAnyArrayTypeInfoCached(FunctionCallInfo fcinfo, Oid typid) +{ + AnyArrayTypeInfo *info = NULL; + + info = (AnyArrayTypeInfo*)fcinfo->flinfo->fn_extra; + + if (info == NULL || info->typid != typid) + { + freeAnyArrayTypeInfo(info); + info = getAnyArrayTypeInfo(fcinfo->flinfo->fn_mcxt, typid); + fcinfo->flinfo->fn_extra = info; + } + + return info; +} + +static void +freeAnyArrayTypeInfo(AnyArrayTypeInfo *info) +{ + if (info) + { + /* + * there is no way to cleanup FmgrInfo... + */ + pfree(info); + } +} + +static void +cmpFuncInit(AnyArrayTypeInfo *info) +{ + if (info->cmpFuncInited == false) + { + if (!OidIsValid(info->cmpFuncOid)) + { + info->cmpFuncOid = getAMProc(BTREE_AM_OID, info->typid); + + if (!OidIsValid(info->cmpFuncOid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not find compare function"))); + } + + fmgr_info_cxt(info->cmpFuncOid, &info->cmpFunc, info->funcCtx); + info->cmpFuncInited = true; + } +} + + + +/* + * SimpleArray functions + */ + +static SimpleArray * +Array2SimpleArray(AnyArrayTypeInfo *info, ArrayType *a) +{ + SimpleArray *s = palloc(sizeof(SimpleArray)); + + CHECKARRVALID(a); + + s->info = info; + s->nHashedElems = 0; + s->hashedElems = NULL; + + if (ARR_ISVOID(a)) + { + s->elems = NULL; + s->nelems = 0; + } + else + { + deconstruct_array(a, info->typid, + info->typlen, info->typbyval, info->typalign, + &s->elems, NULL, &s->nelems); + } + + return s; +} + +static void +freeSimpleArray(SimpleArray *s) +{ + if (s) + { + if (s->elems) + pfree(s->elems); + if (s->hashedElems) + pfree(s->hashedElems); + pfree(s); + } +} + +static int +cmpAscArrayElem(const void *a, const void *b, void *arg) +{ + FmgrInfo *cmpFunc = (FmgrInfo*)arg; + + Assert(a && b); + return DatumGetInt32(FunctionCall2Coll(cmpFunc, DEFAULT_COLLATION_OID, *(Datum*)a, *(Datum*)b)); +} + +static int +cmpDescArrayElem(const void *a, const void *b, void *arg) +{ + FmgrInfo *cmpFunc = (FmgrInfo*)arg; + + return -DatumGetInt32(FunctionCall2Coll(cmpFunc, DEFAULT_COLLATION_OID, *(Datum*)a, *(Datum*)b)); +} + +static void +sortSimpleArray(SimpleArray *s, int32 direction) +{ + AnyArrayTypeInfo *info = s->info; + + cmpFuncInit(info); + + if (s->nelems > 1) + { + qsort_arg(s->elems, s->nelems, sizeof(Datum), + (direction > 0) ? 
cmpAscArrayElem : cmpDescArrayElem, + &info->cmpFunc); + } +} + +static void +uniqSimpleArray(SimpleArray *s, bool onlyDuplicate) +{ + AnyArrayTypeInfo *info = s->info; + + cmpFuncInit(info); + + if (s->nelems > 1) + { + Datum *tmp, *dr; + int32 num = s->nelems; + + if (onlyDuplicate) + { + Datum *head = s->elems; + + dr = s->elems; + tmp = s->elems + 1; + + while (tmp - s->elems < num) + { + while (tmp - s->elems < num && cmpAscArrayElem(tmp, dr, &info->cmpFunc) == 0) + tmp++; + + if (tmp - dr > 1) + { + *head = *dr; + head++; + } + dr = tmp; + } + + s->nelems = head - s->elems; + } + else + { + dr = s->elems; + tmp = s->elems + 1; + + while (tmp - s->elems < num) + { + if (cmpAscArrayElem(tmp, dr, &info->cmpFunc) != 0 ) + *(++dr) = *tmp++; + else + tmp++; + } + + s->nelems = dr + 1 - s->elems; + } + } + else if (onlyDuplicate) + { + s->nelems = 0; + } +} + + +/* + * Similarity calculation + */ + +static int32 +getNumOfIntersect(SimpleArray *sa, SimpleArray *sb) +{ + int32 cnt = 0; + int cmp; + Datum *aptr = sa->elems, + *bptr = sb->elems; + AnyArrayTypeInfo *info = sa->info; + + cmpFuncInit(info); + + sortSimpleArray(sa, 1); + uniqSimpleArray(sa, false); + sortSimpleArray(sb, 1); + uniqSimpleArray(sb, false); + + while(aptr - sa->elems < sa->nelems && bptr - sb->elems < sb->nelems) + { + cmp = cmpAscArrayElem(aptr, bptr, &info->cmpFunc); + + if (cmp < 0) + aptr++; + else if (cmp > 0) + bptr++; + else + { + cnt++; + aptr++; + bptr++; + } + } + + return cnt; +} + +static float8 +getSimilarity(SimpleArray *sa, SimpleArray *sb, int32 intersection) +{ + float8 result = 0.0; + + switch (RumArraySimilarityFunction) + { + case SMT_COSINE: + result = ((float8) intersection) / + sqrt(((float8) sa->nelems) * ((float8) sb->nelems)); + break; + case SMT_JACCARD: + result = ((float8) intersection) / + (((float8) sa->nelems) + + ((float8) sb->nelems) - + ((float8) intersection)); + break; + case SMT_OVERLAP: + result = intersection; + break; + default: + elog(ERROR, "unknown similarity type"); + } + + return result; +} diff --git a/src/rum_timestamp.c b/src/rum_timestamp.c deleted file mode 100644 index b4a31811df..0000000000 --- a/src/rum_timestamp.c +++ /dev/null @@ -1,277 +0,0 @@ -#include "postgres.h" - -#include - -#include "access/stratnum.h" -#include "rum.h" -#include "utils/builtins.h" -#include "utils/timestamp.h" - -#define RUM_TMST_DISTANCE 20 -#define RUM_TMST_LEFT_DISTANCE 21 -#define RUM_TMST_RIGHT_DISTANCE 22 - -typedef struct QueryInfo -{ - StrategyNumber strategy; - Datum datum; -} QueryInfo; - - -PG_FUNCTION_INFO_V1(rum_timestamp_extract_value); -Datum -rum_timestamp_extract_value(PG_FUNCTION_ARGS) -{ - Datum datum = PG_GETARG_DATUM(0); - int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - Datum *entries = (Datum *) palloc(sizeof(Datum)); - - entries[0] = datum; - *nentries = 1; - - PG_RETURN_POINTER(entries); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_extract_query); -Datum -rum_timestamp_extract_query(PG_FUNCTION_ARGS) -{ - Datum datum = PG_GETARG_DATUM(0); - int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - StrategyNumber strategy = PG_GETARG_UINT16(2); - bool **partialmatch = (bool **) PG_GETARG_POINTER(3); - Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4); - Datum *entries = (Datum *) palloc(sizeof(Datum)); - QueryInfo *data = (QueryInfo *) palloc(sizeof(QueryInfo)); - bool *ptr_partialmatch; - - *nentries = 1; - ptr_partialmatch = *partialmatch = (bool *) palloc(sizeof(bool)); - *ptr_partialmatch = false; - data->strategy = strategy; - data->datum = datum; - *extra_data 
= (Pointer *) palloc(sizeof(Pointer)); - **extra_data = (Pointer) data; - - switch (strategy) - { - case BTLessStrategyNumber: - case BTLessEqualStrategyNumber: - entries[0] = TimestampGetDatum(DT_NOBEGIN); /* leftmost */ - *ptr_partialmatch = true; - break; - case BTGreaterEqualStrategyNumber: - case BTGreaterStrategyNumber: - *ptr_partialmatch = true; - case BTEqualStrategyNumber: - case RUM_TMST_DISTANCE: - case RUM_TMST_LEFT_DISTANCE: - case RUM_TMST_RIGHT_DISTANCE: - entries[0] = datum; - break; - default: - elog(ERROR, "unrecognized strategy number: %d", strategy); - } - - PG_RETURN_POINTER(entries); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_compare_prefix); -Datum -rum_timestamp_compare_prefix(PG_FUNCTION_ARGS) -{ - Datum a = PG_GETARG_DATUM(0); - Datum b = PG_GETARG_DATUM(1); - QueryInfo *data = (QueryInfo *) PG_GETARG_POINTER(3); - int32 res, - cmp; - - cmp = DatumGetInt32(DirectFunctionCall2Coll(timestamp_cmp, - PG_GET_COLLATION(), - (data->strategy == BTLessStrategyNumber || - data->strategy == BTLessEqualStrategyNumber) - ? data->datum : a, b)); - - switch (data->strategy) - { - case BTLessStrategyNumber: - /* If original datum > indexed one then return match */ - if (cmp > 0) - res = 0; - else - res = 1; - break; - case BTLessEqualStrategyNumber: - /* The same except equality */ - if (cmp >= 0) - res = 0; - else - res = 1; - break; - case BTEqualStrategyNumber: - if (cmp != 0) - res = 1; - else - res = 0; - break; - case BTGreaterEqualStrategyNumber: - /* If original datum <= indexed one then return match */ - if (cmp <= 0) - res = 0; - else - res = 1; - break; - case BTGreaterStrategyNumber: - /* If original datum <= indexed one then return match */ - /* If original datum == indexed one then continue scan */ - if (cmp < 0) - res = 0; - else if (cmp == 0) - res = -1; - else - res = 1; - break; - default: - elog(ERROR, "unrecognized strategy number: %d", data->strategy); - res = 0; - } - - PG_RETURN_INT32(res); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_consistent); -Datum -rum_timestamp_consistent(PG_FUNCTION_ARGS) -{ - bool *recheck = (bool *) PG_GETARG_POINTER(5); - - *recheck = false; - PG_RETURN_BOOL(true); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_distance); -Datum -rum_timestamp_distance(PG_FUNCTION_ARGS) -{ - Timestamp dt1 = PG_GETARG_TIMESTAMP(0); - Timestamp dt2 = PG_GETARG_TIMESTAMP(1); - double diff; - - if (TIMESTAMP_NOT_FINITE(dt1) || TIMESTAMP_NOT_FINITE(dt2)) - { - if (TIMESTAMP_NOT_FINITE(dt1) && TIMESTAMP_NOT_FINITE(dt2)) - diff = 0; - else - diff = get_float8_infinity(); - } - else - { - /* see timestamp_mi */ - diff = (dt1 > dt2) ? dt1 - dt2 : dt2 - dt1; - diff /= 1e6; - } - - PG_RETURN_FLOAT8(diff); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_left_distance); -Datum -rum_timestamp_left_distance(PG_FUNCTION_ARGS) -{ - Timestamp dt1 = PG_GETARG_TIMESTAMP(0); - Timestamp dt2 = PG_GETARG_TIMESTAMP(1); - double diff; - - if (TIMESTAMP_NOT_FINITE(dt1) || TIMESTAMP_NOT_FINITE(dt2)) - { - if (TIMESTAMP_NOT_FINITE(dt1) && TIMESTAMP_NOT_FINITE(dt2)) - diff = 0; - else - diff = get_float8_infinity(); - } - else - { - /* see timestamp_mi */ - diff = (dt1 > dt2) ? 
get_float8_infinity() : dt2 - dt1; - diff /= 1e6; - } - - PG_RETURN_FLOAT8(diff); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_right_distance); -Datum -rum_timestamp_right_distance(PG_FUNCTION_ARGS) -{ - Timestamp dt1 = PG_GETARG_TIMESTAMP(0); - Timestamp dt2 = PG_GETARG_TIMESTAMP(1); - double diff; - - if (TIMESTAMP_NOT_FINITE(dt1) || TIMESTAMP_NOT_FINITE(dt2)) - { - if (TIMESTAMP_NOT_FINITE(dt1) && TIMESTAMP_NOT_FINITE(dt2)) - diff = 0; - else - diff = get_float8_infinity(); - } - else - { - /* see timestamp_mi */ - diff = (dt1 > dt2) ? dt1 - dt2 : get_float8_infinity(); - diff /= 1e6; - } - - PG_RETURN_FLOAT8(diff); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_outer_distance); -Datum -rum_timestamp_outer_distance(PG_FUNCTION_ARGS) -{ - StrategyNumber strategy = PG_GETARG_UINT16(2); - Datum diff; - - switch (strategy) - { - case RUM_TMST_DISTANCE: - diff = DirectFunctionCall2(rum_timestamp_distance, - PG_GETARG_DATUM(0), - PG_GETARG_DATUM(1)); - break; - case RUM_TMST_LEFT_DISTANCE: - diff = DirectFunctionCall2(rum_timestamp_left_distance, - PG_GETARG_DATUM(0), - PG_GETARG_DATUM(1)); - break; - case RUM_TMST_RIGHT_DISTANCE: - diff = DirectFunctionCall2(rum_timestamp_right_distance, - PG_GETARG_DATUM(0), - PG_GETARG_DATUM(1)); - break; - default: - elog(ERROR, "rum_timestamp_outer_distance: unknown strategy %u", - strategy); - } - - PG_RETURN_DATUM(diff); -} - -PG_FUNCTION_INFO_V1(rum_timestamp_config); -Datum -rum_timestamp_config(PG_FUNCTION_ARGS) -{ - RumConfig *config = (RumConfig *) PG_GETARG_POINTER(0); - - config->addInfoTypeOid = InvalidOid; - - config->strategyInfo[0].strategy = RUM_TMST_LEFT_DISTANCE; - config->strategyInfo[0].direction = BackwardScanDirection; - - config->strategyInfo[1].strategy = RUM_TMST_RIGHT_DISTANCE; - config->strategyInfo[1].direction = ForwardScanDirection; - - config->strategyInfo[2].strategy = InvalidStrategy; - - PG_RETURN_VOID(); -} - diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index 18bcaec5a3..d3b9c5478a 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -3,8 +3,8 @@ * rum_ts_utils.c * various text-search functions * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * *------------------------------------------------------------------------- */ @@ -20,6 +20,9 @@ #include "tsearch/ts_utils.h" #include "utils/array.h" #include "utils/builtins.h" +#if PG_VERSION_NUM >= 120000 +#include "utils/float.h" +#endif #include "utils/typcache.h" #include "rum.h" @@ -31,6 +34,33 @@ #define TS_EXEC_PHRASE_NO_POS TS_EXEC_PHRASE_AS_AND #endif +#if PG_VERSION_NUM >= 130000 +/* Since v13 TS_execute flag naming and defaults have reverted: + * - before v13 - - since v13 - + * TS_EXEC_CALC_NOT (0x01) TS_EXEC_SKIP_NOT (0x01) + */ +#define TS_EXEC_CALC_NOT (0x01) /* Defined here for use with rum_TS_execute for + * compatibility with version < 13 where this + * flag was defined globally. 
* XXX Since v13 the global flag is + * TS_EXEC_SKIP_NOT, with the opposite sense: + * TS_execute calculates NOT by default and the + * flag disables that. TS_EXEC_CALC_NOT keeps its + * previous meaning when passed to rum_TS_execute, + * but it must not be passed into TS_execute; use + * the new TS_EXEC_SKIP_NOT there if NOT handling + * should be skipped. + */ +typedef TSTernaryValue RumTernaryValue; +#else +typedef enum +{ + TS_NO, /* definitely no match */ + TS_YES, /* definitely does match */ + TS_MAYBE /* can't verify match for lack of pos data */ +} RumTernaryValue; +#endif +typedef RumTernaryValue (*RumExecuteCallbackTernary) (void *arg, QueryOperand *val, ExecPhraseData *data); + PG_FUNCTION_INFO_V1(rum_extract_tsvector); PG_FUNCTION_INFO_V1(rum_extract_tsvector_hash); PG_FUNCTION_INFO_V1(rum_extract_tsquery); @@ -43,26 +73,43 @@ PG_FUNCTION_INFO_V1(rum_tsquery_distance); PG_FUNCTION_INFO_V1(rum_ts_distance_tt); PG_FUNCTION_INFO_V1(rum_ts_distance_ttf); PG_FUNCTION_INFO_V1(rum_ts_distance_td); +PG_FUNCTION_INFO_V1(rum_ts_score_tt); +PG_FUNCTION_INFO_V1(rum_ts_score_ttf); +PG_FUNCTION_INFO_V1(rum_ts_score_td); PG_FUNCTION_INFO_V1(rum_ts_join_pos); PG_FUNCTION_INFO_V1(tsquery_to_distance_query); -static int count_pos(char *ptr, int len); +static unsigned int count_pos(char *ptr, int len); static char *decompress_pos(char *ptr, WordEntryPos *pos); static Datum build_tsvector_entry(TSVector vector, WordEntry *we); static Datum build_tsvector_hash_entry(TSVector vector, WordEntry *we); static Datum build_tsquery_entry(TSQuery query, QueryOperand *operand); static Datum build_tsquery_hash_entry(TSQuery query, QueryOperand *operand); +static RumTernaryValue +rum_phrase_output(ExecPhraseData *data, ExecPhraseData *Ldata, ExecPhraseData *Rdata, + int emit, + int Loffset, + int Roffset, + int max_npos); +static RumTernaryValue +rum_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond, + ExecPhraseData *data); +static RumTernaryValue +rum_TS_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond); + typedef Datum (*TSVectorEntryBuilder)(TSVector vector, WordEntry *we); typedef Datum (*TSQueryEntryBuilder)(TSQuery query, QueryOperand *operand); static Datum *rum_extract_tsvector_internal(TSVector vector, int32 *nentries, - Datum **addInfo, + Datum **addInfo, bool **addInfoIsNull, - TSVectorEntryBuilder build_tsvector_entry); + TSVectorEntryBuilder build_tsvector_entry); static Datum *rum_extract_tsquery_internal(TSQuery query, int32 *nentries, - bool **ptr_partialmatch, + bool **ptr_partialmatch, Pointer **extra_data, int32 *searchMode, TSQueryEntryBuilder build_tsquery_entry); @@ -99,14 +146,21 @@ typedef struct int32 pos; } DocRepresentation; +typedef struct +{ + bool operandexist; + WordEntryPos pos; +} +QueryRepresentationOperand; + typedef struct { TSQuery query; /* Used in rum_tsquery_distance() */ int *map_item_operand; - bool *operandexist; - int lenght; + QueryRepresentationOperand *operandData; + int length; } QueryRepresentation; typedef struct { @@ -118,7 +172,7 @@ typedef struct DocRepresentation *end; } Extention; -static float weights[] = {1.0/0.1f, 1.0/0.2f, 1.0/0.4f, 1.0/1.0f}; +static float weights[] = {1.0f/0.1f, 1.0f/0.2f, 1.0f/0.4f, 1.0f/1.0f}; /* A dummy WordEntryPos array to use when haspos is false */ static WordEntryPosVector POSNULL = { @@ -135,10 +189,20 @@ static WordEntryPosVector POSNULL = { #define RANK_NORM_RDIVRPLUS1 0x20 #define DEF_NORM_METHOD RANK_NO_NORM -#define QR_GET_OPERAND_EXISTS(q, v) (
(q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] ) -#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true +/* + * Should not conflict with defines + * TS_EXEC_EMPTY/TS_EXEC_CALC_NOT/TS_EXEC_PHRASE_NO_POS + */ +#define TS_EXEC_IN_NEG 0x04 + +#define QR_GET_OPERAND(q, v) \ + (&((q)->operandData[ ((QueryItem*)(v)) - GETQUERY((q)->query) ])) +#if PG_VERSION_NUM >= 130000 +static TSTernaryValue +#else static bool +#endif pre_checkcondition_rum(void *checkval, QueryOperand *val, ExecPhraseData *data) { RumChkVal *gcv = (RumChkVal *) checkval; @@ -150,134 +214,696 @@ pre_checkcondition_rum(void *checkval, QueryOperand *val, ExecPhraseData *data) /* convert item's number to corresponding entry's (operand's) number */ j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item]; - /* return presence of current entry in indexed value */ + #if PG_VERSION_NUM >= 130000 + return ( *(gcv->need_recheck) ? TS_MAYBE : (gcv->check[j] ? TS_YES : TS_NO) ); + #else return gcv->check[j]; + #endif } Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - TSQuery query = PG_GETARG_TSQUERY(2); - - Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); - bool recheck; - bool res = FALSE; + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); + bool recheck = false; + bool res = false; if (query->size > 0) { - QueryItem *item; RumChkVal gcv; /* * check-parameter array has one entry for each value (operand) in the * query. */ - gcv.first_item = item = GETQUERY(query); + gcv.first_item = GETQUERY(query); gcv.check = check; gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = &recheck; res = TS_execute(GETQUERY(query), &gcv, - TS_EXEC_PHRASE_NO_POS, + TS_EXEC_PHRASE_NO_POS +#if PG_VERSION_NUM >= 130000 + | TS_EXEC_SKIP_NOT +#endif + , pre_checkcondition_rum); } - PG_RETURN_BOOL(res); } -static bool + +static RumTernaryValue checkcondition_rum(void *checkval, QueryOperand *val, ExecPhraseData *data) { RumChkVal *gcv = (RumChkVal *) checkval; int j; - /* if any val requiring a weight is used, set recheck flag */ - if (val->weight != 0) - *(gcv->need_recheck) = true; - /* convert item's number to corresponding entry's (operand's) number */ j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item]; - /* return presence of current entry in indexed value */ if (!gcv->check[j]) - return false; + /* lexeme not present in indexed value */ + return TS_NO; - /* - * Fill position list for phrase operator if it's needed end it exists - */ - if (data) + else if (gcv->addInfo && gcv->addInfoIsNull[j] == false) { - /* caller wants an array of positions (phrase search) */ + bytea *positions; + int32 i; + char *ptrt; + WordEntryPos post = 0; + int32 npos; + int32 k = 0; + /* + * we don't have positions in index because we store a timestamp in + * addInfo + */ if (gcv->recheckPhrase) { /* - * we don't have a positions because we store a timestamp in - * addInfo + * We cannot return TS_YES here (when val->weight > 0), because + * data->npos = 0 and this result would be processed incorrectly + * at the upper levels. So return TS_MAYBE.
*/ - *(gcv->need_recheck) = true; - } - else if (gcv->addInfo && gcv->addInfoIsNull[j] == false) + + positions = DatumGetByteaP(gcv->addInfo[j]); + ptrt = (char *) VARDATA_ANY(positions); + npos = count_pos(VARDATA_ANY(positions), + VARSIZE_ANY_EXHDR(positions)); + + /* caller wants an array of positions (phrase search) */ + if (data) { - bytea *positions; - int32 i; - char *ptrt; - WordEntryPos post; - - positions = DatumGetByteaP(gcv->addInfo[j]); - data->npos = count_pos(VARDATA_ANY(positions), - VARSIZE_ANY_EXHDR(positions)); - data->pos = palloc(sizeof(*data->pos) * data->npos); + data->pos = palloc(sizeof(*data->pos) * npos); data->allocated = true; - ptrt = (char *) VARDATA_ANY(positions); - post = 0; + /* Fill in the positions that have the right weight to return to the caller */ + for (i = 0; i < npos; i++) + { + ptrt = decompress_pos(ptrt, &post); - for (i = 0; i < data->npos; i++) + /* + * The weight mark is stored as 2 bits inside the position mark + * in a RUM index. We compare it to the set of weights requested + * in the query operand (a 4-bit mask, one bit per weight); e.g. + * 'word:A' has mask 8 (binary 1000) and matches a position + * whose weight is A (3). + */ + if ((val->weight == 0) || (val->weight >> WEP_GETWEIGHT(post)) & 1) + { + data->pos[k] = post; + k++; + } + } + data->npos = k; + data->pos = repalloc(data->pos, sizeof(*data->pos) * k); + return (k ? TS_YES : TS_NO); + } + + /* + * Not a phrase search. We only need to know whether there is at + * least one position with the right weight: if so, return TS_YES, + * otherwise return TS_NO. For this search to work without a recheck, + * any negation in the recursion must give TS_MAYBE and initiate a + * recheck, since "!word:A" can mean either "word:BCD" or "!word" + */ + else if (val->weight == 0) + /* Query without weights */ + return TS_YES; + else + { + char KeyWeightsMask = 0; + + /* Fill KeyWeightsMask with the weights from all positions */ + for (i = 0; i < npos; i++) { ptrt = decompress_pos(ptrt, &post); - data->pos[i] = post; + KeyWeightsMask |= 1 << WEP_GETWEIGHT(post); + } + return ((KeyWeightsMask & val->weight) ? TS_YES : TS_NO); + } + } +/* Should never come here */ + return TS_MAYBE; +} + +/* + * Compute output position list for a tsquery operator in phrase mode. + * + * Merge the position lists in Ldata and Rdata as specified by "emit", + * returning the result list into *data. The input position lists must be + * sorted and unique, and the output will be as well. + * + * data: pointer to initially-all-zeroes output struct, or NULL + * Ldata, Rdata: input position lists + * emit: bitmask of TSPO_XXX flags + * Loffset: offset to be added to Ldata positions before comparing/outputting + * Roffset: offset to be added to Rdata positions before comparing/outputting + * max_npos: maximum possible required size of output position array + * + * Loffset and Roffset should not be negative, else we risk trying to output + * negative positions, which won't fit into WordEntryPos. + * + * The result is boolean (TS_YES or TS_NO), but for the caller's convenience + * we return it as RumTernaryValue. + * + * Returns TS_YES if any positions were emitted to *data; or if data is NULL, + * returns TS_YES if any positions would have been emitted.
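+ * + * An illustrative example (an editorial addition, not from the upstream + * comment): with Ldata positions {1,3,6}, Rdata positions {3,4}, zero + * offsets and emit = TSPO_BOTH, position 3 is the only one present in both + * lists, so it alone is emitted and TS_YES is returned.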
+ */ +#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */ +#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */ +#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */ + +static RumTernaryValue +rum_phrase_output(ExecPhraseData *data, + ExecPhraseData *Ldata, + ExecPhraseData *Rdata, + int emit, + int Loffset, + int Roffset, + int max_npos) +{ + int Lindex, + Rindex; + + /* Loop until both inputs are exhausted */ + Lindex = Rindex = 0; + while (Lindex < Ldata->npos || Rindex < Rdata->npos) + { + int Lpos, + Rpos; + int output_pos = 0; + + /* + * Fetch current values to compare. WEP_GETPOS() is needed because + * ExecPhraseData->data can point to a tsvector's WordEntryPosVector. + */ + if (Lindex < Ldata->npos) + Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset; + else + { + /* L array exhausted, so we're done if R_ONLY isn't set */ + if (!(emit & TSPO_R_ONLY)) + break; + Lpos = INT_MAX; + } + if (Rindex < Rdata->npos) + Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset; + else + { + /* R array exhausted, so we're done if L_ONLY isn't set */ + if (!(emit & TSPO_L_ONLY)) + break; + Rpos = INT_MAX; + } + + /* Merge-join the two input lists */ + if (Lpos < Rpos) + { + /* Lpos is not matched in Rdata, should we output it? */ + if (emit & TSPO_L_ONLY) + output_pos = Lpos; + Lindex++; + } + else if (Lpos == Rpos) + { + /* Lpos and Rpos match ... should we output it? */ + if (emit & TSPO_BOTH) + output_pos = Rpos; + Lindex++; + Rindex++; + } + else /* Lpos > Rpos */ + { + /* Rpos is not matched in Ldata, should we output it? */ + if (emit & TSPO_R_ONLY) + output_pos = Rpos; + Rindex++; + } + + if (output_pos > 0) + { + if (data) + { + /* Store position, first allocating output array if needed */ + if (data->pos == NULL) + { + data->pos = (WordEntryPos *) + palloc(max_npos * sizeof(WordEntryPos)); + data->allocated = true; + } + data->pos[data->npos++] = output_pos; + } + else + { + /* + * Exact positions not needed, so return TS_YES as soon as we + * know there is at least one. + */ + return TS_YES; } } } - return true; + if (data && data->npos > 0) + { + /* Let's assert we didn't overrun the array */ + Assert(data->npos <= max_npos); + return TS_YES; + } + return TS_NO; +} + +/* + * Execute tsquery at or below an OP_PHRASE operator. + * + * This handles tsquery execution at recursion levels where we need to care + * about match locations. + * + * In addition to the same arguments used for TS_execute, the caller may pass + * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme + * match position info on success. data == NULL if no position data need be + * returned. (In practice, outside callers pass NULL, and only the internal + * recursion cases pass a data pointer.) + * Note: the function assumes data != NULL for operators other than OP_PHRASE. + * This is OK because an outside call always starts from an OP_PHRASE node. + * + * The detailed semantics of the match data, given that the function returned + * TS_YES (successful match), are: + * + * npos > 0, negate = false: + * query is matched at specified position(s) (and only those positions) + * npos > 0, negate = true: + * query is matched at all positions *except* specified position(s) + * npos = 0, negate = true: + * query is matched at all positions + * npos = 0, negate = false: + * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate) + * + * Successful matches also return a "width" value which is the match width in + * lexemes, less one. 
Hence, "width" is zero for simple one-lexeme matches, + * and is the sum of the phrase operator distances for phrase matches. Note + * that when width > 0, the listed positions represent the ends of matches not + * the starts. (This unintuitive rule is needed to avoid possibly generating + * negative positions, which wouldn't fit into the WordEntryPos arrays.) + * + * If the RumExecuteCallback function reports that an operand is present + * but fails to provide position(s) for it, we will return TS_MAYBE when + * it is possible but not certain that the query is matched. + * + * When the function returns TS_NO or TS_MAYBE, it must return npos = 0, + * negate = false (which is the state initialized by the caller); but the + * "width" output in such cases is undefined. + */ +static RumTernaryValue +rum_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond, + ExecPhraseData *data) +{ + ExecPhraseData Ldata, + Rdata; + RumTernaryValue lmatch, + rmatch; + int Loffset, + Roffset, + maxwidth; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (curitem->type == QI_VAL) + return (chkcond(arg, (QueryOperand *) curitem, data)); + + switch (curitem->qoperator.oper) + { + case OP_NOT: + + /* + * We need not touch data->width, since a NOT operation does not + * change the match width. + */ + if (!(flags & TS_EXEC_CALC_NOT)) + { + /* without CALC_NOT, report NOT as "match everywhere" */ + Assert(data->npos == 0 && !data->negate); + data->negate = true; + return TS_YES; + } + switch (rum_phrase_execute(curitem + 1, arg, flags, chkcond, data)) + { + case TS_NO: + /* change "match nowhere" to "match everywhere" */ + Assert(data->npos == 0 && !data->negate); + data->negate = true; + return TS_YES; + case TS_YES: + if (data->npos > 0) + { + /* we have some positions, invert negate flag */ + data->negate = !data->negate; + return TS_YES; + } + else if (data->negate) + { + /* change "match everywhere" to "match nowhere" */ + data->negate = false; + return TS_NO; + } + /* Should not get here if result was TS_YES */ + Assert(false); + break; + case TS_MAYBE: + /* match positions are, and remain, uncertain */ + return TS_MAYBE; + } + break; + + case OP_PHRASE: + case OP_AND: + memset(&Ldata, 0, sizeof(Ldata)); + memset(&Rdata, 0, sizeof(Rdata)); + + lmatch = rum_phrase_execute(curitem + curitem->qoperator.left, + arg, flags, chkcond, &Ldata); + if (lmatch == TS_NO) + return TS_NO; + + rmatch = rum_phrase_execute(curitem + 1, + arg, flags, chkcond, &Rdata); + if (rmatch == TS_NO) + return TS_NO; + + /* + * If either operand has no position information, then we can't + * return reliable position data, only a MAYBE result. + */ + if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) + return TS_MAYBE; + + if (curitem->qoperator.oper == OP_PHRASE) + { + /* In case of index where position is not available + * (e.g. addon_ops) output TS_MAYBE even in case both + * lmatch and rmatch are TS_YES. Otherwise we can lose + * results of phrase queries. + */ + if (flags & TS_EXEC_PHRASE_NO_POS) + return TS_MAYBE; + + /* + * Compute Loffset and Roffset suitable for phrase match, and + * compute overall width of whole phrase match. 
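+ * Positions mark the ends of matches, so the left positions are shifted + * by the phrase distance plus the right submatch's width to line them up + * with matching right end positions (editorial example: for 'a <-> b', + * distance 1, 'a' ending at position p pairs with 'b' ending at p + 1).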
+ */ + Loffset = curitem->qoperator.distance + Rdata.width; + Roffset = 0; + if (data) + data->width = curitem->qoperator.distance + + Ldata.width + Rdata.width; + } + else + { + /* + * For OP_AND, set output width and alignment like OP_OR (see + * comment below) + */ + maxwidth = Max(Ldata.width, Rdata.width); + Loffset = maxwidth - Ldata.width; + Roffset = maxwidth - Rdata.width; + if (data) + data->width = maxwidth; + } + + if (Ldata.negate && Rdata.negate) + { + /* !L & !R: treat as !(L | R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + Loffset, Roffset, + Ldata.npos + Rdata.npos); + if (data) + data->negate = true; + return TS_YES; + } + else if (Ldata.negate) + { + /* !L & R */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_R_ONLY, + Loffset, Roffset, + Rdata.npos); + } + else if (Rdata.negate) + { + /* L & !R */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_L_ONLY, + Loffset, Roffset, + Ldata.npos); + } + else + { + /* straight AND */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH, + Loffset, Roffset, + Min(Ldata.npos, Rdata.npos)); + } + + case OP_OR: + memset(&Ldata, 0, sizeof(Ldata)); + memset(&Rdata, 0, sizeof(Rdata)); + + lmatch = rum_phrase_execute(curitem + curitem->qoperator.left, + arg, flags, chkcond, &Ldata); + rmatch = rum_phrase_execute(curitem + 1, + arg, flags, chkcond, &Rdata); + + if (lmatch == TS_NO && rmatch == TS_NO) + return TS_NO; + + /* + * If either operand has no position information, then we can't + * return reliable position data, only a MAYBE result. + */ + if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) + return TS_MAYBE; + + /* + * Cope with undefined output width from failed submatch. (This + * takes less code than trying to ensure that all failure returns + * set data->width to zero.) + */ + if (lmatch == TS_NO) + Ldata.width = 0; + if (rmatch == TS_NO) + Rdata.width = 0; + + /* + * For OP_AND and OP_OR, report the width of the wider of the two + * inputs, and align the narrower input's positions to the right + * end of that width. This rule deals at least somewhat + * reasonably with cases like "x <-> (y | z <-> q)". + */ + maxwidth = Max(Ldata.width, Rdata.width); + Loffset = maxwidth - Ldata.width; + Roffset = maxwidth - Rdata.width; + data->width = maxwidth; + + if (Ldata.negate && Rdata.negate) + { + /* !L | !R: treat as !(L & R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH, + Loffset, Roffset, + Min(Ldata.npos, Rdata.npos)); + data->negate = true; + return TS_YES; + } + else if (Ldata.negate) + { + /* !L | R: treat as !(L & !R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_L_ONLY, + Loffset, Roffset, + Ldata.npos); + data->negate = true; + return TS_YES; + } + else if (Rdata.negate) + { + /* L | !R: treat as !(!L & R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_R_ONLY, + Loffset, Roffset, + Rdata.npos); + data->negate = true; + return TS_YES; + } + else + { + /* straight OR */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + Loffset, Roffset, + Ldata.npos + Rdata.npos); + } + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return TS_NO; +} + +/* + * Evaluates tsquery boolean expression. It is similar to adt/tsvector_op.c + * TS_execute_recurse() but in most cases when the ! operator is used it returns + * TS_MAYBE to force a recheck.
The reason is that inside a negation we can have one or several + * operands with weights (which we cannot easily know about) and their negation is not + * precisely defined, i.e. "!word:A" can mean either "word:BCD" or "!word" (the same applies to + * logical combinations of them). The only case where we can easily avoid the recheck is when the item + * under the negation is a QI_VAL without weights. + * + * curitem: current tsquery item (initially, the first one) + * arg: opaque value to pass through to callback function + * flags: bitmask of flag bits shown in ts_utils.h + * chkcond: callback function to check whether a primitive value is present + */ + +static RumTernaryValue +rum_TS_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond) +{ + RumTernaryValue lmatch; + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (curitem->type == QI_VAL) + { + if ((flags & TS_EXEC_IN_NEG) && curitem->qoperand.weight && + curitem->qoperand.weight != 15) + return TS_MAYBE; + else + return chkcond(arg, (QueryOperand *) curitem, NULL); + } + + switch (curitem->qoperator.oper) + { + case OP_NOT: + if (!(flags & TS_EXEC_CALC_NOT)) + return TS_YES; + switch (rum_TS_execute(curitem + 1, arg, flags | TS_EXEC_IN_NEG, chkcond)) + { + case TS_NO: + return TS_YES; + case TS_YES: + return TS_NO; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_AND: + lmatch = rum_TS_execute(curitem + curitem->qoperator.left, arg, + flags, chkcond); + if (lmatch == TS_NO) + return TS_NO; + switch (rum_TS_execute(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return TS_NO; + case TS_YES: + return lmatch; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_OR: + lmatch = rum_TS_execute(curitem + curitem->qoperator.left, arg, + flags, chkcond); + if (lmatch == TS_YES) + return TS_YES; + switch (rum_TS_execute(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return lmatch; + case TS_YES: + return TS_YES; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_PHRASE: + + /* + * If we get a MAYBE result, and the caller doesn't want that, + * convert it to NO. It would be more consistent, perhaps, to + * return the result of TS_phrase_execute() verbatim and then + * convert MAYBE results at the top of the recursion. But + * converting at the topmost phrase operator gives results that + * are bug-compatible with the old implementation, so do it like + * this for now. + * + * Checking for TS_EXEC_PHRASE_NO_POS has been moved inside + * rum_phrase_execute, otherwise we can lose results of phrase + * operator when position information is not available in index + * (e.g. index built with addon_ops) + */ + switch (rum_phrase_execute(curitem, arg, flags, chkcond, NULL)) + { + case TS_NO: + return TS_NO; + case TS_YES: + return TS_YES; + case TS_MAYBE: + return (flags & TS_EXEC_PHRASE_NO_POS) ?
TS_MAYBE : TS_NO; + } + break; + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return TS_NO; } Datum rum_tsquery_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ TSQuery query = PG_GETARG_TSQUERY(2); - /* int32 nkeys = PG_GETARG_INT32(3); */ - Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); Datum *addInfo = (Datum *) PG_GETARG_POINTER(8); bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9); - bool res = FALSE; + + RumTernaryValue res = TS_NO; /* - * The query requires recheck only if it involves weights + * The query doesn't require recheck by default */ *recheck = false; if (query->size > 0) { - QueryItem *item; RumChkVal gcv; /* * check-parameter array has one entry for each value (operand) in the * query. */ - gcv.first_item = item = GETQUERY(query); + gcv.first_item = GETQUERY(query); gcv.check = check; gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = recheck; @@ -285,28 +911,27 @@ rum_tsquery_consistent(PG_FUNCTION_ARGS) gcv.addInfoIsNull = addInfoIsNull; gcv.recheckPhrase = false; - res = TS_execute(GETQUERY(query), &gcv, - TS_EXEC_CALC_NOT, - checkcondition_rum); + res = rum_TS_execute(GETQUERY(query), &gcv, + TS_EXEC_CALC_NOT, + checkcondition_rum); + if (res == TS_MAYBE) + *recheck = true; } - PG_RETURN_BOOL(res); -} +} Datum rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ TSQuery query = PG_GETARG_TSQUERY(2); - /* int32 nkeys = PG_GETARG_INT32(3); */ - Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); Datum *addInfo = (Datum *) PG_GETARG_POINTER(8); bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9); - bool res = FALSE; + RumTernaryValue res = TS_NO; /* * The query requires recheck only if it involves weights @@ -315,14 +940,13 @@ rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS) if (query->size > 0) { - QueryItem *item; RumChkVal gcv; /* * check-parameter array has one entry for each value (operand) in the * query. 
*/ - gcv.first_item = item = GETQUERY(query); + gcv.first_item = GETQUERY(query); gcv.check = check; gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = recheck; @@ -330,18 +954,19 @@ rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS) gcv.addInfoIsNull = addInfoIsNull; gcv.recheckPhrase = true; - res = TS_execute(GETQUERY(query), &gcv, - TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS, - checkcondition_rum); + res = rum_TS_execute(GETQUERY(query), &gcv, + TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS, + checkcondition_rum); + if (res == TS_MAYBE) + *recheck = true; } - PG_RETURN_BOOL(res); } #define SIXTHBIT 0x20 #define LOWERMASK 0x1F -static int +static unsigned int compress_pos(char *target, WordEntryPos *pos, int npos) { int i; @@ -393,6 +1018,7 @@ decompress_pos(char *ptr, WordEntryPos *pos) else { delta |= (v & LOWERMASK) << i; + Assert(delta <= 0x3fff); *pos += delta; WEP_SETWEIGHT(*pos, v >> 5); return ptr; @@ -401,7 +1027,7 @@ decompress_pos(char *ptr, WordEntryPos *pos) } } -static int +static unsigned int count_pos(char *ptr, int len) { int count = 0, @@ -412,6 +1038,7 @@ count_pos(char *ptr, int len) if (!(ptr[i] & HIGHBIT)) count++; } + Assert((ptr[i-1] & HIGHBIT) == 0); return count; } @@ -821,20 +1448,41 @@ compareDocR(const void *va, const void *vb) return (a->pos > b->pos) ? 1 : -1; } -static bool +/* + * Be careful: clang 11+ is very sensitive to casting a function + * with a different return type. + */ +static +#if PG_VERSION_NUM >= 130000 +TSTernaryValue +#else +bool +#endif checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) { QueryRepresentation *qr = (QueryRepresentation *) checkval; + QueryRepresentationOperand *qro; /* Check for rum_tsquery_distance() */ if (qr->map_item_operand != NULL) + qro = qr->operandData + + qr->map_item_operand[(QueryItem *) val - GETQUERY(qr->query)]; + else + qro = QR_GET_OPERAND(qr, val); + + if (data && qro->operandexist) { - int i = (QueryItem *) val - GETQUERY(qr->query); - return qr->operandexist[qr->map_item_operand[i]]; + data->npos = 1; + data->pos = &qro->pos; + data->allocated = false; } - return QR_GET_OPERAND_EXISTS(qr, val); + return qro->operandexist +#if PG_VERSION_NUM >= 130000 + ?
TS_YES : TS_NO +#endif + ; } static bool @@ -850,25 +1498,41 @@ Cover(DocRepresentation *doc, uint32 len, QueryRepresentation *qr, lastpos = ext->pos; found = false; - memset(qr->operandexist, 0, sizeof(bool) * qr->lenght); + memset(qr->operandData, 0, sizeof(qr->operandData[0]) * qr->length); - ext->p = 0x7fffffff; + ext->p = PG_INT32_MAX; ext->q = 0; ptr = doc + ext->pos; /* find upper bound of cover from current position, move up */ while (ptr - doc < len) { + QueryRepresentationOperand *qro; + if (qr->map_item_operand != NULL) { - qr->operandexist[ptr->data.key.keyn] = true; + qro = qr->operandData + ptr->data.key.keyn; + qro->operandexist = true; + WEP_SETPOS(qro->pos, ptr->pos); + WEP_SETWEIGHT(qro->pos, ptr->wclass); } else { for (i = 0; i < ptr->data.item.nitem; i++) - QR_SET_OPERAND_EXISTS(qr, ptr->data.item.item[i]); + { + qro = QR_GET_OPERAND(qr, ptr->data.item.item[i]); + qro->operandexist = true; + WEP_SETPOS(qro->pos, ptr->pos); + WEP_SETWEIGHT(qro->pos, ptr->wclass); + } } - if (TS_execute(GETQUERY(qr->query), (void *) qr, false, + + if (TS_execute(GETQUERY(qr->query), (void *) qr, +#if PG_VERSION_NUM >= 130000 + TS_EXEC_SKIP_NOT, +#else + TS_EXEC_EMPTY, +#endif checkcondition_QueryOperand)) { if (ptr->pos > ext->q) @@ -886,7 +1550,7 @@ Cover(DocRepresentation *doc, uint32 len, QueryRepresentation *qr, if (!found) return false; - memset(qr->operandexist, 0, sizeof(bool) * qr->lenght); + memset(qr->operandData, 0, sizeof(qr->operandData[0]) * qr->length); ptr = doc + lastpos; @@ -895,14 +1559,26 @@ Cover(DocRepresentation *doc, uint32 len, QueryRepresentation *qr, { if (qr->map_item_operand != NULL) { - qr->operandexist[ptr->data.key.keyn] = true; + qr->operandData[ptr->data.key.keyn].operandexist = true; } else { for (i = 0; i < ptr->data.item.nitem; i++) - QR_SET_OPERAND_EXISTS(qr, ptr->data.item.item[i]); + { + QueryRepresentationOperand *qro = + QR_GET_OPERAND(qr, ptr->data.item.item[i]); + + qro->operandexist = true; + WEP_SETPOS(qro->pos, ptr->pos); + WEP_SETWEIGHT(qro->pos, ptr->wclass); + } } - if (TS_execute(GETQUERY(qr->query), (void *) qr, true, + if (TS_execute(GETQUERY(qr->query), (void *) qr, +#if PG_VERSION_NUM >= 130000 + TS_EXEC_EMPTY, +#else + TS_EXEC_CALC_NOT, +#endif checkcondition_QueryOperand)) { if (ptr->pos < ext->p) @@ -1083,7 +1759,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) curoperand = &item[i].qoperand; - if (QR_GET_OPERAND_EXISTS(qr, &item[i])) + if (QR_GET_OPERAND(qr, &item[i])->operandexist) continue; firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); @@ -1128,6 +1804,8 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) (item[k].type == QI_VAL && compareQueryOperand(&kptr, &iptr, operand) == 0)) { + QueryRepresentationOperand *qro; + /* * if k == i, we've already checked above that * it's type == Q_VAL @@ -1135,7 +1813,12 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) doc[cur].data.item.item[doc[cur].data.item.nitem] = item + k; doc[cur].data.item.nitem++; - QR_SET_OPERAND_EXISTS(qr, item + k); + + qro = QR_GET_OPERAND(qr, item + k); + + qro->operandexist = true; + qro->pos = post[j]; + } } } @@ -1195,7 +1878,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, int new_cover_key = 0; int nitems = 0; - while (ptr <= ext.end) + while (ptr && ptr <= ext.end) { InvSum += arrdata[ptr->wclass]; /* SK: Quick and dirty hash key. Hope collisions will be not too frequent. 
*/ @@ -1236,8 +1919,8 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, cover_keys[new_cover_idx] = new_cover_key; /* Compute the number of query terms in the cover */ - for (i = 0; i < qr->lenght; i++) - if (qr->operandexist[i]) + for (i = 0; i < qr->length; i++) + if (qr->operandData[i].operandexist) nitems++; Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; @@ -1298,20 +1981,20 @@ calc_score_addinfo(float4 *arrdata, bool *check, TSQuery query, qr.query = query; qr.map_item_operand = map_item_operand; - qr.operandexist = (bool *) palloc0(sizeof(bool) * nkeys); - qr.lenght = nkeys; + qr.operandData = palloc0(sizeof(qr.operandData[0]) * nkeys); + qr.length = nkeys; doc = get_docrep_addinfo(check, &qr, addInfo, addInfoIsNull, &doclen); if (!doc) { - pfree(qr.operandexist); + pfree(qr.operandData); return 0.0; } Wdoc = calc_score_docr(arrdata, doc, doclen, &qr, DEF_NORM_METHOD); pfree(doc); - pfree(qr.operandexist); + pfree(qr.operandData); return (float4) Wdoc; } @@ -1327,13 +2010,13 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) qr.query = query; qr.map_item_operand = NULL; - qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size); - qr.lenght = query->size; + qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size); + qr.length = query->size; doc = get_docrep(txt, &qr, &doclen); if (!doc) { - pfree(qr.operandexist); + pfree(qr.operandData); return 0.0; } @@ -1356,7 +2039,7 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) Wdoc /= log((double) (txt->size + 1)) / log(2.0); pfree(doc); - pfree(qr.operandexist); + pfree(qr.operandData); return (float4) Wdoc; } @@ -1429,15 +2112,9 @@ rum_ts_distance_ttf(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(1.0 / res); } -/* - * Implementation of <=> operator. Uses specified normalization method. - */ -Datum -rum_ts_distance_td(PG_FUNCTION_ARGS) +static float4 +calc_score_parse_opt(TSVector txt, HeapTupleHeader d) { - TSVector txt = PG_GETARG_TSVECTOR(0); - HeapTupleHeader d = PG_GETARG_HEAPTUPLEHEADER(1); - Oid tupType = HeapTupleHeaderGetTypeId(d); int32 tupTypmod = HeapTupleHeaderGetTypMod(d); TupleDesc tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); @@ -1457,8 +2134,6 @@ rum_ts_distance_td(PG_FUNCTION_ARGS) if (isnull) { ReleaseTupleDesc(tupdesc); - PG_FREE_IF_COPY(txt, 0); - PG_FREE_IF_COPY(d, 1); elog(ERROR, "NULL query value is not allowed"); } @@ -1469,6 +2144,22 @@ rum_ts_distance_td(PG_FUNCTION_ARGS) res = calc_score(weights, txt, query, method); ReleaseTupleDesc(tupdesc); + + return res; +} + +/* + * Implementation of <=> operator. Uses specified normalization method. + */ +Datum +rum_ts_distance_td(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + HeapTupleHeader d = PG_GETARG_HEAPTUPLEHEADER(1); + float4 res; + + res = calc_score_parse_opt(txt, d); + PG_FREE_IF_COPY(txt, 0); PG_FREE_IF_COPY(d, 1); @@ -1478,6 +2169,61 @@ rum_ts_distance_td(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(1.0 / res); } +/* + * Calculate score (inverted distance). Uses default normalization method. + */ +Datum +rum_ts_score_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + float4 res; + + res = calc_score(weights, txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_FLOAT4(res); +} + +/* + * Calculate score (inverted distance). Uses specified normalization method. 
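+ * Unlike rum_ts_distance_ttf(), the rank itself is returned rather than its + * reciprocal.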
+ */ +Datum +rum_ts_score_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + int method = PG_GETARG_INT32(2); + float4 res; + + res = calc_score(weights, txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_FLOAT4(res); +} + +/* + * Calculate score (inverted distance). Uses specified normalization method. + */ +Datum +rum_ts_score_td(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + HeapTupleHeader d = PG_GETARG_HEAPTUPLEHEADER(1); + float4 res; + + res = calc_score_parse_opt(txt, d); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(d, 1); + + PG_RETURN_FLOAT4(res); +} + /* * Casts tsquery to rum_distance_query type. */ @@ -1532,7 +2278,8 @@ rum_ts_join_pos(PG_FUNCTION_ARGS) count2 = count_pos(in2, VARSIZE_ANY_EXHDR(addInfo2)), countRes = 0; int i1 = 0, i2 = 0; - Size size; + Size size, + size_compressed; WordEntryPos pos1 = 0, pos2 = 0, *pos; @@ -1544,61 +2291,71 @@ rum_ts_join_pos(PG_FUNCTION_ARGS) in1 = decompress_pos(in1, &pos1); in2 = decompress_pos(in2, &pos2); - while(i1 < count1 && i2 < count2) + for(;;) { if (WEP_GETPOS(pos1) > WEP_GETPOS(pos2)) { pos[countRes++] = pos2; - if (i2 < count2) - in2 = decompress_pos(in2, &pos2); i2++; + if (i2 >= count2) + break; + in2 = decompress_pos(in2, &pos2); } else if (WEP_GETPOS(pos1) < WEP_GETPOS(pos2)) { pos[countRes++] = pos1; - if (i1 < count1) - in1 = decompress_pos(in1, &pos1); i1++; + if (i1 >= count1) + break; + in1 = decompress_pos(in1, &pos1); } else { pos[countRes++] = pos1; + i1++; + i2++; if (i1 < count1) in1 = decompress_pos(in1, &pos1); if (i2 < count2) in2 = decompress_pos(in2, &pos2); - i1++; - i2++; + if (i2 >= count2 || i1 >= count1) + break; } } - while(i1 < count1) - { - pos[countRes++] = pos1; - if (i1 < count1) + if (i1 < count1) + for(;;) + { + pos[countRes++] = pos1; + i1++; + if (i1 >= count1) + break; in1 = decompress_pos(in1, &pos1); - i1++; - } - - while(i2 < count2) + } + else if (i2 < count2) { - pos[countRes++] = pos2; - if (i2 < count2) + for(;;) + { + pos[countRes++] = pos2; + i2++; + if (i2 >= count2) + break; in2 = decompress_pos(in2, &pos2); - i2++; + } } - Assert(countRes <= (count1 + count2)); + Assert(countRes <= count1 + count2); /* * In some cases compressed positions may take more memory than * uncompressed positions. So allocate memory with a margin. */ size = VARHDRSZ + 2 * sizeof(WordEntryPos) * countRes; - result = palloc(size); + result = palloc0(size); - size = compress_pos(result->vl_dat, pos, countRes) + VARHDRSZ; - SET_VARSIZE(result, size); + size_compressed = compress_pos(result->vl_dat, pos, countRes) + VARHDRSZ; + Assert(size >= size_compressed); + SET_VARSIZE(result, size_compressed); PG_RETURN_BYTEA_P(result); } diff --git a/src/rumbtree.c b/src/rumbtree.c index 674fdc2f91..dfe2f10c30 100644 --- a/src/rumbtree.c +++ b/src/rumbtree.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. 
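The rum_ts_join_pos() rewrite earlier in this hunk restructures the merge so that decompress_pos() is never called after an input stream is exhausted, which the old loops could do by decompressing once past the final element. The underlying algorithm is an ordinary sorted-list union that keeps one copy of equal positions; a self-contained sketch on plain arrays, with uint16 standing in for WordEntryPos and the compressed encoding omitted:

    /* Union of two sorted position lists; duplicates collapse to one. */
    static int
    merge_positions(const uint16 *a, int na,
                    const uint16 *b, int nb, uint16 *out)
    {
        int     i = 0,
                j = 0,
                n = 0;

        while (i < na && j < nb)
        {
            if (a[i] < b[j])
                out[n++] = a[i++];
            else if (a[i] > b[j])
                out[n++] = b[j++];
            else
            {
                out[n++] = a[i++];  /* equal positions: emit once */
                j++;
            }
        }
        while (i < na)
            out[n++] = a[i++];
        while (j < nb)
            out[n++] = b[j++];
        return n;
    }
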
* * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -15,6 +15,7 @@ #include "access/generic_xlog.h" #include "miscadmin.h" +#include "storage/predicate.h" #include "rum.h" @@ -31,7 +32,7 @@ rumTraverseLock(Buffer buffer, bool searchMode) page = BufferGetPage(buffer); if (RumPageIsLeaf(page)) { - if (searchMode == FALSE) + if (searchMode == false) { /* we should relock our page */ LockBuffer(buffer, RUM_UNLOCK); @@ -100,8 +101,8 @@ rumReFindLeafPage(RumBtree btree, RumBtreeStack * stack) * that requested leaf page is in this subtree only when requested * item pointer is less than item pointer previous to rightmost. */ - if (compareRumKey(btree->rumstate, btree->entryAttnum, - &(((PostingItem *) RumDataPageGetItem(page, maxoff - 1))->key), + if (compareRumItem(btree->rumstate, btree->entryAttnum, + &(((PostingItem *) RumDataPageGetItem(page, maxoff - 1))->item), &btree->items[btree->curitem]) >= 0) { break; @@ -119,7 +120,7 @@ rumReFindLeafPage(RumBtree btree, RumBtreeStack * stack) RumBtreeStack * rumFindLeafPage(RumBtree btree, RumBtreeStack * stack) { - bool isfirst = TRUE; + bool isfirst = true; BlockNumber rootBlkno; if (!stack) @@ -140,7 +141,7 @@ rumFindLeafPage(RumBtree btree, RumBtreeStack * stack) { if (RumPageIsLeaf(page) && !btree->searchMode) access = RUM_EXCLUSIVE; - isfirst = FALSE; + isfirst = false; } else access = rumTraverseLock(stack->buffer, btree->searchMode); @@ -149,7 +150,7 @@ rumFindLeafPage(RumBtree btree, RumBtreeStack * stack) * ok, page is correctly locked, we should check to move right .., * root never has a right link, so small optimization */ - while (btree->fullScan == FALSE && stack->blkno != rootBlkno && + while (btree->fullScan == false && stack->blkno != rootBlkno && btree->isMoveRight(btree, page)) { BlockNumber rightlink = RumPageGetOpaque(page)->rightlink; @@ -485,6 +486,14 @@ rumInsertValue(Relation index, RumBtree btree, RumBtreeStack * stack, btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer, page, lpage, rpage); + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(lbuffer)); + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(rbuffer)); + if (btree->rumstate->isBuild) { START_CRIT_SECTION(); @@ -518,7 +527,7 @@ rumInsertValue(Relation index, RumBtree btree, RumBtreeStack * stack, else { BlockNumber rightrightBlkno = InvalidBlockNumber; - Buffer rightrightBuffer; + Buffer rightrightBuffer = InvalidBuffer; /* split non-root page */ if (btree->rumstate->isBuild) @@ -548,6 +557,10 @@ rumInsertValue(Relation index, RumBtree btree, RumBtreeStack * stack, RumPageGetOpaque(rpage)->leftlink = BufferGetBlockNumber(stack->buffer); RumPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(rbuffer)); + /* * it's safe because we don't have right-to-left walking * with locking bth pages except vacuum. 
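On the PredicateLockPageSplit() calls added above: they are what makes page splits safe under SERIALIZABLE. A predicate lock held on the page being split must afterwards cover both halves, otherwise a read-write conflict through the moved entries could go undetected. The call shape, as used by PostgreSQL's other index access methods:

    /* After splitting origBlkno, re-cover each resulting page with the
     * predicate locks of the original. */
    PredicateLockPageSplit(index, origBlkno, newBlkno);
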
But vacuum will @@ -592,7 +605,7 @@ rumInsertValue(Relation index, RumBtree btree, RumBtreeStack * stack, } } - btree->isDelete = FALSE; + btree->isDelete = false; /* search parent to lock */ LockBuffer(parent->buffer, RUM_EXCLUSIVE); diff --git a/src/rumbulk.c b/src/rumbulk.c index dcabe44add..7a03bf64b4 100644 --- a/src/rumbulk.c +++ b/src/rumbulk.c @@ -4,7 +4,7 @@ * routines for fast build of inverted index * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,17 +14,24 @@ #include "postgres.h" #include "utils/datum.h" -#include "utils/memutils.h" #include "rum.h" #define DEF_NENTRY 2048 /* RumEntryAccumulator allocation quantum */ #define DEF_NPTR 5 /* ItemPointer initial allocation quantum */ +/* PostgreSQL pre 10 has different names for this functions */ +#if PG_VERSION_NUM <= 100006 || PG_VERSION_NUM == 110000 +#define rbt_create(node_size, comparator, combiner, allocfunc, freefunc, arg) \ + (rb_create(node_size, comparator, combiner, allocfunc, freefunc, arg)) +#define rbt_insert(rbt, data, isNew) \ + (rb_insert(rbt, data, isNew)) +#endif + /* Combiner function for rbtree.c */ static void -rumCombineData(RBNode *existing, const RBNode *newdata, void *arg) +rumCombineData(RBTNode *existing, const RBTNode *newdata, void *arg) { RumEntryAccumulator *eo = (RumEntryAccumulator *) existing; const RumEntryAccumulator *en = (const RumEntryAccumulator *) newdata; @@ -37,7 +44,7 @@ rumCombineData(RBNode *existing, const RBNode *newdata, void *arg) { accum->allocatedMemory -= GetMemoryChunkSpace(eo->list); eo->maxcount *= 2; - eo->list = (RumKey *) repalloc(eo->list, sizeof(RumKey) * eo->maxcount); + eo->list = (RumItem *) repalloc(eo->list, sizeof(RumItem) * eo->maxcount); accum->allocatedMemory += GetMemoryChunkSpace(eo->list); } @@ -56,7 +63,7 @@ rumCombineData(RBNode *existing, const RBNode *newdata, void *arg) Assert(res != 0); if (res > 0) - eo->shouldSort = TRUE; + eo->shouldSort = true; } eo->list[eo->count] = en->list[0]; @@ -65,7 +72,7 @@ rumCombineData(RBNode *existing, const RBNode *newdata, void *arg) /* Comparator function for rbtree.c */ static int -cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg) +cmpEntryAccumulator(const RBTNode *a, const RBTNode *b, void *arg) { const RumEntryAccumulator *ea = (const RumEntryAccumulator *) a; const RumEntryAccumulator *eb = (const RumEntryAccumulator *) b; @@ -77,7 +84,7 @@ cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg) } /* Allocator function for rbtree.c */ -static RBNode * +static RBTNode * rumAllocEntryAccumulator(void *arg) { BuildAccumulator *accum = (BuildAccumulator *) arg; @@ -85,7 +92,7 @@ rumAllocEntryAccumulator(void *arg) /* * Allocate memory by rather big chunks to decrease overhead. We have no - * need to reclaim RBNodes individually, so this costs nothing. + * need to reclaim RBTNodes individually, so this costs nothing. 
*/ if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY) { @@ -94,11 +101,11 @@ rumAllocEntryAccumulator(void *arg) accum->eas_used = 0; } - /* Allocate new RBNode from current chunk */ + /* Allocate new RBTNode from current chunk */ ea = accum->entryallocator + accum->eas_used; accum->eas_used++; - return (RBNode *) ea; + return (RBTNode *) ea; } void @@ -108,12 +115,12 @@ rumInitBA(BuildAccumulator *accum) accum->allocatedMemory = 0; accum->entryallocator = NULL; accum->eas_used = 0; - accum->tree = rb_create(sizeof(RumEntryAccumulator), - cmpEntryAccumulator, - rumCombineData, - rumAllocEntryAccumulator, - NULL, /* no freefunc needed */ - (void *) accum); + accum->tree = rbt_create(sizeof(RumEntryAccumulator), + cmpEntryAccumulator, + rumCombineData, + rumAllocEntryAccumulator, + NULL, /* no freefunc needed */ + (void *) accum); } /* @@ -123,7 +130,7 @@ rumInitBA(BuildAccumulator *accum) static Datum getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value) { - Form_pg_attribute att = accum->rumstate->origTupdesc->attrs[attnum - 1]; + Form_pg_attribute att = RumTupleDescAttr(accum->rumstate->origTupdesc, attnum - 1); Datum res; if (att->attbyval) @@ -148,7 +155,7 @@ rumInsertBAEntry(BuildAccumulator *accum, RumEntryAccumulator eatmp; RumEntryAccumulator *ea; bool isNew; - RumKey item; + RumItem item; /* * For the moment, fill only the fields of eatmp that will be looked at by @@ -159,12 +166,13 @@ rumInsertBAEntry(BuildAccumulator *accum, eatmp.category = category; /* temporarily set up single-entry itempointer list */ eatmp.list = &item; + memset(&item, 0, sizeof(item)); item.iptr = *heapptr; item.addInfo = addInfo; item.addInfoIsNull = addInfoIsNull; - ea = (RumEntryAccumulator *) rb_insert(accum->tree, (RBNode *) &eatmp, - &isNew); + ea = (RumEntryAccumulator *) rbt_insert(accum->tree, (RBTNode *) &eatmp, + &isNew); if (isNew) { @@ -184,7 +192,7 @@ rumInsertBAEntry(BuildAccumulator *accum, */ ea->shouldSort = (accum->rumstate->useAlternativeOrder && attnum == accum->rumstate->attrnAddToColumn); - ea->list = (RumKey *) palloc(sizeof(RumKey) * DEF_NPTR); + ea->list = (RumItem *) palloc(sizeof(RumItem) * DEF_NPTR); ea->list[0].iptr = *heapptr; ea->list[0].addInfo = addInfo; ea->list[0].addInfoIsNull = addInfoIsNull; @@ -263,16 +271,18 @@ qsortCompareItemPointers(const void *a, const void *b) static AttrNumber AttrNumberQsort = 0; static int -qsortCompareRumKey(const void *a, const void *b, void *arg) +qsortCompareRumItem(const void *a, const void *b, void *arg) { - return compareRumKey(arg, AttrNumberQsort, a, b); + return compareRumItem(arg, AttrNumberQsort, a, b); } /* Prepare to read out the rbtree contents using rumGetBAEntry */ void rumBeginBAScan(BuildAccumulator *accum) { -#if PG_VERSION_NUM >= 100000 +#if (PG_VERSION_NUM > 100006 && PG_VERSION_NUM < 110000) || PG_VERSION_NUM >= 110001 + rbt_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); +#elif PG_VERSION_NUM >= 100000 rb_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); #else rb_begin_iterate(accum->tree, LeftRightWalk); @@ -284,15 +294,17 @@ rumBeginBAScan(BuildAccumulator *accum) * This consists of a single key datum and a list (array) of one or more * heap TIDs in which that key is found. The list is guaranteed sorted. 
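A typical consumer drains the accumulator in key order, as the build and cleanup paths of this extension do; a condensed usage sketch with the renamed RumItem type, assuming accum and rumstate are set up as above:

    RumItem    *items;
    Datum       key;
    RumNullCategory category;
    OffsetNumber attnum;
    uint32      nitems;

    rumBeginBAScan(&accum);
    while ((items = rumGetBAEntry(&accum, &attnum, &key,
                                  &category, &nitems)) != NULL)
    {
        /* items is the sorted posting list for (attnum, key) */
        rumEntryInsert(rumstate, attnum, key, category, items, nitems, NULL);
    }
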
*/ -RumKey * +RumItem * rumGetBAEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *key, RumNullCategory * category, uint32 *n) { RumEntryAccumulator *entry; - RumKey *list; + RumItem *list; -#if PG_VERSION_NUM >= 100000 +#if (PG_VERSION_NUM > 100006 && PG_VERSION_NUM < 110000) || PG_VERSION_NUM >= 110001 + entry = (RumEntryAccumulator *) rbt_iterate(&accum->tree_walk); +#elif PG_VERSION_NUM >= 100000 entry = (RumEntryAccumulator *) rb_iterate(&accum->tree_walk); #else entry = (RumEntryAccumulator *) rb_iterate(accum->tree); @@ -315,10 +327,10 @@ rumGetBAEntry(BuildAccumulator *accum, if (accum->rumstate->useAlternativeOrder && entry->attnum == accum->rumstate->attrnAddToColumn) - qsort_arg(list, entry->count, sizeof(RumKey), - qsortCompareRumKey, accum->rumstate); + qsort_arg(list, entry->count, sizeof(RumItem), + qsortCompareRumItem, accum->rumstate); else if (entry->shouldSort) - qsort(list, entry->count, sizeof(RumKey), qsortCompareItemPointers); + qsort(list, entry->count, sizeof(RumItem), qsortCompareItemPointers); } return list; diff --git a/src/rumdatapage.c b/src/rumdatapage.c index ae4fd41f38..922bb7d19a 100644 --- a/src/rumdatapage.c +++ b/src/rumdatapage.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -99,7 +99,7 @@ rumDatumWrite(Pointer ptr, Datum datum, bool typbyval, char typalign, elog(ERROR, "unsupported byval length: %d", (int) (typlen)); } - data_length = typlen; + data_length = (Size)typlen; } else if (typlen == -1) { @@ -149,7 +149,7 @@ rumDatumWrite(Pointer ptr, Datum datum, bool typbyval, char typalign, /* fixed-length pass-by-reference */ ptr = (char *) att_align_nominal(ptr, typalign); Assert(typlen > 0); - data_length = typlen; + data_length = (Size)typlen; memmove(ptr, DatumGetPointer(datum), data_length); } @@ -226,7 +226,7 @@ rumDataPageLeafWriteItemPointer(RumState * rumstate, char *ptr, ItemPointer iptr */ Pointer rumPlaceToDataPageLeaf(Pointer ptr, OffsetNumber attnum, - RumKey * item, ItemPointer prev, RumState * rumstate) + RumItem * item, ItemPointer prev, RumState * rumstate) { Form_pg_attribute attr; @@ -283,7 +283,7 @@ rumDataPageLeafGetItemPointerSize(ItemPointer iptr, ItemPointer prev) */ Size rumCheckPlaceToDataPageLeaf(OffsetNumber attnum, - RumKey * item, ItemPointer prev, RumState * rumstate, Size size) + RumItem * item, ItemPointer prev, RumState * rumstate, Size size) { Form_pg_attribute attr; @@ -324,8 +324,8 @@ rumCompareItemPointers(const ItemPointerData *a, const ItemPointerData *b) } int -compareRumKey(RumState * state, const AttrNumber attno, - const RumKey * a, const RumKey * b) +compareRumItem(RumState * state, const AttrNumber attno, + const RumItem * a, const RumItem * b) { if (state->useAlternativeOrder && attno == state->attrnAddToColumn) { @@ -333,7 +333,7 @@ compareRumKey(RumState * state, const AttrNumber attno, if (a->addInfoIsNull == false && b->addInfoIsNull == false) { int res; - AttrNumber attnum = state->attrnOrderByColumn; + AttrNumber attnum = state->attrnAttachColumn; res = DatumGetInt32(FunctionCall2Coll( &state->compareFn[attnum - 1], @@ -365,18 +365,18 @@ compareRumKey(RumState * state, const AttrNumber attno, * Caller is responsible that there is enough space at *dst. 
*/ uint32 -rumMergeItemPointers(RumState * rumstate, AttrNumber attno, RumKey * dst, - RumKey * a, uint32 na, RumKey * b, uint32 nb) +rumMergeRumItems(RumState * rumstate, AttrNumber attno, RumItem * dst, + RumItem * a, uint32 na, RumItem * b, uint32 nb) { - RumKey *dptr = dst; - RumKey *aptr = a, + RumItem *dptr = dst; + RumItem *aptr = a, *bptr = b; while (aptr - a < na && bptr - b < nb) { int cmp; - cmp = compareRumKey(rumstate, attno, aptr, bptr); + cmp = compareRumItem(rumstate, attno, aptr, bptr); if (cmp > 0) { @@ -409,7 +409,7 @@ rumMergeItemPointers(RumState * rumstate, AttrNumber attno, RumKey * dst, /* * Checks, should we move to right link... - * Compares inserting itemp pointer with right bound of current page + * Compares inserting item pointer with right bound of current page */ static bool dataIsMoveRight(RumBtree btree, Page page) @@ -419,10 +419,10 @@ dataIsMoveRight(RumBtree btree, Page page) if (RumPageRightMost(page)) return false; - res = compareRumKey(btree->rumstate, - btree->entryAttnum, - &btree->items[btree->curitem], - RumDataPageGetRightBound(page)); + res = compareRumItem(btree->rumstate, + btree->entryAttnum, + &btree->items[btree->curitem], + RumDataPageGetRightBound(page)); return (res > 0) ? true : false; } @@ -476,10 +476,10 @@ dataLocateItem(RumBtree btree, RumBtreeStack * stack) } else { - result = compareRumKey(btree->rumstate, - btree->entryAttnum, - &btree->items[btree->curitem], - &pitem->key); + result = compareRumItem(btree->rumstate, + btree->entryAttnum, + &btree->items[btree->curitem], + &pitem->item); } if (result == 0) @@ -505,7 +505,7 @@ dataLocateItem(RumBtree btree, RumBtreeStack * stack) } void -convertIndexToKey(RumDataLeafItemIndex *src, RumKey *dst) +convertIndexToKey(RumDataLeafItemIndex *src, RumItem *dst) { dst->iptr = src->iptr; if (dst->iptr.ip_posid & ALT_ADD_INFO_NULL_FLAG) @@ -520,7 +520,7 @@ convertIndexToKey(RumDataLeafItemIndex *src, RumKey *dst) } } -/** +/* * Find item pointer in leaf data page. Returns true if given item pointer is * found and false if it's not. Sets offset and iptrOut to last item pointer * which is less than given one. Sets ptrOut ahead that item pointer. 
@@ -533,7 +533,7 @@ findInLeafPage(RumBtree btree, Page page, OffsetNumber *offset, OffsetNumber i, maxoff, first = FirstOffsetNumber; - RumKey item; + RumItem item; int cmp; Assert(RumPageIsData(page)); @@ -555,12 +555,12 @@ findInLeafPage(RumBtree btree, Page page, OffsetNumber *offset, if (btree->rumstate->useAlternativeOrder) { - RumKey k; + RumItem k; convertIndexToKey(index, &k); - cmp = compareRumKey(btree->rumstate, - btree->entryAttnum, &k, - &btree->items[btree->curitem]); + cmp = compareRumItem(btree->rumstate, + btree->entryAttnum, &k, + &btree->items[btree->curitem]); } else @@ -589,10 +589,10 @@ findInLeafPage(RumBtree btree, Page page, OffsetNumber *offset, *iptrOut = item.iptr; ptr = rumDataPageLeafRead(ptr, btree->entryAttnum, &item, - btree->rumstate); + false, btree->rumstate); - cmp = compareRumKey(btree->rumstate, btree->entryAttnum, - &btree->items[btree->curitem], &item); + cmp = compareRumItem(btree->rumstate, btree->entryAttnum, + &btree->items[btree->curitem], &item); if (cmp == 0) { @@ -631,7 +631,7 @@ dataLocateLeafItem(RumBtree btree, RumBtreeStack * stack) if (btree->fullScan) { stack->off = FirstOffsetNumber; - return TRUE; + return true; } return findInLeafPage(btree, page, &stack->off, &iptr, &ptr); @@ -736,7 +736,7 @@ RumDataPageAddItem(Page page, void *data, OffsetNumber offset) if (offset <= maxoff) memmove(ptr + sizeof(PostingItem), ptr, - (maxoff - offset + 1) * sizeof(PostingItem)); + ((uint16)(maxoff - offset + 1)) * sizeof(PostingItem)); } memcpy(ptr, data, sizeof(PostingItem)); RumPageGetOpaque(page)->maxoff++; @@ -763,13 +763,15 @@ RumPageDeletePostingItem(Page page, OffsetNumber offset) char *dstptr = RumDataPageGetItem(page, offset), *sourceptr = RumDataPageGetItem(page, offset + 1); - memmove(dstptr, sourceptr, sizeof(PostingItem) * (maxoff - offset)); - /* Adjust pd_lower */ - ((PageHeader) page)->pd_lower = sourceptr - page; - Assert(((PageHeader) page)->pd_lower <= ((PageHeader) page)->pd_upper); + memmove(dstptr, sourceptr, sizeof(PostingItem) * (uint16)(maxoff - offset)); } RumPageGetOpaque(page)->maxoff--; + + /* Adjust pd_lower */ + ((PageHeader) page)->pd_lower = + RumDataPageGetItem(page, RumPageGetOpaque(page)->maxoff + 1) - page; + Assert(((PageHeader) page)->pd_lower <= ((PageHeader) page)->pd_upper); } /* @@ -849,9 +851,16 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) Pointer ptr = RumDataPageGetData(page), copyPtr = NULL; ItemPointerData iptr = {{0, 0}, 0}; - RumKey copyItem; + RumItem copyItem; bool copyItemEmpty = true; - char pageCopy[BLCKSZ]; + /* + * Must have pageCopy MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * pageCopy overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() or elsewhere. + */ + char pageCopyStorage[BLCKSZ + MAXIMUM_ALIGNOF]; + char *pageCopy = (char *) MAXALIGN(pageCopyStorage); int maxoff = RumPageGetOpaque(page)->maxoff; int freespace, insertCount = 0; @@ -874,6 +883,13 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) copyPtr = pageCopy + (ptr - page); copyItem.iptr = iptr; } + else + { + /* + * Force insertion of new items until insertion items are less than + * right bound. 
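The comment above opens an otherwise empty else branch; the actual decision is made in the cmp logic below, which is easy to misread. Once every pre-existing item has been copied, a remaining new item is placed on this page only if the page may legitimately contain it. A simplified paraphrase, with newItem standing for btree->items + btree->curitem, attnum for btree->entryAttnum, and free-space handling omitted:

    if (RumPageRightMost(page))
        cmp = 1;            /* rightmost page accepts any remaining item */
    else if (compareRumItem(rumstate, attnum,
                            RumDataPageGetRightBound(page), newItem) >= 0)
        cmp = 1;            /* still within this page's right bound */
    else
        cmp = -1;           /* belongs on a page further to the right */
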
+ */ + } freespace = RumPageGetOpaque(page)->freespace; @@ -890,7 +906,8 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) if (copyItemEmpty == true && off <= maxoff) { copyPtr = rumDataPageLeafRead(copyPtr, btree->entryAttnum, - ©Item, btree->rumstate); + ©Item, false, + btree->rumstate); copyItemEmpty = false; } @@ -899,24 +916,30 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) if (stopAppend) cmp = -1; /* force copy */ else - cmp = compareRumKey(btree->rumstate, btree->entryAttnum, - ©Item, - btree->items + btree->curitem); + cmp = compareRumItem(btree->rumstate, btree->entryAttnum, + ©Item, + btree->items + btree->curitem); } else if (btree->curitem < btree->nitem) { /* we copied all old items but we have to add more new items */ if (stopAppend) - /* there is no free space on page */ + /* there is no free space on page */ break; else if (RumPageRightMost(page)) /* force insertion of new item */ cmp = 1; - else if ((cmp = compareRumKey(btree->rumstate, btree->entryAttnum, - RumDataPageGetRightBound(page), - btree->items + btree->curitem)) >= 0) + else if ((cmp = compareRumItem(btree->rumstate, btree->entryAttnum, + RumDataPageGetRightBound(page), + btree->items + btree->curitem)) >= 0) { - /* force insertion of new item */ + /* + * Force insertion if current item is greater than last item + * of the page but less than right bound. + */ + if (off > maxoff) + /* force insertion of new item */ + cmp = 1; } else /* new items should be inserted on next page */ @@ -997,7 +1020,7 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) if (ptr - RumDataPageGetData(page) > \ totalsize / 2 && page == newlPage) \ { \ - maxLeftIptr = curIptr; \ + maxLeftItem = curItem; \ ItemPointerSetMin(&prevIptr); \ RumPageGetOpaque(newlPage)->maxoff = j; \ page = rPage; \ @@ -1034,13 +1057,20 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, Size pageSize = PageGetPageSize(newlPage); Size maxItemSize = 0; ItemPointerData prevIptr; - RumKey maxLeftIptr, - curIptr; - RumKey item; - int totalCount = 0; + RumItem maxLeftItem, + curItem; + RumItem item; int maxItemIndex = btree->curitem; - static char lpageCopy[BLCKSZ]; + /* + * Must have lpageCopy MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * lpageCopy overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() etc. 
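This is the same alignment fix applied to pageCopy above and to vector and tupstore below, so the pattern is worth stating once: a bare char array carries no alignment guarantee, and PostgreSQL's page macros cast into structs that strict-alignment platforms require to be MAXALIGNed. Over-allocating by MAXIMUM_ALIGNOF and rounding the pointer up makes the scratch buffer safe:

    /* Alignment-safe scratch page: MAXALIGN may advance the pointer by
     * up to MAXIMUM_ALIGNOF - 1 bytes, hence the extra bytes. */
    static char storage[BLCKSZ + MAXIMUM_ALIGNOF];
    char       *copy = (char *) MAXALIGN(storage);

    memcpy(copy, page, BLCKSZ);     /* now safe for PageGetItem() etc. */
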
+ */ + static char lpageCopyStorage[BLCKSZ + MAXIMUM_ALIGNOF]; + char *lpageCopy = (char *) MAXALIGN(lpageCopyStorage); + memset(&item, 0, sizeof(item)); dataPrepareData(btree, newlPage, off); maxoff = RumPageGetOpaque(newlPage)->maxoff; @@ -1069,19 +1099,17 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, &item, &prevIptr, btree->rumstate, totalsize); maxItemIndex++; - totalCount++; maxItemSize = Max(maxItemSize, totalsize - prevTotalsize); } prevIptr = item.iptr; copyPtr = rumDataPageLeafRead(copyPtr, btree->entryAttnum, &item, - btree->rumstate); + false, btree->rumstate); prevTotalsize = totalsize; totalsize = rumCheckPlaceToDataPageLeaf(btree->entryAttnum, &item, &prevIptr, btree->rumstate, totalsize); - totalCount++; maxItemSize = Max(maxItemSize, totalsize - prevTotalsize); } @@ -1103,7 +1131,6 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, 2 * RumDataPageSize - 2 * maxItemSize - 2 * MAXIMUM_ALIGNOF) { maxItemIndex++; - totalCount++; maxItemSize = Max(maxItemSize, newTotalsize - totalsize); totalsize = newTotalsize; @@ -1117,8 +1144,6 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, totalsize = rumCheckPlaceToDataPageLeaf(btree->entryAttnum, &item, &prevIptr, btree->rumstate, totalsize); maxItemIndex++; - - totalCount++; } } @@ -1139,7 +1164,7 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, { while (btree->curitem < maxItemIndex) { - curIptr = btree->items[btree->curitem]; + curItem = btree->items[btree->curitem]; ptr = rumPlaceToDataPageLeaf(ptr, btree->entryAttnum, &btree->items[btree->curitem], &prevIptr, btree->rumstate); @@ -1153,9 +1178,9 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, } copyPtr = rumDataPageLeafRead(copyPtr, btree->entryAttnum, &item, - btree->rumstate); + false, btree->rumstate); - curIptr = item; + curItem = item; ptr = rumPlaceToDataPageLeaf(ptr, btree->entryAttnum, &item, &prevIptr, btree->rumstate); Assert(RumDataPageFreeSpacePre(page, ptr) >= 0); @@ -1169,7 +1194,7 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, { while (btree->curitem < maxItemIndex) { - curIptr = btree->items[btree->curitem]; + curItem = btree->items[btree->curitem]; ptr = rumPlaceToDataPageLeaf(ptr, btree->entryAttnum, &btree->items[btree->curitem], &prevIptr, btree->rumstate); Assert(RumDataPageFreeSpacePre(page, ptr) >= 0); @@ -1184,11 +1209,11 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, RumPageGetOpaque(rPage)->maxoff = j - 1; PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf)); - btree->pitem.key = maxLeftIptr; + btree->pitem.item = maxLeftItem; btree->rightblkno = BufferGetBlockNumber(rbuf); *RumDataPageGetRightBound(rPage) = *RumDataPageGetRightBound(lpageCopy); - *RumDataPageGetRightBound(newlPage) = maxLeftIptr; + *RumDataPageGetRightBound(newlPage) = maxLeftItem; /* Fill indexes at the end of pages */ updateItemIndexes(newlPage, btree->entryAttnum, btree->rumstate); @@ -1209,15 +1234,21 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, { char *ptr; OffsetNumber separator; - RumKey* bound; + RumItem *bound; Page newlPage = PageGetTempPageCopy(BufferGetPage(lbuf)); - RumKey oldbound = *RumDataPageGetRightBound(newlPage); - int sizeofitem = sizeof(PostingItem); + RumItem oldbound = *RumDataPageGetRightBound(newlPage); + unsigned int sizeofitem = sizeof(PostingItem); OffsetNumber maxoff = RumPageGetOpaque(newlPage)->maxoff; Size pageSize = PageGetPageSize(newlPage); Size freeSpace; - - static char vector[2 * BLCKSZ]; + /* + * Must have 
vector MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * vector overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() etc. + */ + static char vectorStorage[2 * BLCKSZ + MAXIMUM_ALIGNOF]; + char *vector = (char *) MAXALIGN(vectorStorage); RumInitPage(rPage, RumPageGetOpaque(newlPage)->flags, pageSize); freeSpace = RumDataPageGetFreeSpace(rPage); @@ -1229,7 +1260,7 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, Assert(!RumPageIsLeaf(newlPage)); ptr = vector + (off - 1) * sizeofitem; if (maxoff + 1 - off != 0) - memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem); + memmove(ptr + sizeofitem, ptr, (uint16)(maxoff - off + 1) * sizeofitem); memcpy(ptr, &(btree->pitem), sizeofitem); maxoff++; @@ -1256,25 +1287,25 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, ptr = RumDataPageGetItem(rPage, FirstOffsetNumber); memcpy(ptr, vector + separator * sizeofitem, - (maxoff - separator) * sizeofitem); + (uint16)(maxoff - separator) * sizeofitem); RumPageGetOpaque(rPage)->maxoff = maxoff - separator; /* Adjust pd_lower */ ((PageHeader) rPage)->pd_lower = (ptr + (maxoff - separator) * sizeofitem) - - rPage; + rPage; PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf)); if (RumPageIsLeaf(newlPage)) - btree->pitem.key.iptr = *(ItemPointerData *) RumDataPageGetItem(newlPage, - RumPageGetOpaque(newlPage)->maxoff); + btree->pitem.item.iptr = ((PostingItem *) RumDataPageGetItem(newlPage, + RumPageGetOpaque(newlPage)->maxoff))->item.iptr; else - btree->pitem.key = ((PostingItem *) RumDataPageGetItem(newlPage, - RumPageGetOpaque(newlPage)->maxoff))->key; + btree->pitem.item = ((PostingItem *) RumDataPageGetItem(newlPage, + RumPageGetOpaque(newlPage)->maxoff))->item; btree->rightblkno = BufferGetBlockNumber(rbuf); /* set up right bound for left page */ bound = RumDataPageGetRightBound(newlPage); - *bound = btree->pitem.key; + *bound = btree->pitem.item; /* set up right bound for right page */ bound = RumDataPageGetRightBound(rPage); @@ -1306,7 +1337,7 @@ void updateItemIndexes(Page page, OffsetNumber attnum, RumState * rumstate) { Pointer ptr; - RumKey item; + RumItem item; int j = 0, maxoff, i; @@ -1335,7 +1366,7 @@ updateItemIndexes(Page page, OffsetNumber attnum, RumState * rumstate) } j++; } - ptr = rumDataPageLeafRead(ptr, attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item, false, rumstate); } /* Fill rest of page indexes with InvalidOffsetNumber if any */ for (; j < RumDataLeafIndexCount; j++) @@ -1362,7 +1393,7 @@ checkLeafDataPage(RumState * rumstate, AttrNumber attnum, Page page) Offset maxoff, i; char *ptr; - RumKey item; + RumItem item; RumDataLeafItemIndex *index, *previndex = NULL; @@ -1418,11 +1449,13 @@ rumDataFillRoot(RumBtree btree, Buffer root, Buffer lbuf, Buffer rbuf, PostingItem li, ri; - li.key = *RumDataPageGetRightBound(lpage); + memset(&li, 0, sizeof(PostingItem)); + li.item = *RumDataPageGetRightBound(lpage); PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf)); RumDataPageAddItem(page, &li, InvalidOffsetNumber); - ri.key = *RumDataPageGetRightBound(rpage); + memset(&ri, 0, sizeof(PostingItem)); + ri.item = *RumDataPageGetRightBound(rpage); PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf)); RumDataPageAddItem(page, &ri, InvalidOffsetNumber); } @@ -1445,10 +1478,10 @@ rumPrepareDataScan(RumBtree btree, Relation index, OffsetNumber attnum, RumState btree->splitPage = dataSplitPage; 
btree->fillRoot = rumDataFillRoot; - btree->isData = TRUE; - btree->searchMode = FALSE; - btree->isDelete = FALSE; - btree->fullScan = FALSE; + btree->isData = true; + btree->searchMode = false; + btree->isDelete = false; + btree->fullScan = false; btree->scanDirection = ForwardScanDirection; btree->entryAttnum = attnum; @@ -1479,11 +1512,13 @@ void rumInsertItemPointers(RumState * rumstate, OffsetNumber attnum, RumPostingTreeScan * gdi, - RumKey * items, uint32 nitem, + RumItem * items, uint32 nitem, GinStatsData *buildStats) { - BlockNumber rootBlkno = gdi->stack->blkno; + BlockNumber rootBlkno; + Assert(gdi->stack); + rootBlkno = gdi->stack->blkno; gdi->btree.items = items; gdi->btree.nitem = nitem; gdi->btree.curitem = 0; @@ -1512,12 +1547,12 @@ rumInsertItemPointers(RumState * rumstate, } Buffer -rumScanBeginPostingTree(RumPostingTreeScan * gdi, RumKey *key) +rumScanBeginPostingTree(RumPostingTreeScan * gdi, RumItem *item) { - if (key) + if (item) { gdi->btree.fullScan = false; - gdi->btree.items = key; + gdi->btree.items = item; gdi->btree.curitem = 0; gdi->btree.nitem = 1; } diff --git a/src/rumentrypage.c b/src/rumentrypage.c index b18dde2466..29e1dd25bb 100644 --- a/src/rumentrypage.c +++ b/src/rumentrypage.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -21,17 +21,17 @@ */ void rumReadTuple(RumState * rumstate, OffsetNumber attnum, - IndexTuple itup, RumKey * items) + IndexTuple itup, RumItem * items, bool copyAddInfo) { Pointer ptr = RumGetPosting(itup); - RumKey item; + RumItem item; int nipd = RumGetNPosting(itup), i; ItemPointerSetMin(&item.iptr); for (i = 0; i < nipd; i++) { - ptr = rumDataPageLeafRead(ptr, attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item, copyAddInfo, rumstate); items[i] = item; } } @@ -47,7 +47,7 @@ rumReadTuplePointers(RumState * rumstate, OffsetNumber attnum, Pointer ptr = RumGetPosting(itup); int nipd = RumGetNPosting(itup), i; - RumKey item; + RumItem item; ItemPointerSetMin(&item.iptr); for (i = 0; i < nipd; i++) @@ -113,6 +113,8 @@ getRightMostTuple(Page page) { OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + Assert(maxoff != InvalidOffsetNumber); + return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff)); } @@ -125,7 +127,7 @@ entryIsMoveRight(RumBtree btree, Page page) RumNullCategory category; if (RumPageRightMost(page)) - return FALSE; + return false; itup = getRightMostTuple(page); attnum = rumtuple_get_attrnum(btree->rumstate, itup); @@ -134,9 +136,9 @@ entryIsMoveRight(RumBtree btree, Page page) if (rumCompareAttEntries(btree->rumstate, btree->entryAttnum, btree->entryKey, btree->entryCategory, attnum, key, category) > 0) - return TRUE; + return true; - return FALSE; + return false; } /* @@ -232,7 +234,7 @@ entryLocateLeafEntry(RumBtree btree, RumBtreeStack * stack) if (btree->fullScan) { stack->off = FirstOffsetNumber; - return TRUE; + return true; } low = FirstOffsetNumber; @@ -426,8 +428,14 @@ entrySplitPage(RumBtree btree, Buffer lbuf, Buffer rbuf, Page page; Page newlPage = PageGetTempPageCopy(lPage); Size pageSize = PageGetPageSize(newlPage); - - static char tupstore[2 * BLCKSZ]; + /* + * Must have tupstore MAXALIGNed to use PG macros to access data in + * it. 
Should not rely on compiler alignment preferences to avoid + * tupstore overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() or elsewhere. + */ + static char tupstoreStorage[2 * BLCKSZ + MAXIMUM_ALIGNOF]; + char *tupstore = (char *) MAXALIGN(tupstoreStorage); entryPreparePage(btree, newlPage, off); @@ -558,12 +566,12 @@ rumPrepareEntryScan(RumBtree btree, OffsetNumber attnum, btree->splitPage = entrySplitPage; btree->fillRoot = rumEntryFillRoot; - btree->isData = FALSE; - btree->searchMode = FALSE; - btree->fullScan = FALSE; + btree->isData = false; + btree->searchMode = false; + btree->fullScan = false; btree->entryAttnum = attnum; btree->entryKey = key; btree->entryCategory = category; - btree->isDelete = FALSE; + btree->isDelete = false; } diff --git a/src/rumfast.c b/src/rumfast.c deleted file mode 100644 index 40eb3d4623..0000000000 --- a/src/rumfast.c +++ /dev/null @@ -1,927 +0,0 @@ -/*------------------------------------------------------------------------- - * - * rumfast.c - * Fast insert routines for the Postgres inverted index access method. - * Pending entries are stored in linear list of pages. Later on - * (typically during VACUUM), rumInsertCleanup() will be invoked to - * transfer pending entries into the regular index structure. This - * wins because bulk insertion is much more efficient than retail. - * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include "access/generic_xlog.h" -#include "access/htup_details.h" -#include "commands/vacuum.h" -#include "miscadmin.h" -#include "utils/memutils.h" -#include "utils/datum.h" - -#include "rum.h" - -#define RUM_NDELETE_AT_ONCE 16 - -#define RUM_PAGE_FREESIZE \ - ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(RumPageOpaqueData)) ) - -typedef struct KeyArray -{ - Datum *keys; /* expansible array of keys */ - Datum *addInfo; /* expansible array of additional information */ - bool *addInfoIsNull; /* expansible array of NULL flag of - * additional information */ - RumNullCategory *categories; /* another expansible array */ - int32 nvalues; /* current number of valid entries */ - int32 maxvalues; /* allocated size of arrays */ -} KeyArray; - - -/* - * Build a pending-list page from the given array of tuples, and write it out. - * - * Returns amount of free space left on the page. 
- */ -static uint32 -writeListPage(RumState *rumstate, Buffer buffer, - IndexTuple *tuples, uint32 ntuples, BlockNumber rightlink) -{ - Page page; - uint32 i, - freesize; - OffsetNumber l, - off; - GenericXLogState *state; - - state = GenericXLogStart(rumstate->index); - - page = GenericXLogRegisterBuffer(state, buffer, 0); - RumInitPage(page, RUM_LIST, BufferGetPageSize(buffer)); - - off = FirstOffsetNumber; - - for (i = 0; i < ntuples; i++) - { - Size this_size = IndexTupleSize(tuples[i]); - - l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); - - if (l == InvalidOffsetNumber) - elog(ERROR, "failed to add item to index page in \"%s\"", - RelationGetRelationName(rumstate->index)); - - off++; - } - - RumPageGetOpaque(page)->rightlink = rightlink; - - /* - * tail page may contain only whole row(s) or final part of row placed on - * previous pages (a "row" here meaning all the index tuples generated for - * one heap tuple) - */ - if (rightlink == InvalidBlockNumber) - { - RumPageSetFullRow(page); - RumPageGetOpaque(page)->maxoff = 1; - } - else - { - RumPageGetOpaque(page)->maxoff = 0; - } - - /* get free space before releasing buffer */ - freesize = PageGetExactFreeSpace(page); - GenericXLogFinish(state); - UnlockReleaseBuffer(buffer); - - return freesize; -} - -static void -makeSublist(RumState *rumstate, IndexTuple *tuples, uint32 ntuples, - RumMetaPageData * res) -{ - Buffer curBuffer = InvalidBuffer; - Buffer prevBuffer = InvalidBuffer; - uint32 i, - startTuple = 0; - uint64 size = 0, - tupsize; - - Assert(ntuples > 0); - - /* - * Split tuples into pages - */ - for (i = 0; i < ntuples; i++) - { - if (curBuffer == InvalidBuffer) - { - curBuffer = RumNewBuffer(rumstate->index); - - if (prevBuffer != InvalidBuffer) - { - res->nPendingPages++; - writeListPage(rumstate, prevBuffer, - tuples + startTuple, - i - startTuple, - BufferGetBlockNumber(curBuffer)); - } - else - { - res->head = BufferGetBlockNumber(curBuffer); - } - - prevBuffer = curBuffer; - startTuple = i; - size = 0; - } - - tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData); - - if (size + tupsize > RumListPageSize) - { - /* won't fit, force a new page and reprocess */ - i--; - curBuffer = InvalidBuffer; - } - else - { - size += tupsize; - } - } - - /* - * Write last page - */ - res->tail = BufferGetBlockNumber(curBuffer); - res->tailFreeSize = writeListPage(rumstate, curBuffer, - tuples + startTuple, - ntuples - startTuple, - InvalidBlockNumber); - res->nPendingPages++; - /* that was only one heap tuple */ - res->nPendingHeapTuples = 1; -} - -/* - * Write the index tuples contained in *collector into the index's - * pending list. 
- * - * Function guarantees that all these tuples will be inserted consecutively, - * preserving order - */ -void -rumHeapTupleFastInsert(RumState * rumstate, RumTupleCollector * collector) -{ - Relation index = rumstate->index; - Buffer metabuffer; - Page metapage; - RumMetaPageData *metadata = NULL; - Buffer buffer = InvalidBuffer; - Page page = NULL; - bool separateList = false; - bool needCleanup = false; - GenericXLogState *state; - - if (collector->ntuples == 0) - return; - - state = GenericXLogStart(rumstate->index); - metabuffer = ReadBuffer(index, RUM_METAPAGE_BLKNO); - - if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > RumListPageSize) - { - /* - * Total size is greater than one page => make sublist - */ - separateList = true; - } - else - { - LockBuffer(metabuffer, RUM_EXCLUSIVE); - metadata = RumPageGetMeta(BufferGetPage(metabuffer)); - - if (metadata->head == InvalidBlockNumber || - collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) - { - /* - * Pending list is empty or total size is greater than freespace - * on tail page => make sublist - * - * We unlock metabuffer to keep high concurrency - */ - separateList = true; - LockBuffer(metabuffer, RUM_UNLOCK); - } - } - - if (separateList) - { - /* - * We should make sublist separately and append it to the tail - */ - RumMetaPageData sublist; - - memset(&sublist, 0, sizeof(RumMetaPageData)); - makeSublist(rumstate, collector->tuples, collector->ntuples, &sublist); - - /* - * metapage was unlocked, see above - */ - LockBuffer(metabuffer, RUM_EXCLUSIVE); - metapage = GenericXLogRegisterBuffer(state, metabuffer, 0); - metadata = RumPageGetMeta(metapage); - - if (metadata->head == InvalidBlockNumber) - { - /* - * Main list is empty, so just insert sublist as main list - */ - metadata->head = sublist.head; - metadata->tail = sublist.tail; - metadata->tailFreeSize = sublist.tailFreeSize; - - metadata->nPendingPages = sublist.nPendingPages; - metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; - } - else - { - /* - * Merge lists - */ - buffer = ReadBuffer(index, metadata->tail); - LockBuffer(buffer, RUM_EXCLUSIVE); - page = GenericXLogRegisterBuffer(state, buffer, 0); - - Assert(RumPageGetOpaque(page)->rightlink == InvalidBlockNumber); - - RumPageGetOpaque(page)->rightlink = sublist.head; - - metadata->tail = sublist.tail; - metadata->tailFreeSize = sublist.tailFreeSize; - - metadata->nPendingPages += sublist.nPendingPages; - metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; - } - } - else - { - /* - * Insert into tail page. Metapage is already locked - */ - OffsetNumber l, - off; - uint32 i; - Size tupsize; - - metapage = GenericXLogRegisterBuffer(state, metabuffer, 0); - metadata = RumPageGetMeta(metapage); - - buffer = ReadBuffer(index, metadata->tail); - LockBuffer(buffer, RUM_EXCLUSIVE); - page = GenericXLogRegisterBuffer(state, buffer, 0); - - off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : - OffsetNumberNext(PageGetMaxOffsetNumber(page)); - - /* - * Increase counter of heap tuples - */ - Assert(RumPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); - RumPageGetOpaque(page)->maxoff++; - metadata->nPendingHeapTuples++; - - for (i = 0; i < collector->ntuples; i++) - { - tupsize = IndexTupleSize(collector->tuples[i]); - l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); - - if (l == InvalidOffsetNumber) - { - GenericXLogAbort(state); - elog(ERROR, "failed to add item to index page in \"%s\"", - RelationGetRelationName(index)); - } - - off++; - } - - metadata->tailFreeSize = PageGetExactFreeSpace(page); - } - - /* - * Force pending list cleanup when it becomes too long. And, - * rumInsertCleanup could take significant amount of time, so we prefer to - * call it when it can do all the work in a single collection cycle. In - * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it - * while pending list is still small enough to fit into work_mem. - * - * rumInsertCleanup() should not be called inside our CRIT_SECTION. - */ - if (metadata->nPendingPages * RUM_PAGE_FREESIZE > work_mem * 1024L) - needCleanup = true; - - GenericXLogFinish(state); - - if (buffer != InvalidBuffer) - UnlockReleaseBuffer(buffer); - - UnlockReleaseBuffer(metabuffer); - - if (needCleanup) - rumInsertCleanup(rumstate, false, NULL); -} - -static IndexTuple -RumFastFormTuple(RumState * rumstate, - OffsetNumber attnum, Datum key, RumNullCategory category, - Datum addInfo, - bool addInfoIsNull) -{ - Datum datums[3]; - bool isnull[3]; - IndexTuple itup; - Size newsize; - - /* Build the basic tuple: optional column number, plus key datum */ - - if (rumstate->oneCol) - { - datums[0] = key; - isnull[0] = (category != RUM_CAT_NORM_KEY); - datums[1] = addInfo; - isnull[1] = addInfoIsNull; - } - else - { - datums[0] = UInt16GetDatum(attnum); - isnull[0] = false; - datums[1] = key; - isnull[1] = (category != RUM_CAT_NORM_KEY); - datums[2] = addInfo; - isnull[2] = addInfoIsNull; - } - - itup = index_form_tuple(rumstate->tupdesc[attnum - 1], datums, isnull); - - /* - * Place category to the last byte of index tuple extending it's size if - * needed - */ - newsize = IndexTupleSize(itup); - - if (category != RUM_CAT_NORM_KEY) - { - Size minsize; - - Assert(IndexTupleHasNulls(itup)); - minsize = IndexInfoFindDataOffset(itup->t_info) + - heap_compute_data_size(rumstate->tupdesc[attnum - 1], datums, isnull) + - sizeof(RumNullCategory); - newsize = Max(newsize, minsize); - } - - newsize = MAXALIGN(newsize); - - if (newsize > RumMaxItemSize) - { - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", - (unsigned long) newsize, - (unsigned long) RumMaxItemSize, - RelationGetRelationName(rumstate->index)))); - pfree(itup); - return NULL; - } - - /* - * Resize tuple if needed - */ - if (newsize != IndexTupleSize(itup)) - { - itup = repalloc(itup, newsize); - - /* set new size in tuple header */ - itup->t_info &= ~INDEX_SIZE_MASK; - itup->t_info |= newsize; - } - - /* - * Insert category byte, if needed - */ - if (category != RUM_CAT_NORM_KEY) - { - Assert(IndexTupleHasNulls(itup)); - RumSetNullCategory(itup, category); - } - - return itup; -} - - -/* - * Create temporary index tuples for a single indexable item (one index column - * for the heap tuple specified by ht_ctid), and append them to the array - * in *collector. 
They will subsequently be written out using - * rumHeapTupleFastInsert. Note that to guarantee consistent state, all - * temp tuples for a given heap tuple must be written in one call to - * rumHeapTupleFastInsert. - */ -void -rumHeapTupleFastCollect(RumState * rumstate, - RumTupleCollector * collector, - OffsetNumber attnum, Datum value, bool isNull, - ItemPointer ht_ctid) -{ - Datum *entries; - RumNullCategory *categories; - int32 i, - nentries; - Datum *addInfo; - bool *addInfoIsNull; - - /* - * Extract the key values that need to be inserted in the index - */ - entries = rumExtractEntries(rumstate, attnum, value, isNull, - &nentries, &categories, &addInfo, &addInfoIsNull); - - /* - * Allocate/reallocate memory for storing collected tuples - */ - if (collector->tuples == NULL) - { - collector->lentuples = nentries * rumstate->origTupdesc->natts; - collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples); - } - - while (collector->ntuples + nentries > collector->lentuples) - { - collector->lentuples *= 2; - collector->tuples = (IndexTuple *) repalloc(collector->tuples, - sizeof(IndexTuple) * collector->lentuples); - } - - /* - * Build an index tuple for each key value, and add to array. In pending - * tuples we just stick the heap TID into t_tid. - */ - for (i = 0; i < nentries; i++) - { - IndexTuple itup; - - itup = RumFastFormTuple(rumstate, attnum, entries[i], categories[i], addInfo[i], addInfoIsNull[i]); - itup->t_tid = *ht_ctid; - collector->tuples[collector->ntuples++] = itup; - collector->sumsize += IndexTupleSize(itup); - } -} - -/* - * Deletes pending list pages up to (not including) newHead page. - * If newHead == InvalidBlockNumber then function drops the whole list. - * - * metapage is pinned and exclusive-locked throughout this function. 
- * - * Returns true if another cleanup process is running concurrently - * (if so, we can just abandon our own efforts) - */ -static bool -shiftList(RumState *rumstate, Buffer metabuffer, BlockNumber newHead, - IndexBulkDeleteResult *stats) -{ - Page metapage; - RumMetaPageData *metadata; - BlockNumber blknoToDelete; - GenericXLogState *metastate; - - metastate = GenericXLogStart(rumstate->index); - metapage = GenericXLogRegisterBuffer(metastate, metabuffer, - GENERIC_XLOG_FULL_IMAGE); - metadata = RumPageGetMeta(metapage); - blknoToDelete = metadata->head; - - do - { - Page page; - int64 nDeletedHeapTuples = 0; - uint32 i, - nDeleted = 0; - Buffer buffers[RUM_NDELETE_AT_ONCE]; - GenericXLogState *state; - - while (nDeleted < RUM_NDELETE_AT_ONCE && blknoToDelete != newHead) - { - buffers[nDeleted] = ReadBuffer(rumstate->index, blknoToDelete); - LockBuffer(buffers[nDeleted], RUM_EXCLUSIVE); - - page = BufferGetPage(buffers[nDeleted]); - - nDeleted++; - - if (RumPageIsDeleted(page)) - { - GenericXLogAbort(metastate); - /* concurrent cleanup process is detected */ - for (i = 0; i < nDeleted; i++) - UnlockReleaseBuffer(buffers[i]); - - return true; - } - - nDeletedHeapTuples += RumPageGetOpaque(page)->maxoff; - blknoToDelete = RumPageGetOpaque(page)->rightlink; - } - - if (stats) - stats->pages_deleted += nDeleted; - - metadata->head = blknoToDelete; - - Assert(metadata->nPendingPages >= nDeleted); - metadata->nPendingPages -= nDeleted; - Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); - metadata->nPendingHeapTuples -= nDeletedHeapTuples; - - if (blknoToDelete == InvalidBlockNumber) - { - metadata->tail = InvalidBlockNumber; - metadata->tailFreeSize = 0; - metadata->nPendingPages = 0; - metadata->nPendingHeapTuples = 0; - } - -// MarkBufferDirty(metabuffer); - - for (i = 0; i < nDeleted; i++) - { - state = GenericXLogStart(rumstate->index); - page = GenericXLogRegisterBuffer(state, buffers[i], 0); - - RumPageGetOpaque(page)->flags = RUM_DELETED; - GenericXLogFinish(state); - } - - for (i = 0; i < nDeleted; i++) - UnlockReleaseBuffer(buffers[i]); - } while (blknoToDelete != newHead); - - GenericXLogFinish(metastate); - - return false; -} - -/* Initialize empty KeyArray */ -static void -initKeyArray(KeyArray *keys, int32 maxvalues) -{ - keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues); - keys->addInfo = (Datum *) palloc(sizeof(Datum) * maxvalues); - keys->addInfoIsNull = (bool *) palloc(sizeof(bool) * maxvalues); - keys->categories = (RumNullCategory *) - palloc(sizeof(RumNullCategory) * maxvalues); - keys->nvalues = 0; - keys->maxvalues = maxvalues; -} - -/* Add datum to KeyArray, resizing if needed */ -static void -addDatum(KeyArray *keys, Datum datum, Datum addInfo, bool addInfoIsNull, RumNullCategory category) -{ - if (keys->nvalues >= keys->maxvalues) - { - keys->maxvalues *= 2; - keys->keys = (Datum *) - repalloc(keys->keys, sizeof(Datum) * keys->maxvalues); - keys->addInfo = (Datum *) - repalloc(keys->addInfo, sizeof(Datum) * keys->maxvalues); - keys->addInfoIsNull = (bool *) - repalloc(keys->addInfoIsNull, sizeof(bool) * keys->maxvalues); - keys->categories = (RumNullCategory *) - repalloc(keys->categories, sizeof(RumNullCategory) * keys->maxvalues); - } - - keys->keys[keys->nvalues] = datum; - keys->categories[keys->nvalues] = category; - keys->addInfo[keys->nvalues] = addInfo; - keys->addInfoIsNull[keys->nvalues] = addInfoIsNull; - keys->nvalues++; -} - -/* - * Collect data from a pending-list page in preparation for insertion into - * the main index. 
- * - * Go through all tuples >= startoff on page and collect values in accum - * - * Note that ka is just workspace --- it does not carry any state across - * calls. - */ -static void -processPendingPage(BuildAccumulator *accum, KeyArray *ka, - Page page, OffsetNumber startoff) -{ - ItemPointerData heapptr; - OffsetNumber i, - maxoff; - OffsetNumber attrnum; - - /* reset *ka to empty */ - ka->nvalues = 0; - - maxoff = PageGetMaxOffsetNumber(page); - Assert(maxoff >= FirstOffsetNumber); - ItemPointerSetInvalid(&heapptr); - attrnum = 0; - - for (i = startoff; i <= maxoff; i = OffsetNumberNext(i)) - { - IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); - OffsetNumber curattnum; - Datum curkey, - addInfo = 0; - bool addInfoIsNull = true; - RumNullCategory curcategory; - - /* Check for change of heap TID or attnum */ - curattnum = rumtuple_get_attrnum(accum->rumstate, itup); - - if (OidIsValid(accum->rumstate->addInfoTypeOid[curattnum - 1])) - { - Form_pg_attribute attr = accum->rumstate->addAttrs[curattnum - 1]; - Assert(attr); - - if (accum->rumstate->oneCol) - addInfo = index_getattr(itup, 2, - accum->rumstate->tupdesc[curattnum - 1], &addInfoIsNull); - else - addInfo = index_getattr(itup, 3, - accum->rumstate->tupdesc[curattnum - 1], &addInfoIsNull); - addInfo = datumCopy(addInfo, attr->attbyval, attr->attlen); - } - - if (!ItemPointerIsValid(&heapptr)) - { - heapptr = itup->t_tid; - attrnum = curattnum; - } - else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) && - curattnum == attrnum)) - { - /* - * rumInsertBAEntries can insert several datums per call, but only - * for one heap tuple and one column. So call it at a boundary, - * and reset ka. - */ - rumInsertBAEntries(accum, &heapptr, attrnum, - ka->keys, ka->addInfo, ka->addInfoIsNull, ka->categories, ka->nvalues); - ka->nvalues = 0; - heapptr = itup->t_tid; - attrnum = curattnum; - } - - /* Add key to KeyArray */ - curkey = rumtuple_get_key(accum->rumstate, itup, &curcategory); - addDatum(ka, curkey, addInfo, addInfoIsNull, curcategory); - } - - /* Dump out all remaining keys */ - rumInsertBAEntries(accum, &heapptr, attrnum, - ka->keys, ka->addInfo, ka->addInfoIsNull, ka->categories, ka->nvalues); -} - -/* - * Move tuples from pending pages into regular RUM structure. - * - * This can be called concurrently by multiple backends, so it must cope. - * On first glance it looks completely not concurrent-safe and not crash-safe - * either. The reason it's okay is that multiple insertion of the same entry - * is detected and treated as a no-op by ruminsert.c. If we crash after - * posting entries to the main index and before removing them from the - * pending list, it's okay because when we redo the posting later on, nothing - * bad will happen. Likewise, if two backends simultaneously try to post - * a pending entry into the main index, one will succeed and one will do - * nothing. We try to notice when someone else is a little bit ahead of - * us in the process, but that's just to avoid wasting cycles. Only the - * action of removing a page from the pending list really needs exclusive - * lock. - * - * vac_delay indicates that rumInsertCleanup is called from vacuum process, - * so call vacuum_delay_point() periodically. - * If stats isn't null, we count deleted pending pages into the counts. 
- */ -void -rumInsertCleanup(RumState * rumstate, - bool vac_delay, IndexBulkDeleteResult *stats) -{ - Relation index = rumstate->index; - Buffer metabuffer, - buffer; - Page metapage, - page; - RumMetaPageData *metadata; - MemoryContext opCtx, - oldCtx; - BuildAccumulator accum; - KeyArray datums; - BlockNumber blkno; - - metabuffer = ReadBuffer(index, RUM_METAPAGE_BLKNO); - LockBuffer(metabuffer, RUM_SHARE); - - metapage = BufferGetPage(metabuffer); - metadata = RumPageGetMeta(metapage); - - if (metadata->head == InvalidBlockNumber) - { - /* Nothing to do */ - UnlockReleaseBuffer(metabuffer); - return; - } - - /* - * Read and lock head of pending list - */ - blkno = metadata->head; - buffer = ReadBuffer(index, blkno); - LockBuffer(buffer, RUM_SHARE); - page = BufferGetPage(buffer); - - LockBuffer(metabuffer, RUM_UNLOCK); - - /* - * Initialize. All temporary space will be in opCtx - */ - opCtx = AllocSetContextCreate(CurrentMemoryContext, - "RUM insert cleanup temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - - oldCtx = MemoryContextSwitchTo(opCtx); - - initKeyArray(&datums, 128); - rumInitBA(&accum); - accum.rumstate = rumstate; - - /* - * At the top of this loop, we have pin and lock on the current page of - * the pending list. However, we'll release that before exiting the loop. - * Note we also have pin but not lock on the metapage. - */ - for (;;) - { - if (RumPageIsDeleted(page)) - { - /* another cleanup process is running concurrently */ - UnlockReleaseBuffer(buffer); - break; - } - - /* - * read page's datums into accum - */ - processPendingPage(&accum, &datums, page, FirstOffsetNumber); - - vacuum_delay_point(); - - /* - * Is it time to flush memory to disk? Flush if we are at the end of - * the pending list, or if we have a full row and memory is getting - * full. - * - * XXX using up maintenance_work_mem here is probably unreasonably - * much, since vacuum might already be using that much. - */ - if (RumPageGetOpaque(page)->rightlink == InvalidBlockNumber || - (RumPageHasFullRow(page) && - (accum.allocatedMemory >= maintenance_work_mem * 1024L))) - { - RumKey *items; - uint32 nlist; - Datum key; - RumNullCategory category; - OffsetNumber maxoff, - attnum; - - /* - * Unlock current page to increase performance. Changes of page - * will be checked later by comparing maxoff after completion of - * memory flush. - */ - maxoff = PageGetMaxOffsetNumber(page); - LockBuffer(buffer, RUM_UNLOCK); - - /* - * Moving collected data into regular structure can take - * significant amount of time - so, run it without locking pending - * list. - */ - rumBeginBAScan(&accum); - while ((items = rumGetBAEntry(&accum, - &attnum, &key, &category, &nlist)) != NULL) - { - rumEntryInsert(rumstate, attnum, key, category, - items, nlist, NULL); - vacuum_delay_point(); - } - - /* - * Lock the whole list to remove pages - */ - LockBuffer(metabuffer, RUM_EXCLUSIVE); - LockBuffer(buffer, RUM_SHARE); - - if (RumPageIsDeleted(page)) - { - /* another cleanup process is running concurrently */ - UnlockReleaseBuffer(buffer); - LockBuffer(metabuffer, RUM_UNLOCK); - break; - } - - /* - * While we left the page unlocked, more stuff might have gotten - * added to it. If so, process those entries immediately. There - * shouldn't be very many, so we don't worry about the fact that - * we're doing this with exclusive lock. Insertion algorithm - * guarantees that inserted row(s) will not continue on next page. 
- * NOTE: intentionally no vacuum_delay_point in this loop. - */ - if (PageGetMaxOffsetNumber(page) != maxoff) - { - rumInitBA(&accum); - processPendingPage(&accum, &datums, page, maxoff + 1); - - rumBeginBAScan(&accum); - while ((items = rumGetBAEntry(&accum, - &attnum, &key, &category, &nlist)) != NULL) - { - rumEntryInsert(rumstate, attnum, key, category, - items, nlist, NULL); - } - } - - /* - * Remember next page - it will become the new list head - */ - blkno = RumPageGetOpaque(page)->rightlink; - UnlockReleaseBuffer(buffer); /* shiftList will do exclusive - * locking */ - - /* - * remove read pages from pending list, at this point all content - * of read pages is in regular structure - */ - if (shiftList(rumstate, metabuffer, blkno, stats)) - { - /* another cleanup process is running concurrently */ - LockBuffer(metabuffer, RUM_UNLOCK); - break; - } - - Assert(blkno == metadata->head); - LockBuffer(metabuffer, RUM_UNLOCK); - - /* - * if we removed the whole pending list just exit - */ - if (blkno == InvalidBlockNumber) - break; - - /* - * release memory used so far and reinit state - */ - MemoryContextReset(opCtx); - initKeyArray(&datums, datums.maxvalues); - rumInitBA(&accum); - } - else - { - blkno = RumPageGetOpaque(page)->rightlink; - UnlockReleaseBuffer(buffer); - } - - /* - * Read next page in pending list - */ - vacuum_delay_point(); - buffer = ReadBuffer(index, blkno); - LockBuffer(buffer, RUM_SHARE); - page = BufferGetPage(buffer); - } - - ReleaseBuffer(metabuffer); - - /* Clean up temporary space */ - MemoryContextSwitchTo(oldCtx); - MemoryContextDelete(opCtx); -} diff --git a/src/rumget.c b/src/rumget.c index cf6c60c46e..84b3697dc7 100644 --- a/src/rumget.c +++ b/src/rumget.c @@ -4,8 +4,8 @@ * fetch tuples from a RUM scan. * * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- @@ -15,22 +15,127 @@ #include "rumsort.h" #include "access/relscan.h" +#include "storage/predicate.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/datum.h" -#include "utils/memutils.h" - +#if PG_VERSION_NUM >= 120000 +#include "utils/float.h" +#endif +#if PG_VERSION_NUM >= 150000 +#include "common/pg_prng.h" +#endif #include "rum.h" /* GUC parameter */ int RumFuzzySearchLimit = 0; -static bool scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, +static bool scanPage(RumState * rumstate, RumScanEntry entry, RumItem *item, bool equalOk); static void insertScanItem(RumScanOpaque so, bool recheck); static int scan_entry_cmp(const void *p1, const void *p2, void *arg); -static void entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList); +static void entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList, Snapshot snapshot); +/* + * Extract key value for ordering. + * + * XXX FIXME only pass-by-value!!! Value should be copied to + * long-lived memory context and, somehow, freeed. Seems, the + * last is real problem. + */ +#define SCAN_ENTRY_GET_KEY(entry, rumstate, itup) \ +do { \ + if ((entry)->useCurKey) \ + (entry)->curKey = rumtuple_get_key(rumstate, itup, &(entry)->curKeyCategory); \ +} while(0) + +/* + * Assign key value for ordering. 
+ *
+ * XXX FIXME: pass-by-value only! The value should be copied to a
+ * long-lived memory context and, somehow, freed. The latter seems to
+ * be the real problem.
+ */
+#define SCAN_ITEM_PUT_KEY(entry, item, key, category) \
+do { \
+    if ((entry)->useCurKey) \
+    { \
+        (item).keyValue = key; \
+        (item).keyCategory = category; \
+    } \
+} while(0)
+
+static bool
+callAddInfoConsistentFn(RumState * rumstate, RumScanKey key)
+{
+    uint32      i;
+    bool        res = true;
+
+    /* it should be true for a search key, but it could be false for an order key */
+    Assert(key->attnum == key->attnumOrig);
+
+    if (key->attnum != rumstate->attrnAddToColumn)
+        return true;
+
+    /*
+     * remember some addinfo value for later ordering by addinfo from
+     * another column
+     */
+
+    key->outerAddInfoIsNull = true;
+
+    if (key->addInfoKeys == NULL && key->willSort == false)
+        return true;
+
+    for (i = 0; i < key->nentries; i++)
+    {
+        if (key->entryRes[i] && key->addInfoIsNull[i] == false)
+        {
+            key->outerAddInfoIsNull = false;
+
+            /*
+             * XXX FIXME: pass-by-value only! The value should be copied to a
+             * long-lived memory context and, somehow, freed. The latter seems
+             * to be the real problem. It matters only for ordering, though;
+             * as a restricting clause the value is used only inside this
+             * function.
+             */
+            key->outerAddInfo = key->addInfo[i];
+            break;
+        }
+    }
+
+    if (key->addInfoKeys)
+    {
+        if (key->outerAddInfoIsNull)
+            res = false;        /* assume strict operator */
+
+        for(i = 0; res && i < key->addInfoNKeys; i++)
+        {
+            RumScanKey  subkey = key->addInfoKeys[i];
+            int         j;
+
+            for(j=0; res && j<subkey->nentries; j++)
+            {
+                RumScanEntry scanSubEntry = subkey->scanEntry[j];
+                int     cmp =
+                    DatumGetInt32(FunctionCall4Coll(
+                        &rumstate->comparePartialFn[scanSubEntry->attnumOrig - 1],
+                        rumstate->supportCollation[scanSubEntry->attnumOrig - 1],
+                        scanSubEntry->queryKey,
+                        key->outerAddInfo,
+                        UInt16GetDatum(scanSubEntry->strategy),
+                        PointerGetDatum(scanSubEntry->extra_data)
+                    ));
+
+                if (cmp != 0)
+                    res = false;
+            }
+        }
+    }
+
+    return res;
+}

 /*
  * Convenience function for invoking a key's consistentFn
@@ -42,6 +147,7 @@ callConsistentFn(RumState * rumstate, RumScanKey key)
     /* it should be true for search key, but it could be false for order key */
     Assert(key->attnum == key->attnumOrig);

+
     /*
      * If we're dealing with a dummy EVERYTHING key, we don't want to call the
      * consistentFn; just claim it matches.
@@ -75,64 +181,7 @@ callConsistentFn(RumState * rumstate, RumScanKey key)
         ));
     }

-    if (res && key->attnum == rumstate->attrnAddToColumn)
-    {
-        uint32      i;
-
-        /*
-         * remember some addinfo value for later ordering by addinfo from
-         * another column
-         */
-
-        key->outerAddInfoIsNull = true;
-
-        for (i = 0; i < key->nentries; i++)
-        {
-            if (key->entryRes[i] && key->addInfoIsNull[i] == false)
-            {
-                key->outerAddInfoIsNull = false;
-
-                /*
-                 * XXX FIXME only pass-by-value!!! Value should be copied to
-                 * long-lived memory context and, somehow, freeed. Seems, the
-                 * last is real problem
-                 */
-                key->outerAddInfo = key->addInfo[i];
-                break;
-            }
-        }
-
-        if (key->addInfoKeys)
-        {
-            if (key->outerAddInfoIsNull)
-                res = false;    /* assume strict operator */
-
-            for(i = 0; res && i < key->addInfoNKeys; i++)
-            {
-                RumScanKey  subkey = key->addInfoKeys[i];
-                int         j;
-
-                for(j=0; res && j<subkey->nentries; j++)
-                {
-                    RumScanEntry scanSubEntry = subkey->scanEntry[j];
-                    int     cmp =
-                        DatumGetInt32(FunctionCall4Coll(
-                            &rumstate->comparePartialFn[scanSubEntry->attnumOrig - 1],
-                            rumstate->supportCollation[scanSubEntry->attnumOrig - 1],
-                            scanSubEntry->queryKey,
-                            key->outerAddInfo,
-                            UInt16GetDatum(scanSubEntry->strategy),
-                            PointerGetDatum(scanSubEntry->extra_data)
-                        ));
-
-                    if (cmp != 0)
-                        res = false;
-                }
-            }
-        }
-    }
-
-    return res;
+    return res && callAddInfoConsistentFn(rumstate, key);
 }

 /*
@@ -166,7 +215,9 @@ moveRightIfItNeeded(RumBtreeData * btree, RumBtreeStack * stack)
  */
 static void
 scanPostingTree(Relation index, RumScanEntry scanEntry,
-                BlockNumber rootPostingTree, OffsetNumber attnum, RumState * rumstate)
+                BlockNumber rootPostingTree, OffsetNumber attnum,
+                RumState * rumstate, Datum idatum, RumNullCategory icategory,
+                Snapshot snapshot)
 {
     RumPostingTreeScan *gdi;
     Buffer      buffer;
@@ -174,12 +225,15 @@ scanPostingTree(Relation index, RumScanEntry scanEntry,
     Assert(ScanDirectionIsForward(scanEntry->scanDirection));

     /* Descend to the leftmost leaf page */
-    gdi = rumPrepareScanPostingTree(index, rootPostingTree, TRUE,
+    gdi = rumPrepareScanPostingTree(index, rootPostingTree, true,
                                     ForwardScanDirection, attnum, rumstate);

     buffer = rumScanBeginPostingTree(gdi, NULL);

+    IncrBufferRefCount(buffer); /* prevent unpin in freeRumBtreeStack */
+    PredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
+
     freeRumBtreeStack(gdi->stack);
     pfree(gdi);

@@ -197,16 +251,19 @@ scanPostingTree(Relation index, RumScanEntry scanEntry,
         if ((RumPageGetOpaque(page)->flags & RUM_DELETED) == 0 &&
             maxoff >= FirstOffsetNumber)
         {
-            RumKey      item;
+            RumScanItem item;
             Pointer     ptr;

-            ItemPointerSetMin(&item.iptr);
+            MemSet(&item, 0, sizeof(item));
+            ItemPointerSetMin(&item.item.iptr);

             ptr = RumDataPageGetData(page);
             for (i = FirstOffsetNumber; i <= maxoff; i++)
             {
-                ptr = rumDataPageLeafRead(ptr, attnum, &item, rumstate);
-                rum_tuplesort_putrumkey(scanEntry->matchSortstate, &item);
+                ptr = rumDataPageLeafRead(ptr, attnum, &item.item, false,
+                                          rumstate);
+                SCAN_ITEM_PUT_KEY(scanEntry, item, idatum, icategory);
+                rum_tuplesort_putrumitem(scanEntry->matchSortstate, &item);
             }

             scanEntry->predictNumberResult += maxoff;
@@ -216,6 +273,9 @@ scanPostingTree(Relation index, RumScanEntry scanEntry,
             break;              /* no more pages */

         buffer = rumStep(buffer, index, RUM_SHARE, ForwardScanDirection);
+
+        PredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
+
     }

     UnlockReleaseBuffer(buffer);
@@ -236,7 +296,7 @@ scanPostingTree(Relation index, RumScanEntry scanEntry,
  */
 static bool
 collectMatchBitmap(RumBtreeData * btree, RumBtreeStack * stack,
-                   RumScanEntry scanEntry)
+                   RumScanEntry scanEntry, Snapshot snapshot)
 {
     OffsetNumber attnum;
     Form_pg_attribute attr;
@@ -246,11 +306,11 @@ collectMatchBitmap(RumBtreeData * btree, RumBtreeStack * stack,
     if (rumstate->useAlternativeOrder &&
         scanEntry->attnumOrig == rumstate->attrnAddToColumn)
     {
-        cmp = &rumstate->compareFn[rumstate->attrnOrderByColumn - 1];
+        cmp = &rumstate->compareFn[rumstate->attrnAttachColumn - 1];
     }

     /* Initialize */
-    scanEntry->matchSortstate = rum_tuplesort_begin_rumkey(work_mem, cmp);
+    scanEntry->matchSortstate =
rum_tuplesort_begin_rumitem(work_mem, cmp); /* Null query cannot partial-match anything */ if (scanEntry->isPartialMatch && @@ -259,7 +319,7 @@ collectMatchBitmap(RumBtreeData * btree, RumBtreeStack * stack, /* Locate tupdesc entry for key column (for attbyval/attlen data) */ attnum = scanEntry->attnumOrig; - attr = rumstate->origTupdesc->attrs[attnum - 1]; + attr = RumTupleDescAttr(rumstate->origTupdesc, attnum - 1); for (;;) { @@ -354,7 +414,8 @@ collectMatchBitmap(RumBtreeData * btree, RumBtreeStack * stack, LockBuffer(stack->buffer, RUM_UNLOCK); /* Collect all the TIDs in this entry's posting tree */ - scanPostingTree(btree->index, scanEntry, rootPostingTree, attnum, rumstate); + scanPostingTree(btree->index, scanEntry, rootPostingTree, attnum, + rumstate, idatum, icategory, snapshot); /* * We lock again the entry page and while it was unlocked insert @@ -404,21 +465,21 @@ collectMatchBitmap(RumBtreeData * btree, RumBtreeStack * stack, { int i; char *ptr = RumGetPosting(itup); - RumKey item; + RumScanItem item; - ItemPointerSetMin(&item.iptr); + MemSet(&item, 0, sizeof(item)); + ItemPointerSetMin(&item.item.iptr); for (i = 0; i < RumGetNPosting(itup); i++) { - ptr = rumDataPageLeafRead(ptr, scanEntry->attnum, &item, - rumstate); - rum_tuplesort_putrumkey(scanEntry->matchSortstate, &item); + ptr = rumDataPageLeafRead(ptr, scanEntry->attnum, &item.item, + true, rumstate); + SCAN_ITEM_PUT_KEY(scanEntry, item, idatum, icategory); + rum_tuplesort_putrumitem(scanEntry->matchSortstate, &item); } scanEntry->predictNumberResult += RumGetNPosting(itup); } - if (scanEntry->forceUseBitmap) - return true; /* * Done with this entry, go to the next */ @@ -448,7 +509,7 @@ setListPositionScanEntry(RumState * rumstate, RumScanEntry entry) int res; entry->offset = StopLow + ((StopHigh - StopLow) >> 1); - res = compareRumKey(rumstate, entry->attnumOrig, &entry->markAddInfo, + res = compareRumItem(rumstate, entry->attnumOrig, &entry->markAddInfo, entry->list + entry->offset); if (res < 0) @@ -480,7 +541,7 @@ setListPositionScanEntry(RumState * rumstate, RumScanEntry entry) * Start* functions setup beginning state of searches: finds correct buffer and pins it. 
*/ static void -startScanEntry(RumState * rumstate, RumScanEntry entry) +startScanEntry(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) { RumBtreeData btreeEntry; RumBtreeStack *stackEntry; @@ -489,14 +550,14 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) restartScanEntry: entry->buffer = InvalidBuffer; - RumItemSetMin(&entry->curRumKey); + RumItemSetMin(&entry->curItem); entry->offset = InvalidOffsetNumber; entry->list = NULL; entry->gdi = NULL; entry->stack = NULL; entry->nlist = 0; entry->matchSortstate = NULL; - entry->reduceResult = FALSE; + entry->reduceResult = false; entry->predictNumberResult = 0; /* @@ -506,14 +567,16 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) rumPrepareEntryScan(&btreeEntry, entry->attnum, entry->queryKey, entry->queryCategory, rumstate); - btreeEntry.searchMode = TRUE; + btreeEntry.searchMode = true; stackEntry = rumFindLeafPage(&btreeEntry, NULL); page = BufferGetPage(stackEntry->buffer); - needUnlock = TRUE; + needUnlock = true; + + entry->isFinished = true; - entry->isFinished = TRUE; + PredicateLockPage(rumstate->index, BufferGetBlockNumber(stackEntry->buffer), snapshot); - if (entry->isPartialMatch || entry->forceUseBitmap || + if (entry->isPartialMatch || (entry->queryCategory == RUM_CAT_EMPTY_QUERY && !entry->scanWithAddInfo)) { @@ -525,7 +588,7 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) * for the entry type. */ btreeEntry.findItem(&btreeEntry, stackEntry); - if (collectMatchBitmap(&btreeEntry, stackEntry, entry) == false) + if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot) == false) { /* * RUM tree was seriously restructured, so we will cleanup all @@ -545,25 +608,34 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) if (entry->matchSortstate) { rum_tuplesort_performsort(entry->matchSortstate); - ItemPointerSetMin(&entry->collectRumKey.iptr); - entry->isFinished = FALSE; + ItemPointerSetMin(&entry->collectRumItem.item.iptr); + entry->isFinished = false; } } else if (btreeEntry.findItem(&btreeEntry, stackEntry) || (entry->queryCategory == RUM_CAT_EMPTY_QUERY && entry->scanWithAddInfo)) { - IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); + IndexTuple itup; + ItemId itemid = PageGetItemId(page, stackEntry->off); + + /* + * We don't want to crash if line pointer is not used. + */ + if (entry->queryCategory == RUM_CAT_EMPTY_QUERY && + !ItemIdHasStorage(itemid)) + goto endScanEntry; + + itup = (IndexTuple) PageGetItem(page, itemid); if (RumIsPostingTree(itup)) { BlockNumber rootPostingTree = RumGetPostingTree(itup); RumPostingTreeScan *gdi; - Page page; OffsetNumber maxoff, i; Pointer ptr; - RumKey item; + RumItem item; ItemPointerSetMin(&item.iptr); @@ -575,19 +647,16 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) * root of posting tree. */ LockBuffer(stackEntry->buffer, RUM_UNLOCK); - needUnlock = FALSE; - gdi = rumPrepareScanPostingTree(rumstate->index, rootPostingTree, TRUE, + needUnlock = false; + gdi = rumPrepareScanPostingTree(rumstate->index, rootPostingTree, true, entry->scanDirection, entry->attnum, rumstate); entry->buffer = rumScanBeginPostingTree(gdi, entry->useMarkAddInfo ? 
&entry->markAddInfo : NULL); entry->gdi = gdi; - entry->context = AllocSetContextCreate(CurrentMemoryContext, - "RUM entry temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + + PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot); /* * We keep buffer pinned because we need to prevent deletion of @@ -600,7 +669,7 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) /* * Keep page content in memory to prevent durable page locking */ - entry->list = (RumKey *) palloc(BLCKSZ * sizeof(RumKey)); + entry->list = (RumItem *) palloc(BLCKSZ * sizeof(RumItem)); maxoff = RumPageGetOpaque(page)->maxoff; entry->nlist = maxoff; @@ -608,32 +677,36 @@ startScanEntry(RumState * rumstate, RumScanEntry entry) for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, true, + rumstate); entry->list[i - FirstOffsetNumber] = item; } LockBuffer(entry->buffer, RUM_UNLOCK); entry->isFinished = setListPositionScanEntry(rumstate, entry); if (!entry->isFinished) - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; } else if (RumGetNPosting(itup) > 0) { entry->nlist = RumGetNPosting(itup); - entry->predictNumberResult = entry->nlist; - entry->list = (RumKey *) palloc(sizeof(RumKey) * entry->nlist); + entry->predictNumberResult = (uint32)entry->nlist; + entry->list = (RumItem *) palloc(sizeof(RumItem) * entry->nlist); - rumReadTuple(rumstate, entry->attnum, itup, entry->list); + rumReadTuple(rumstate, entry->attnum, itup, entry->list, true); entry->isFinished = setListPositionScanEntry(rumstate, entry); if (!entry->isFinished) - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; } if (entry->queryCategory == RUM_CAT_EMPTY_QUERY && entry->scanWithAddInfo) entry->stack = stackEntry; + + SCAN_ENTRY_GET_KEY(entry, rumstate, itup); } +endScanEntry: if (needUnlock) LockBuffer(stackEntry->buffer, RUM_UNLOCK); if (entry->stack == NULL) @@ -658,9 +731,9 @@ cmpEntries(RumState *rumstate, RumScanEntry e1, RumScanEntry e2) { int res; - if (e1->isFinished == TRUE) + if (e1->isFinished == true) { - if (e2->isFinished == TRUE) + if (e2->isFinished == true) return 0; else return 1; @@ -671,8 +744,8 @@ cmpEntries(RumState *rumstate, RumScanEntry e1, RumScanEntry e2) if (e1->attnumOrig != e2->attnumOrig) return (e1->attnumOrig < e2->attnumOrig) ? 1 : -1; - res = compareRumKey(rumstate, e1->attnumOrig, &e1->curRumKey, - &e2->curRumKey); + res = compareRumItem(rumstate, e1->attnumOrig, &e1->curItem, + &e2->curItem); return (ScanDirectionIsForward(e1->scanDirection)) ? 
        res : -res;
 }

@@ -693,12 +766,12 @@ startScan(IndexScanDesc scan)
     RumScanOpaque so = (RumScanOpaque) scan->opaque;
     RumState   *rumstate = &so->rumstate;
     uint32      i;
-    RumScanType scanType = RumRegularScan;
+    RumScanType scanType = RumFastScan;

     MemoryContextSwitchTo(so->keyCtx);
     for (i = 0; i < so->totalentries; i++)
     {
-        startScanEntry(rumstate, so->entries[i]);
+        startScanEntry(rumstate, so->entries[i], scan->xs_snapshot);
     }
     MemoryContextSwitchTo(oldCtx);

@@ -725,7 +798,7 @@
         for (i = 0; i < so->totalentries; i++)
         {
             so->entries[i]->predictNumberResult /= so->totalentries;
-            so->entries[i]->reduceResult = TRUE;
+            so->entries[i]->reduceResult = true;
         }
     }
 }

@@ -734,8 +807,10 @@
         startScanKey(rumstate, so->keys[i]);

     /*
-     * Check if we can use a fast scan: should exists at least one
-     * preConsistent method.
+     * Check if we can use a fast scan.
+     * Use a fast scan iff every key has a preConsistent method. We can
+     * stop checking as soon as one key lacks a preConsistent method and
+     * fall back to a regular scan.
      */
     for (i = 0; i < so->nkeys; i++)
     {
@@ -747,9 +822,10 @@
             scanType = RumFullScan;
             break;
         }
-        else if (so->rumstate.canPreConsistent[key->attnum - 1])
+        /* Otherwise check whether the key has a preConsistent method */
+        else if (!so->rumstate.canPreConsistent[key->attnum - 1])
         {
-            scanType = RumFastScan;
+            scanType = RumRegularScan;
             break;
         }
     }
@@ -760,7 +836,7 @@
         {
             RumScanEntry entry = so->entries[i];

-            if (entry->isPartialMatch || entry->forceUseBitmap)
+            if (entry->isPartialMatch)
             {
                 scanType = RumRegularScan;
                 break;
@@ -768,7 +844,7 @@
         }
     }

-    ItemPointerSetInvalid(&so->key.iptr);
+    ItemPointerSetInvalid(&so->item.iptr);

     if (scanType == RumFastScan)
     {
@@ -783,7 +859,7 @@
         for (i = 0; i < so->totalentries; i++)
         {
             if (!so->sortedEntries[i]->isFinished)
-                entryGetItem(&so->rumstate, so->sortedEntries[i], NULL);
+                entryGetItem(&so->rumstate, so->sortedEntries[i], NULL, scan->xs_snapshot);
         }
         qsort_arg(so->sortedEntries, so->totalentries, sizeof(RumScanEntry),
                   scan_entry_cmp, rumstate);
@@ -798,7 +874,7 @@
  * to prevent interference with vacuum
  */
 static void
-entryGetNextItem(RumState * rumstate, RumScanEntry entry)
+entryGetNextItem(RumState * rumstate, RumScanEntry entry, Snapshot snapshot)
 {
     Page        page;

@@ -806,7 +882,7 @@
     {
         if (entry->offset >= 0 && entry->offset < entry->nlist)
         {
-            entry->curRumKey = entry->list[entry->offset];
+            entry->curItem = entry->list[entry->offset];
             entry->offset += entry->scanDirection;
             return;
         }
@@ -814,7 +890,9 @@
         LockBuffer(entry->buffer, RUM_SHARE);
         page = BufferGetPage(entry->buffer);

-        if (scanPage(rumstate, entry, &entry->curRumKey, false))
+        PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
+
+        if (scanPage(rumstate, entry, &entry->curItem, false))
         {
             LockBuffer(entry->buffer, RUM_UNLOCK);
             return;
@@ -825,10 +903,10 @@
             OffsetNumber maxoff,
                         i;
             Pointer     ptr;
-            RumKey      item;
+            RumItem     item;
             bool        searchBorder = (ScanDirectionIsForward(entry->scanDirection) &&
-                            ItemPointerIsValid(&entry->curRumKey.iptr));
+                            ItemPointerIsValid(&entry->curItem.iptr));

             /*
              * It's needed to go by right link.
During that we should refind * first ItemPointer greater that stored @@ -837,10 +915,10 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry) (ScanDirectionIsBackward(entry->scanDirection) && RumPageLeftMost(page))) { UnlockReleaseBuffer(entry->buffer); - ItemPointerSetInvalid(&entry->curRumKey.iptr); + ItemPointerSetInvalid(&entry->curItem.iptr); entry->buffer = InvalidBuffer; - entry->isFinished = TRUE; + entry->isFinished = true; entry->gdi->stack->buffer = InvalidBuffer; return; } @@ -851,6 +929,8 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry) entry->gdi->stack->blkno = BufferGetBlockNumber(entry->buffer); page = BufferGetPage(entry->buffer); + PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot); + entry->offset = -1; maxoff = RumPageGetOpaque(page)->maxoff; entry->nlist = maxoff; @@ -859,17 +939,18 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry) for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, true, + rumstate); entry->list[i - FirstOffsetNumber] = item; if (searchBorder) { /* don't search position for backward scan, because of split algorithm */ - int cmp = compareRumKey(rumstate, - entry->attnumOrig, - &entry->curRumKey, - &item); + int cmp = compareRumItem(rumstate, + entry->attnumOrig, + &entry->curItem, + &item); if (cmp > 0) { @@ -884,14 +965,14 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry) if (entry->offset < 0) { if (ScanDirectionIsForward(entry->scanDirection) && - ItemPointerIsValid(&entry->curRumKey.iptr)) + ItemPointerIsValid(&entry->curItem.iptr)) /* go on next page */ break; entry->offset = (ScanDirectionIsForward(entry->scanDirection)) ? 
0 : entry->nlist - 1; } - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; entry->offset += entry->scanDirection; return; } @@ -899,7 +980,7 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry) } static bool -entryGetNextItemList(RumState * rumstate, RumScanEntry entry) +entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) { Page page; IndexTuple itup; @@ -911,7 +992,7 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) Assert(ScanDirectionIsForward(entry->scanDirection)); entry->buffer = InvalidBuffer; - RumItemSetMin(&entry->curRumKey); + RumItemSetMin(&entry->curItem); entry->offset = InvalidOffsetNumber; entry->list = NULL; if (entry->gdi) @@ -927,7 +1008,7 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) entry->nlist = 0; } entry->matchSortstate = NULL; - entry->reduceResult = FALSE; + entry->reduceResult = false; entry->predictNumberResult = 0; rumPrepareEntryScan(&btree, entry->attnum, @@ -940,8 +1021,8 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) */ if (!moveRightIfItNeeded(&btree, entry->stack)) { - ItemPointerSetInvalid(&entry->curRumKey.iptr); - entry->isFinished = TRUE; + ItemPointerSetInvalid(&entry->curItem.iptr); + entry->isFinished = true; LockBuffer(entry->stack->buffer, RUM_UNLOCK); return false; } @@ -956,8 +1037,8 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) */ if (rumtuple_get_attrnum(btree.rumstate, itup) != entry->attnum) { - ItemPointerSetInvalid(&entry->curRumKey.iptr); - entry->isFinished = TRUE; + ItemPointerSetInvalid(&entry->curItem.iptr); + entry->isFinished = true; LockBuffer(entry->stack->buffer, RUM_UNLOCK); return false; } @@ -969,11 +1050,10 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) { BlockNumber rootPostingTree = RumGetPostingTree(itup); RumPostingTreeScan *gdi; - Page page; OffsetNumber maxoff, i; Pointer ptr; - RumKey item; + RumItem item; ItemPointerSetMin(&item.iptr); @@ -987,16 +1067,13 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) LockBuffer(entry->stack->buffer, RUM_UNLOCK); needUnlock = false; gdi = rumPrepareScanPostingTree(rumstate->index, - rootPostingTree, TRUE, entry->scanDirection, + rootPostingTree, true, entry->scanDirection, entry->attnumOrig, rumstate); entry->buffer = rumScanBeginPostingTree(gdi, NULL); entry->gdi = gdi; - entry->context = AllocSetContextCreate(CurrentMemoryContext, - "RUM entry temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + + PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot); /* * We keep buffer pinned because we need to prevent deletion of @@ -1010,7 +1087,7 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) /* * Keep page content in memory to prevent durable page locking */ - entry->list = (RumKey *) palloc(BLCKSZ * sizeof(RumKey)); + entry->list = (RumItem *) palloc(BLCKSZ * sizeof(RumItem)); maxoff = RumPageGetOpaque(page)->maxoff; entry->nlist = maxoff; @@ -1018,28 +1095,31 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, true, + rumstate); entry->list[i - FirstOffsetNumber] = item; } LockBuffer(entry->buffer, RUM_UNLOCK); - entry->isFinished = FALSE; + entry->isFinished = false; } else if (RumGetNPosting(itup) > 0) { 
entry->nlist = RumGetNPosting(itup); - entry->predictNumberResult = entry->nlist; - entry->list = (RumKey *) palloc(sizeof(RumKey) * entry->nlist); + entry->predictNumberResult = (uint32)entry->nlist; + entry->list = (RumItem *) palloc(sizeof(RumItem) * entry->nlist); - rumReadTuple(rumstate, entry->attnum, itup, entry->list); + rumReadTuple(rumstate, entry->attnum, itup, entry->list, true); entry->isFinished = setListPositionScanEntry(rumstate, entry); } - Assert(entry->nlist > 0); + Assert(entry->nlist > 0 && entry->list); - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; entry->offset += entry->scanDirection; + SCAN_ENTRY_GET_KEY(entry, rumstate, itup); + /* * Done with this entry, go to the next for the future. */ @@ -1051,17 +1131,22 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry) return true; } +#if PG_VERSION_NUM < 150000 #define rum_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) +#else +#define rum_rand() pg_prng_double(&pg_global_prng_state) +#endif + #define dropItem(e) ( rum_rand() > ((double)RumFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* * Sets entry->curItem to next heap item pointer for one entry of one scan key, - * or sets entry->isFinished to TRUE if there are no more. + * or sets entry->isFinished to true if there are no more. * * Item pointers must be returned in ascending order. */ static void -entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList) +entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList, Snapshot snapshot) { Assert(!entry->isFinished); @@ -1074,91 +1159,107 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList) do { - RumKey collected; - RumKey *current_collected; + RumScanItem collected; + RumScanItem *current_collected; /* We are finished, but should return last result */ - if (ItemPointerIsMax(&entry->collectRumKey.iptr)) + if (ItemPointerIsMax(&entry->collectRumItem.item.iptr)) { - entry->isFinished = TRUE; + entry->isFinished = true; rum_tuplesort_end(entry->matchSortstate); entry->matchSortstate = NULL; break; } - /* collectRumKey could store the begining of current result */ - if (!ItemPointerIsMin(&entry->collectRumKey.iptr)) - collected = entry->collectRumKey; + /* collectRumItem could store the begining of current result */ + if (!ItemPointerIsMin(&entry->collectRumItem.item.iptr)) + collected = entry->collectRumItem; else - ItemPointerSetMin(&collected.iptr); + MemSet(&collected, 0, sizeof(collected)); - ItemPointerSetMin(&entry->curRumKey.iptr); + ItemPointerSetMin(&entry->curItem.iptr); for(;;) { bool should_free; - current_collected = rum_tuplesort_getrumkey( + current_collected = rum_tuplesort_getrumitem( entry->matchSortstate, ScanDirectionIsForward(entry->scanDirection) ? 
true : false, &should_free); if (current_collected == NULL) { - entry->curRumKey = collected; + entry->curItem = collected.item; + if (entry->useCurKey) + { + entry->curKey = collected.keyValue; + entry->curKeyCategory = collected.keyCategory; + } break; } - if (ItemPointerIsMin(&collected.iptr) || - rumCompareItemPointers(&collected.iptr, - ¤t_collected->iptr) == 0) + if (ItemPointerIsMin(&collected.item.iptr) || + rumCompareItemPointers(&collected.item.iptr, + ¤t_collected->item.iptr) == 0) { Datum joinedAddInfo = (Datum)0; bool joinedAddInfoIsNull; - if (ItemPointerIsMin(&collected.iptr)) + if (ItemPointerIsMin(&collected.item.iptr)) { - joinedAddInfoIsNull = true; /* wiil change later */ - collected.addInfoIsNull = true; + joinedAddInfoIsNull = true; /* will change later */ + collected.item.addInfoIsNull = true; } else - joinedAddInfoIsNull = collected.addInfoIsNull || - current_collected->addInfoIsNull; + joinedAddInfoIsNull = collected.item.addInfoIsNull || + current_collected->item.addInfoIsNull; if (joinedAddInfoIsNull) { joinedAddInfoIsNull = - (collected.addInfoIsNull && current_collected->addInfoIsNull); + (collected.item.addInfoIsNull && + current_collected->item.addInfoIsNull); - if (collected.addInfoIsNull == false) - joinedAddInfo = collected.addInfo; - else if (current_collected->addInfoIsNull == false) - joinedAddInfo = current_collected->addInfo; + if (collected.item.addInfoIsNull == false) + joinedAddInfo = collected.item.addInfo; + else if (current_collected->item.addInfoIsNull == false) + joinedAddInfo = current_collected->item.addInfo; } else if (rumstate->canJoinAddInfo[entry->attnumOrig - 1]) { joinedAddInfo = FunctionCall2( &rumstate->joinAddInfoFn[entry->attnumOrig - 1], - collected.addInfo, - current_collected->addInfo); + collected.item.addInfo, + current_collected->item.addInfo); } else { - joinedAddInfo = current_collected->addInfo; + joinedAddInfo = current_collected->item.addInfo; } - collected.iptr = current_collected->iptr; - collected.addInfoIsNull = joinedAddInfoIsNull; - collected.addInfo = joinedAddInfo; + collected.item.iptr = current_collected->item.iptr; + collected.item.addInfoIsNull = joinedAddInfoIsNull; + collected.item.addInfo = joinedAddInfo; + if (entry->useCurKey) + { + collected.keyValue = current_collected->keyValue; + collected.keyCategory = current_collected->keyCategory; + } if (should_free) pfree(current_collected); } else { - entry->curRumKey = collected; - entry->collectRumKey = *current_collected; + entry->curItem = collected.item; + entry->collectRumItem = *current_collected; + if (entry->useCurKey) + { + entry->curKey = collected.keyValue; + entry->curKeyCategory = collected.keyCategory; + } if (should_free) pfree(current_collected); break; @@ -1168,12 +1269,12 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList) if (current_collected == NULL) { /* mark next call as last */ - ItemPointerSetMax(&entry->collectRumKey.iptr); + ItemPointerSetMax(&entry->collectRumItem.item.iptr); /* even current call is last */ - if (ItemPointerIsMin(&entry->curRumKey.iptr)) + if (ItemPointerIsMin(&entry->curItem.iptr)) { - entry->isFinished = TRUE; + entry->isFinished = true; rum_tuplesort_end(entry->matchSortstate); entry->matchSortstate = NULL; break; @@ -1185,33 +1286,34 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList) { if (entry->offset >= 0 && entry->offset < entry->nlist) { - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; entry->offset += 
entry->scanDirection; } else if (entry->stack) { entry->offset++; - if (entryGetNextItemList(rumstate, entry) && nextEntryList) + if (entryGetNextItemList(rumstate, entry, snapshot) && nextEntryList) *nextEntryList = true; } else { - ItemPointerSetInvalid(&entry->curRumKey.iptr); - entry->isFinished = TRUE; + ItemPointerSetInvalid(&entry->curItem.iptr); + entry->isFinished = true; } } + /* Get next item from posting tree */ else { do { - entryGetNextItem(rumstate, entry); - } while (entry->isFinished == FALSE && - entry->reduceResult == TRUE && + entryGetNextItem(rumstate, entry, snapshot); + } while (entry->isFinished == false && + entry->reduceResult == true && dropItem(entry)); if (entry->stack && entry->isFinished) { - entry->isFinished = FALSE; - if (entryGetNextItemList(rumstate, entry) && nextEntryList) + entry->isFinished = false; + if (entryGetNextItemList(rumstate, entry, snapshot) && nextEntryList) *nextEntryList = true; } } @@ -1226,37 +1328,36 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList) * TID passes the consistentFn test. If so, key->recheckCurItem is set true * iff recheck is needed for this item pointer * - * If all entry streams are exhausted, sets key->isFinished to TRUE. + * If all entry streams are exhausted, sets key->isFinished to true. * * Item pointers must be returned in ascending order. */ static int -compareRumKeyScanDirection(RumState *rumstate, AttrNumber attno, - ScanDirection scanDirection, - RumKey *a, RumKey *b) +compareRumItemScanDirection(RumState *rumstate, AttrNumber attno, + ScanDirection scanDirection, + RumItem *a, RumItem *b) { - int res = compareRumKey(rumstate, attno, a, b); + int res = compareRumItem(rumstate, attno, a, b); return (ScanDirectionIsForward(scanDirection)) ? res : -res; } static int -compareCurRumKeyScanDirection(RumState *rumstate, RumScanEntry entry, - RumKey *minItem) +compareCurRumItemScanDirection(RumState *rumstate, RumScanEntry entry, + RumItem *minItem) { - return compareRumKeyScanDirection(rumstate, - (entry->forceUseBitmap) ? 
- InvalidAttrNumber : entry->attnumOrig, + return compareRumItemScanDirection(rumstate, + entry->attnumOrig, entry->scanDirection, - &entry->curRumKey, minItem); + &entry->curItem, minItem); } static void keyGetItem(RumState * rumstate, MemoryContext tempCtx, RumScanKey key) { - RumKey minItem; + RumItem minItem; uint32 i; RumScanEntry entry; bool res; @@ -1278,9 +1379,9 @@ keyGetItem(RumState * rumstate, MemoryContext tempCtx, RumScanKey key) allFinished = false; if (minItemInited == false || - compareCurRumKeyScanDirection(rumstate, entry, &minItem) < 0) + compareCurRumItemScanDirection(rumstate, entry, &minItem) < 0) { - minItem = entry->curRumKey; + minItem = entry->curItem; minItemInited = true; } } @@ -1289,7 +1390,7 @@ keyGetItem(RumState * rumstate, MemoryContext tempCtx, RumScanKey key) if (allFinished) { /* all entries are finished */ - key->isFinished = TRUE; + key->isFinished = true; return; } @@ -1313,16 +1414,16 @@ keyGetItem(RumState * rumstate, MemoryContext tempCtx, RumScanKey key) for (i = 0; i < key->nentries; i++) { entry = key->scanEntry[i]; - if (entry->isFinished == FALSE && - rumCompareItemPointers(&entry->curRumKey.iptr, &key->curItem.iptr) == 0) + if (entry->isFinished == false && + rumCompareItemPointers(&entry->curItem.iptr, &key->curItem.iptr) == 0) { - key->entryRes[i] = TRUE; - key->addInfo[i] = entry->curRumKey.addInfo; - key->addInfoIsNull[i] = entry->curRumKey.addInfoIsNull; + key->entryRes[i] = true; + key->addInfo[i] = entry->curItem.addInfo; + key->addInfoIsNull[i] = entry->curItem.addInfoIsNull; } else { - key->entryRes[i] = FALSE; + key->entryRes[i] = false; key->addInfo[i] = (Datum) 0; key->addInfoIsNull[i] = true; } @@ -1347,12 +1448,12 @@ keyGetItem(RumState * rumstate, MemoryContext tempCtx, RumScanKey key) * keyGetItem() the combination logic is known only to the consistentFn. */ static bool -scanGetItemRegular(IndexScanDesc scan, RumKey *advancePast, - RumKey *item, bool *recheck) +scanGetItemRegular(IndexScanDesc scan, RumItem *advancePast, + RumItem *item, bool *recheck) { RumScanOpaque so = (RumScanOpaque) scan->opaque; RumState *rumstate = &so->rumstate; - RumKey myAdvancePast = *advancePast; + RumItem myAdvancePast = *advancePast; uint32 i; bool allFinished; bool match, itemSet; @@ -1364,18 +1465,18 @@ scanGetItemRegular(IndexScanDesc scan, RumKey *advancePast, * scan direction. 
On first call myAdvancePast is invalid, * so anyway we are needed to call entryGetItem() */ - allFinished = TRUE; + allFinished = true; for (i = 0; i < so->totalentries; i++) { RumScanEntry entry = so->entries[i]; - while (entry->isFinished == FALSE && + while (entry->isFinished == false && (!ItemPointerIsValid(&myAdvancePast.iptr) || - compareCurRumKeyScanDirection(rumstate, entry, + compareCurRumItemScanDirection(rumstate, entry, &myAdvancePast) <= 0)) { - entryGetItem(rumstate, entry, NULL); + entryGetItem(rumstate, entry, NULL, scan->xs_snapshot); if (!ItemPointerIsValid(&myAdvancePast.iptr)) break; @@ -1417,8 +1518,8 @@ scanGetItemRegular(IndexScanDesc scan, RumKey *advancePast, *item = key->curItem; itemSet = true; } - cmp = compareRumKey(rumstate, key->attnumOrig, - &key->curItem, item); + cmp = compareRumItem(rumstate, key->attnumOrig, + &key->curItem, item); if ((ScanDirectionIsForward(key->scanDirection) && cmp < 0) || (ScanDirectionIsBackward(key->scanDirection) && cmp > 0)) *item = key->curItem; @@ -1442,6 +1543,7 @@ scanGetItemRegular(IndexScanDesc scan, RumKey *advancePast, continue; } match = false; + break; } if (match) @@ -1463,9 +1565,23 @@ scanGetItemRegular(IndexScanDesc scan, RumKey *advancePast, RumScanKey key = so->keys[i]; if (key->orderBy) - continue; + { + int j; + + /* Catch up order key with *item */ + for (j = 0; j < key->nentries; j++) + { + RumScanEntry entry = key->scanEntry[j]; - if (key->recheckCurItem) + while (entry->isFinished == false && + compareRumItem(rumstate, key->attnumOrig, + &entry->curItem, item) < 0) + { + entryGetItem(rumstate, entry, NULL, scan->xs_snapshot); + } + } + } + else if (key->recheckCurItem) { *recheck = true; break; @@ -1480,11 +1596,10 @@ scanGetItemRegular(IndexScanDesc scan, RumKey *advancePast, * of page. 
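 *
 * Roughly (a simplified summary of the code below, nothing new): the
 * page keeps a small index of items at its end; the scan first consults
 * those entries to pick the sub-range that can contain 'item' (items on
 * the page are ordered), then decodes that sub-range sequentially with
 * rumDataPageLeafRead(), since leaf items are not randomly addressable.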
*/ static bool -scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, - bool equalOk) +scanPage(RumState * rumstate, RumScanEntry entry, RumItem *item, bool equalOk) { int j; - RumKey iter_item; + RumItem iter_item; Pointer ptr; OffsetNumber first = FirstOffsetNumber, i, @@ -1498,8 +1613,8 @@ scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, if (ScanDirectionIsForward(entry->scanDirection) && !RumPageRightMost(page)) { - cmp = compareRumKey(rumstate, entry->attnumOrig, - RumDataPageGetRightBound(page), item); + cmp = compareRumItem(rumstate, entry->attnumOrig, + RumDataPageGetRightBound(page), item); if (cmp < 0 || (cmp <= 0 && !equalOk)) return false; } @@ -1516,11 +1631,10 @@ scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, if (rumstate->useAlternativeOrder) { - RumKey k; + RumItem k; convertIndexToKey(index, &k); - cmp = compareRumKey(rumstate, entry->attnumOrig, - &k, item); + cmp = compareRumItem(rumstate, entry->attnumOrig, &k, item); } else cmp = rumCompareItemPointers(&index->iptr, &item->iptr); @@ -1555,14 +1669,15 @@ scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, bound = -1; for (i = first; i <= maxoff; i++) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &iter_item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &iter_item, true, + rumstate); entry->list[i - first] = iter_item; if (bound != -1) continue; - cmp = compareRumKey(rumstate, entry->attnumOrig, - item, &iter_item); + cmp = compareRumItem(rumstate, entry->attnumOrig, + item, &iter_item); if (cmp <= 0) { @@ -1598,7 +1713,7 @@ scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, return false; end: - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; entry->offset += entry->scanDirection; return true; } @@ -1608,36 +1723,38 @@ scanPage(RumState * rumstate, RumScanEntry entry, RumKey *item, */ static void -entryFindItem(RumState * rumstate, RumScanEntry entry, RumKey * item) +entryFindItem(RumState * rumstate, RumScanEntry entry, RumItem * item, Snapshot snapshot) { if (entry->nlist == 0) { - entry->isFinished = TRUE; + entry->isFinished = true; return; } - Assert(!entry->forceUseBitmap); - /* Try to find in loaded part of page */ if ((ScanDirectionIsForward(entry->scanDirection) && - compareRumKey(rumstate, entry->attnumOrig, - &entry->list[entry->nlist - 1], item) >= 0) || + compareRumItem(rumstate, entry->attnumOrig, + &entry->list[entry->nlist - 1], item) >= 0) || (ScanDirectionIsBackward(entry->scanDirection) && - compareRumKey(rumstate, entry->attnumOrig, - &entry->list[0], item) <= 0)) + compareRumItem(rumstate, entry->attnumOrig, + &entry->list[0], item) <= 0)) { - if (compareRumKeyScanDirection(rumstate, entry->attnumOrig, + if (compareRumItemScanDirection(rumstate, entry->attnumOrig, entry->scanDirection, - &entry->curRumKey, item) >= 0) + &entry->curItem, item) >= 0 && + entry->offset >= 0 && + entry->offset < entry->nlist && + rumCompareItemPointers(&entry->curItem.iptr, + &entry->list[entry->offset].iptr) == 0) return; while (entry->offset >= 0 && entry->offset < entry->nlist) { - if (compareRumKeyScanDirection(rumstate, entry->attnumOrig, + if (compareRumItemScanDirection(rumstate, entry->attnumOrig, entry->scanDirection, &entry->list[entry->offset], item) >= 0) { - entry->curRumKey = entry->list[entry->offset]; + entry->curItem = entry->list[entry->offset]; entry->offset += entry->scanDirection; return; } @@ -1647,13 +1764,15 @@ entryFindItem(RumState * rumstate, RumScanEntry entry, 
RumKey * item) if (!BufferIsValid(entry->buffer)) { - entry->isFinished = TRUE; + entry->isFinished = true; return; } /* Check rest of page */ LockBuffer(entry->buffer, RUM_SHARE); + PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot); + if (scanPage(rumstate, entry, item, true)) { LockBuffer(entry->buffer, RUM_UNLOCK); @@ -1669,6 +1788,8 @@ entryFindItem(RumState * rumstate, RumScanEntry entry, RumKey * item) entry->gdi->stack = rumReFindLeafPage(&entry->gdi->btree, entry->gdi->stack); entry->buffer = entry->gdi->stack->buffer; + PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot); + if (scanPage(rumstate, entry, item, true)) { LockBuffer(entry->buffer, RUM_UNLOCK); @@ -1684,11 +1805,13 @@ entryFindItem(RumState * rumstate, RumScanEntry entry, RumKey * item) if (entry->buffer == InvalidBuffer) { - ItemPointerSetInvalid(&entry->curRumKey.iptr); - entry->isFinished = TRUE; + ItemPointerSetInvalid(&entry->curItem.iptr); + entry->isFinished = true; return; } + PredicateLockPage(rumstate->index, BufferGetBlockNumber(entry->buffer), snapshot); + entry->gdi->stack->blkno = BufferGetBlockNumber(entry->buffer); if (scanPage(rumstate, entry, item, true)) @@ -1758,7 +1881,7 @@ preConsistentCheck(RumScanOpaque so) * to i. */ static void -entryShift(int i, RumScanOpaque so, bool find) +entryShift(int i, RumScanOpaque so, bool find, Snapshot snapshot) { int minIndex = -1, j; @@ -1782,9 +1905,9 @@ entryShift(int i, RumScanOpaque so, bool find) /* Do shift of required type */ if (find) entryFindItem(rumstate, so->sortedEntries[minIndex], - &so->sortedEntries[i - 1]->curRumKey); + &so->sortedEntries[i - 1]->curItem, snapshot); else if (!so->sortedEntries[minIndex]->isFinished) - entryGetItem(rumstate, so->sortedEntries[minIndex], NULL); + entryGetItem(rumstate, so->sortedEntries[minIndex], NULL, snapshot); /* Restore order of so->sortedEntries */ while (minIndex > 0 && @@ -1804,8 +1927,8 @@ entryShift(int i, RumScanOpaque so, bool find) * Get next item pointer using fast scan. */ static bool -scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, - RumKey *item, bool *recheck) +scanGetItemFast(IndexScanDesc scan, RumItem *advancePast, + RumItem *item, bool *recheck) { RumScanOpaque so = (RumScanOpaque) scan->opaque; int i, @@ -1817,7 +1940,7 @@ scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, if (so->entriesIncrIndex >= 0) { for (k = so->entriesIncrIndex; k < so->totalentries; k++) - entryShift(k, so, false); + entryShift(k, so, false, scan->xs_snapshot); } for (;;) @@ -1848,12 +1971,12 @@ scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, * If we found false in preConsistent then we can safely move entries * which was true in preConsistent argument. 
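 *
 * An illustrative example: suppose two entries, sorted by current item,
 * stand at A.curItem = (10,1) and B.curItem = (42,7). If preConsistent
 * already returns false for the combination in which only the lagging
 * entries such as A are assumed to match, then no item before (42,7)
 * can produce a match, so A may jump directly to B's position
 * (entryShift with find = true) instead of stepping one item at a time.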
*/ - if (so->sortedEntries[i - 1]->isFinished == TRUE) + if (so->sortedEntries[i - 1]->isFinished == true) return false; if (preConsistentResult == false) { - entryShift(i, so, true); + entryShift(i, so, true, scan->xs_snapshot); continue; } @@ -1870,17 +1993,17 @@ scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, { RumScanEntry entry = key->scanEntry[j]; - if (entry->isFinished == FALSE && - rumCompareItemPointers(&entry->curRumKey.iptr, - &so->sortedEntries[so->totalentries - 1]->curRumKey.iptr) == 0) + if (entry->isFinished == false && + rumCompareItemPointers(&entry->curItem.iptr, + &so->sortedEntries[so->totalentries - 1]->curItem.iptr) == 0) { - key->entryRes[j] = TRUE; - key->addInfo[j] = entry->curRumKey.addInfo; - key->addInfoIsNull[j] = entry->curRumKey.addInfoIsNull; + key->entryRes[j] = true; + key->addInfo[j] = entry->curItem.addInfo; + key->addInfoIsNull[j] = entry->curItem.addInfoIsNull; } else { - key->entryRes[j] = FALSE; + key->entryRes[j] = false; key->addInfo[j] = (Datum) 0; key->addInfoIsNull[j] = true; } @@ -1890,7 +2013,7 @@ scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, { consistentResult = false; for (j = k; j < so->totalentries; j++) - entryShift(j, so, false); + entryShift(j, so, false, scan->xs_snapshot); continue; } } @@ -1914,7 +2037,7 @@ scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, } } - *item = so->sortedEntries[so->totalentries - 1]->curRumKey; + *item = so->sortedEntries[so->totalentries - 1]->curItem; so->entriesIncrIndex = k; return true; @@ -1928,8 +2051,8 @@ scanGetItemFast(IndexScanDesc scan, RumKey *advancePast, * First key is used to full scan, other keys are only used for ranking. */ static bool -scanGetItemFull(IndexScanDesc scan, RumKey *advancePast, - RumKey *item, bool *recheck) +scanGetItemFull(IndexScanDesc scan, RumItem *advancePast, + RumItem *item, bool *recheck) { RumScanOpaque so = (RumScanOpaque) scan->opaque; RumScanKey key; @@ -1949,15 +2072,19 @@ scanGetItemFull(IndexScanDesc scan, RumKey *advancePast, */ entry = so->entries[0]; - entryGetItem(&so->rumstate, entry, &nextEntryList); - if (entry->isFinished == TRUE) + if (entry->isFinished) + return false; + + entryGetItem(&so->rumstate, entry, &nextEntryList, scan->xs_snapshot); + + if (entry->isFinished) return false; - /* Fill outerAddInfo using callConstistentFn() */ - key->entryRes[0] = TRUE; - key->addInfo[0] = entry->curRumKey.addInfo; - key->addInfoIsNull[0] = entry->curRumKey.addInfoIsNull; - callConsistentFn(&so->rumstate, key); + /* Fill outerAddInfo */ + key->entryRes[0] = true; + key->addInfo[0] = entry->curItem.addInfo; + key->addInfoIsNull[0] = entry->curItem.addInfoIsNull; + callAddInfoConsistentFn(&so->rumstate, key); /* Move related order by entries */ if (nextEntryList) @@ -1966,9 +2093,9 @@ scanGetItemFull(IndexScanDesc scan, RumKey *advancePast, RumScanEntry orderEntry = so->entries[i]; if (orderEntry->nlist > 0) { - orderEntry->isFinished = FALSE; + orderEntry->isFinished = false; orderEntry->offset = InvalidOffsetNumber; - RumItemSetMin(&orderEntry->curRumKey); + RumItemSetMin(&orderEntry->curItem); } } @@ -1976,14 +2103,14 @@ scanGetItemFull(IndexScanDesc scan, RumKey *advancePast, { RumScanEntry orderEntry = so->entries[i]; - while (orderEntry->isFinished == FALSE && - (!ItemPointerIsValid(&orderEntry->curRumKey.iptr) || - compareCurRumKeyScanDirection(&so->rumstate, orderEntry, - &entry->curRumKey) < 0)) - entryGetItem(&so->rumstate, orderEntry, NULL); + while (orderEntry->isFinished == false && + 
(!ItemPointerIsValid(&orderEntry->curItem.iptr) || + compareCurRumItemScanDirection(&so->rumstate, orderEntry, + &entry->curItem) < 0)) + entryGetItem(&so->rumstate, orderEntry, NULL, scan->xs_snapshot); } - *item = entry->curRumKey; + *item = entry->curItem; *recheck = false; return true; } @@ -1992,8 +2119,8 @@ scanGetItemFull(IndexScanDesc scan, RumKey *advancePast, * Get next item whether using regular or fast scan. */ static bool -scanGetItem(IndexScanDesc scan, RumKey *advancePast, - RumKey *item, bool *recheck) +scanGetItem(IndexScanDesc scan, RumItem *advancePast, + RumItem *item, bool *recheck) { RumScanOpaque so = (RumScanOpaque) scan->opaque; @@ -2014,6 +2141,7 @@ rumgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) RumScanOpaque so = (RumScanOpaque) scan->opaque; int64 ntids = 0; bool recheck; + RumItem item; /* * Set up the scan keys, and check for unsatisfiable query. @@ -2026,17 +2154,6 @@ rumgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) ntids = 0; - /* - * First, scan the pending list and collect any matching entries into the - * bitmap. After we scan a pending item, some other backend could post it - * into the main index, and so we might visit it a second time during the - * main scan. This is okay because we'll just re-set the same bit in the - * bitmap. (The possibility of duplicate visits is a major reason why RUM - * can't support the amgettuple API, however.) Note that it would not do - * to scan the main index before the pending list, since concurrent - * cleanup could then make us miss entries entirely. - */ - so->tbm = tbm; so->entriesIncrIndex = -1; /* @@ -2044,14 +2161,16 @@ rumgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) */ startScan(scan); + ItemPointerSetInvalid(&item.iptr); + for (;;) { CHECK_FOR_INTERRUPTS(); - if (!scanGetItem(scan, &so->key, &so->key, &recheck)) + if (!scanGetItem(scan, &item, &item, &recheck)) break; - tbm_add_tuples(tbm, &so->key.iptr, 1, recheck); + tbm_add_tuples(tbm, &item.iptr, 1, recheck); ntids++; } @@ -2074,21 +2193,36 @@ keyGetOrdering(RumState * rumstate, MemoryContext tempCtx, RumScanKey key, return get_float8_infinity(); return DatumGetFloat8(FunctionCall3( - &rumstate->outerOrderingFn[rumstate->attrnOrderByColumn - 1], + &rumstate->outerOrderingFn[rumstate->attrnAttachColumn - 1], key->outerAddInfo, key->queryValues[0], UInt16GetDatum(key->strategy) )); } + else if (key->useCurKey) + { + Assert(key->nentries == 0); + Assert(key->nuserentries == 0); + + if (key->curKeyCategory != RUM_CAT_NORM_KEY) + return get_float8_infinity(); + + return DatumGetFloat8(FunctionCall3( + &rumstate->orderingFn[key->attnum - 1], + key->curKey, + key->query, + UInt16GetDatum(key->strategy) + )); + } for (i = 0; i < key->nentries; i++) { entry = key->scanEntry[i]; - if (entry->isFinished == FALSE && - rumCompareItemPointers(&entry->curRumKey.iptr, iptr) == 0) + if (entry->isFinished == false && + rumCompareItemPointers(&entry->curItem.iptr, iptr) == 0) { - key->addInfo[i] = entry->curRumKey.addInfo; - key->addInfoIsNull[i] = entry->curRumKey.addInfoIsNull; + key->addInfo[i] = entry->curItem.addInfo; + key->addInfoIsNull[i] = entry->curItem.addInfoIsNull; key->entryRes[i] = true; } else @@ -2122,15 +2256,17 @@ insertScanItem(RumScanOpaque so, bool recheck) j; item = (RumSortItem *) - MemoryContextAlloc(rum_tuplesort_get_memorycontext(so->sortstate), - RumSortItemSize(so->norderbys)); - item->iptr = so->key.iptr; + MemoryContextAllocZero(rum_tuplesort_get_memorycontext(so->sortstate), + RumSortItemSize(so->norderbys)); + item->iptr = so->item.iptr; 
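    /*
     * Note: the sort item now comes from MemoryContextAllocZero (see
     * above), so padding bytes and any data[] slots left unfilled below
     * are well-defined zeroes by the time the item enters the tuplesort;
     * presumably that is the point of the Zero variant.
     */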
item->recheck = recheck; - if (AttributeNumberIsValid(so->rumstate.attrnAddToColumn)) + if (AttributeNumberIsValid(so->rumstate.attrnAddToColumn) || so->willSort) { int nOrderByAnother = 0, - count = 0; + nOrderByKey = 0, + countByAnother = 0, + countByKey = 0; for (i = 0; i < so->nkeys; i++) { @@ -2139,11 +2275,15 @@ insertScanItem(RumScanOpaque so, bool recheck) so->keys[i]->outerAddInfoIsNull = true; nOrderByAnother++; } + else if (so->keys[i]->useCurKey) + nOrderByKey++; } - for (i = 0; count < nOrderByAnother && i < so->nkeys; i++) + for (i = 0; (countByAnother < nOrderByAnother || countByKey < nOrderByKey) && + i < so->nkeys; i++) { - if (so->keys[i]->attnum == so->rumstate.attrnAddToColumn && + if (countByAnother < nOrderByAnother && + so->keys[i]->attnum == so->rumstate.attrnAddToColumn && so->keys[i]->outerAddInfoIsNull == false) { Assert(!so->keys[i]->orderBy); @@ -2156,7 +2296,23 @@ insertScanItem(RumScanOpaque so, bool recheck) { so->keys[j]->outerAddInfoIsNull = false; so->keys[j]->outerAddInfo = so->keys[i]->outerAddInfo; - count++; + countByAnother++; + } + } + } + else if (countByKey < nOrderByKey && so->keys[i]->nentries > 0 && + so->keys[i]->scanEntry[0]->useCurKey) + { + Assert(!so->keys[i]->orderBy); + + for (j = i + 1; j < so->nkeys; j++) + { + if (so->keys[j]->useCurKey) + { + so->keys[j]->curKey = so->keys[i]->scanEntry[0]->curKey; + so->keys[j]->curKeyCategory = + so->keys[i]->scanEntry[0]->curKeyCategory; + countByKey++; } } } @@ -2170,14 +2326,7 @@ insertScanItem(RumScanOpaque so, bool recheck) continue; item->data[j] = keyGetOrdering(&so->rumstate, so->tempCtx, so->keys[i], - &so->key.iptr); - -#if 0 - elog(NOTICE, "%f %u:%u", item->data[j], - RumItemPointerGetBlockNumber(&item->iptr), - RumItemPointerGetOffsetNumber(&item->iptr)); -#endif - + &so->item.iptr); j++; } @@ -2218,6 +2367,14 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) RumSortItem *item; bool should_free; +#if PG_VERSION_NUM >= 120000 +#define GET_SCAN_TID(scan) ((scan)->xs_heaptid) +#define SET_SCAN_TID(scan, tid) ((scan)->xs_heaptid = (tid)) +#else +#define GET_SCAN_TID(scan) ((scan)->xs_ctup.t_self) +#define SET_SCAN_TID(scan, tid) ((scan)->xs_ctup.t_self = (tid)) +#endif + if (so->firstCall) { /* @@ -2227,6 +2384,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) rumNewScanKey(scan); so->firstCall = false; + ItemPointerSetInvalid(&GET_SCAN_TID(scan)); if (RumIsVoidRes(scan)) return false; @@ -2235,13 +2393,10 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) if (so->naturalOrder == NoMovementScanDirection) { so->sortstate = rum_tuplesort_begin_rum(work_mem, so->norderbys, - false, - so->totalentries > 0 && - so->entries[0]->queryCategory == RUM_CAT_EMPTY_QUERY && - so->entries[0]->scanWithAddInfo); + false, so->scanType == RumFullScan); - while (scanGetItem(scan, &so->key, &so->key, &recheck)) + while (scanGetItem(scan, &so->item, &so->item, &recheck)) { insertScanItem(so, recheck); } @@ -2251,9 +2406,9 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) if (so->naturalOrder != NoMovementScanDirection) { - if (scanGetItem(scan, &so->key, &so->key, &recheck)) + if (scanGetItem(scan, &so->item, &so->item, &recheck)) { - scan->xs_ctup.t_self = so->key.iptr; + SET_SCAN_TID(scan, so->item.iptr); scan->xs_recheck = recheck; scan->xs_recheckorderby = false; @@ -2275,7 +2430,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) uint32 i, j = 0; - if (rumCompareItemPointers(&scan->xs_ctup.t_self, &item->iptr) == 0) + if 
(rumCompareItemPointers(&GET_SCAN_TID(scan), &item->iptr) == 0) { if (should_free) pfree(item); @@ -2283,7 +2438,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) continue; } - scan->xs_ctup.t_self = item->iptr; + SET_SCAN_TID(scan, item->iptr); scan->xs_recheck = item->recheck; scan->xs_recheckorderby = false; diff --git a/src/ruminsert.c b/src/ruminsert.c index 23cb99541c..255e616c99 100644 --- a/src/ruminsert.c +++ b/src/ruminsert.c @@ -4,7 +4,7 @@ * insert routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,9 +14,12 @@ #include "postgres.h" #include "access/generic_xlog.h" +#if PG_VERSION_NUM >= 120000 +#include "access/tableam.h" +#endif +#include "storage/predicate.h" #include "catalog/index.h" #include "miscadmin.h" -#include "utils/memutils.h" #include "utils/datum.h" #include "rum.h" @@ -31,6 +34,15 @@ typedef struct BuildAccumulator accum; } RumBuildState; + +#if PG_VERSION_NUM >= 120000 +#define IndexBuildHeapScan(A, B, C, D, E, F) \ +table_index_build_scan(A, B, C, D, true, E, F, NULL) +#elif PG_VERSION_NUM >= 110000 +#define IndexBuildHeapScan(A, B, C, D, E, F) \ +IndexBuildHeapScan(A, B, C, D, E, F, NULL) +#endif + /* * Creates new posting tree with one page, containing the given TIDs. * Returns the page number (which will be the root of this posting tree). @@ -39,7 +51,7 @@ typedef struct */ static BlockNumber createPostingTree(RumState * rumstate, OffsetNumber attnum, Relation index, - RumKey * items, uint32 nitems) + RumItem * items, uint32 nitems) { BlockNumber blkno; Buffer buffer = RumNewBuffer(index); @@ -92,21 +104,21 @@ createPostingTree(RumState * rumstate, OffsetNumber attnum, Relation index, * Form a tuple for entry tree. * * If the tuple would be too big to be stored, function throws a suitable - * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE. + * error if errorTooBig is true, or returns NULL if errorTooBig is false. * * See src/backend/access/gin/README for a description of the index tuple * format that is being built here. We build on the assumption that we * are making a leaf-level key entry containing a posting list of nipd items. * If the caller is actually trying to make a posting-tree entry, non-leaf * entry, or pending-list entry, it should pass nipd = 0 and then overwrite - * the t_tid fields as necessary. In any case, ipd can be NULL to skip + * the t_tid fields as necessary. In any case, items can be NULL to skip * copying any itempointers into the posting list; the caller is responsible - * for filling the posting list afterwards, if ipd = NULL and nipd > 0. + * for filling the posting list afterwards, if items = NULL and nipd > 0. 
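 *
 * For instance (an illustrative call, following the convention above):
 *     itup = RumFormTuple(rumstate, attnum, key, category, NULL, 0, true);
 * builds an entry tuple with an empty posting list, after which the
 * caller overwrites the t_tid fields, e.g. to point at a posting-tree
 * root.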
*/ static IndexTuple RumFormTuple(RumState * rumstate, OffsetNumber attnum, Datum key, RumNullCategory category, - RumKey * items, uint32 nipd, bool errorTooBig) + RumItem * items, uint32 nipd, bool errorTooBig) { Datum datums[3]; bool isnull[3]; @@ -195,6 +207,8 @@ RumFormTuple(RumState * rumstate, { itup = repalloc(itup, newsize); + memset((char *) itup + IndexTupleSize(itup), + 0, newsize - IndexTupleSize(itup)); /* set new size in tuple header */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; @@ -241,14 +255,14 @@ RumFormTuple(RumState * rumstate, */ static IndexTuple addItemPointersToLeafTuple(RumState * rumstate, - IndexTuple old, RumKey * items, uint32 nitem, + IndexTuple old, RumItem * items, uint32 nitem, GinStatsData *buildStats) { OffsetNumber attnum; Datum key; RumNullCategory category; IndexTuple res; - RumKey *newItems, + RumItem *newItems, *oldItems; int oldNPosting, newNPosting; @@ -259,15 +273,15 @@ addItemPointersToLeafTuple(RumState * rumstate, key = rumtuple_get_key(rumstate, old, &category); oldNPosting = RumGetNPosting(old); - oldItems = (RumKey *) palloc(sizeof(RumKey) * oldNPosting); + oldItems = (RumItem *) palloc(sizeof(RumItem) * oldNPosting); newNPosting = oldNPosting + nitem; - newItems = (RumKey *) palloc(sizeof(RumKey) * newNPosting); + newItems = (RumItem *) palloc(sizeof(RumItem) * newNPosting); - rumReadTuple(rumstate, attnum, old, oldItems); + rumReadTuple(rumstate, attnum, old, oldItems, false); - newNPosting = rumMergeItemPointers(rumstate, attnum, newItems, - items, nitem, oldItems, oldNPosting); + newNPosting = rumMergeRumItems(rumstate, attnum, newItems, + items, nitem, oldItems, oldNPosting); /* try to build tuple with room for all the items */ @@ -296,7 +310,7 @@ addItemPointersToLeafTuple(RumState * rumstate, buildStats->nDataPages++; /* Now insert the TIDs-to-be-added into the posting tree */ - gdi = rumPrepareScanPostingTree(rumstate->index, postingRoot, FALSE, + gdi = rumPrepareScanPostingTree(rumstate->index, postingRoot, false, ForwardScanDirection, attnum, rumstate); rumInsertItemPointers(rumstate, attnum, gdi, items, nitem, buildStats); @@ -321,7 +335,7 @@ addItemPointersToLeafTuple(RumState * rumstate, static IndexTuple buildFreshLeafTuple(RumState * rumstate, OffsetNumber attnum, Datum key, RumNullCategory category, - RumKey * items, uint32 nitem, GinStatsData *buildStats) + RumItem * items, uint32 nitem, GinStatsData *buildStats) { IndexTuple res; @@ -348,7 +362,6 @@ buildFreshLeafTuple(RumState * rumstate, if (size >= RumDataPageSize) itemsCount--; - /* * Build posting-tree-only result tuple. We do this first so as to * fail quickly if the key is too big. 
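
The memset added after the repalloc in RumFormTuple zeroes the newly allocated tail of the tuple, so alignment padding carries deterministic bytes rather than whatever repalloc left behind. The general pattern as a minimal libc sketch (grow_zeroed is a hypothetical name, not a RUM function):

#include <stdlib.h>
#include <string.h>

/* Grow buf from oldsize to newsize and zero the new tail, so that no
 * uninitialized bytes can later be written out as part of the buffer. */
static void *
grow_zeroed(void *buf, size_t oldsize, size_t newsize)
{
    void *tmp = realloc(buf, newsize);

    if (tmp != NULL && newsize > oldsize)
        memset((char *) tmp + oldsize, 0, newsize - oldsize);
    return tmp;
}

int main(void)
{
    char *p = grow_zeroed(NULL, 0, 16);   /* realloc(NULL, n) acts as malloc */
    p = grow_zeroed(p, 16, 64);           /* bytes 16..63 are now zeroed */
    free(p);
    return 0;
}
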
@@ -374,7 +387,7 @@ buildFreshLeafTuple(RumState * rumstate, { RumPostingTreeScan *gdi; - gdi = rumPrepareScanPostingTree(rumstate->index, postingRoot, FALSE, + gdi = rumPrepareScanPostingTree(rumstate->index, postingRoot, false, ForwardScanDirection, attnum, rumstate); @@ -405,7 +418,7 @@ buildFreshLeafTuple(RumState * rumstate, void rumEntryInsert(RumState * rumstate, OffsetNumber attnum, Datum key, RumNullCategory category, - RumKey * items, uint32 nitem, + RumItem * items, uint32 nitem, GinStatsData *buildStats) { RumBtreeData btree; @@ -422,6 +435,8 @@ rumEntryInsert(RumState * rumstate, stack = rumFindLeafPage(&btree, NULL); page = BufferGetPage(stack->buffer); + CheckForSerializableConflictIn(btree.index, NULL, stack->buffer); + if (btree.findItem(&btree, stack)) { /* found pre-existing entry */ @@ -439,7 +454,7 @@ rumEntryInsert(RumState * rumstate, /* insert into posting tree */ gdi = rumPrepareScanPostingTree(rumstate->index, rootPostingTree, - FALSE, ForwardScanDirection, + false, ForwardScanDirection, attnum, rumstate); rumInsertItemPointers(rumstate, attnum, gdi, items, nitem, buildStats); @@ -452,7 +467,7 @@ rumEntryInsert(RumState * rumstate, itup = addItemPointersToLeafTuple(rumstate, itup, items, nitem, buildStats); - btree.isDelete = TRUE; + btree.isDelete = true; } else { @@ -514,8 +529,13 @@ rumHeapTupleBulkInsert(RumBuildState * buildstate, OffsetNumber attnum, { /* Check existance of additional information attribute in index */ if (!attr) + { + Form_pg_attribute current_attr = RumTupleDescAttr( + buildstate->rumstate.origTupdesc, attnum - 1); + elog(ERROR, "additional information attribute \"%s\" is not found in index", - NameStr(buildstate->rumstate.origTupdesc->attrs[attnum - 1]->attname)); + NameStr(current_attr->attname)); + } addInfo[i] = datumCopy(addInfo[i], attr->attbyval, attr->attlen); } @@ -530,7 +550,13 @@ rumHeapTupleBulkInsert(RumBuildState * buildstate, OffsetNumber attnum, } static void -rumBuildCallback(Relation index, HeapTuple htup, Datum *values, +rumBuildCallback(Relation index, +#if PG_VERSION_NUM < 130000 + HeapTuple htup, +#else + ItemPointer tid, +#endif + Datum *values, bool *isnull, bool tupleIsAlive, void *state) { RumBuildState *buildstate = (RumBuildState *) state; @@ -538,11 +564,14 @@ rumBuildCallback(Relation index, HeapTuple htup, Datum *values, int i; Datum outerAddInfo = (Datum) 0; bool outerAddInfoIsNull = true; +#if PG_VERSION_NUM < 130000 + ItemPointer tid = &htup->t_self; +#endif - if (AttributeNumberIsValid(buildstate->rumstate.attrnOrderByColumn)) + if (AttributeNumberIsValid(buildstate->rumstate.attrnAttachColumn)) { - outerAddInfo = values[buildstate->rumstate.attrnOrderByColumn - 1]; - outerAddInfoIsNull = isnull[buildstate->rumstate.attrnOrderByColumn - 1]; + outerAddInfo = values[buildstate->rumstate.attrnAttachColumn - 1]; + outerAddInfoIsNull = isnull[buildstate->rumstate.attrnAttachColumn - 1]; } oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); @@ -550,13 +579,13 @@ rumBuildCallback(Relation index, HeapTuple htup, Datum *values, for (i = 0; i < buildstate->rumstate.origTupdesc->natts; i++) rumHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1), values[i], isnull[i], - &htup->t_self, + tid, outerAddInfo, outerAddInfoIsNull); /* If we've maxed out our available memory, dump everything to the index */ if (buildstate->accum.allocatedMemory >= maintenance_work_mem * 1024L) { - RumKey *items; + RumItem *items; Datum key; RumNullCategory category; uint32 nlist; @@ -587,7 +616,7 @@ rumbuild(Relation heap, Relation index, 
struct IndexInfo *indexInfo) RumBuildState buildstate; Buffer RootBuffer, MetaBuffer; - RumKey *items; + RumItem *items; Datum key; RumNullCategory category; uint32 nlist; @@ -626,17 +655,11 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) * create a temporary memory context that is reset once for each tuple * inserted into the index */ - buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, - "Rum build temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - - buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, - "Rum build temporary context for user-defined function", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + buildstate.tmpCtx = RumContextCreate(CurrentMemoryContext, + "Rum build temporary context"); + + buildstate.funcCtx = RumContextCreate(CurrentMemoryContext, + "Rum build temporary context for user-defined function"); buildstate.accum.rumstate = &buildstate.rumstate; rumInitBA(&buildstate.accum); @@ -769,13 +792,19 @@ rumHeapTupleInsert(RumState * rumstate, OffsetNumber attnum, for (i = 0; i < nentries; i++) { - RumKey insert_item; + RumItem insert_item; /* Check existance of additional information attribute in index */ if (!addInfoIsNull[i] && !rumstate->addAttrs[attnum - 1]) + { + Form_pg_attribute attr = RumTupleDescAttr(rumstate->origTupdesc, + attnum - 1); + elog(ERROR, "additional information attribute \"%s\" is not found in index", - NameStr(rumstate->origTupdesc->attrs[attnum - 1]->attname)); + NameStr(attr->attname)); + } + memset(&insert_item, 0, sizeof(insert_item)); insert_item.iptr = *item; insert_item.addInfo = addInfo[i]; insert_item.addInfoIsNull = addInfoIsNull[i]; @@ -789,6 +818,9 @@ bool ruminsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique +#if PG_VERSION_NUM >= 140000 + , bool indexUnchanged +#endif #if PG_VERSION_NUM >= 100000 , struct IndexInfo *indexInfo #endif @@ -801,20 +833,17 @@ ruminsert(Relation index, Datum *values, bool *isnull, Datum outerAddInfo = (Datum) 0; bool outerAddInfoIsNull = true; - insertCtx = AllocSetContextCreate(CurrentMemoryContext, - "Rum insert temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + insertCtx = RumContextCreate(CurrentMemoryContext, + "Rum insert temporary context"); oldCtx = MemoryContextSwitchTo(insertCtx); initRumState(&rumstate, index); - if (AttributeNumberIsValid(rumstate.attrnOrderByColumn)) + if (AttributeNumberIsValid(rumstate.attrnAttachColumn)) { - outerAddInfo = values[rumstate.attrnOrderByColumn - 1]; - outerAddInfoIsNull = isnull[rumstate.attrnOrderByColumn - 1]; + outerAddInfo = values[rumstate.attrnAttachColumn - 1]; + outerAddInfoIsNull = isnull[rumstate.attrnAttachColumn - 1]; } for (i = 0; i < rumstate.origTupdesc->natts; i++) diff --git a/src/rumscan.c b/src/rumscan.c index 2c864219d4..089730fac4 100644 --- a/src/rumscan.c +++ b/src/rumscan.c @@ -4,7 +4,7 @@ * routines to manage scans of inverted index relations * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -15,7 +15,6 @@ #include "access/relscan.h" #include "pgstat.h" -#include "utils/memutils.h" #include "rum.h" @@ -35,19 +34,20 @@ rumbeginscan(Relation rel, int 
nkeys, int norderbys) so->firstCall = true; so->totalentries = 0; so->sortedEntries = NULL; - so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, - "Rum scan temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - so->keyCtx = AllocSetContextCreate(CurrentMemoryContext, - "Rum scan key context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + so->tempCtx = RumContextCreate(CurrentMemoryContext, + "Rum scan temporary context"); + so->keyCtx = RumContextCreate(CurrentMemoryContext, + "Rum scan key context"); initRumState(&so->rumstate, scan->indexRelation); +#if PG_VERSION_NUM >= 120000 + /* + * Starting from PG 12 we need to invalidate result's item pointer. Earlier + * it was done by invalidating scan->xs_ctup by RelationGetIndexScan(). + */ + ItemPointerSetInvalid(&scan->xs_heaptid); +#endif scan->opaque = so; return scan; @@ -105,15 +105,20 @@ rumFillScanEntry(RumScanOpaque so, OffsetNumber attnum, scanEntry->strategy = strategy; scanEntry->searchMode = searchMode; scanEntry->attnum = scanEntry->attnumOrig = attnum; - scanEntry->forceUseBitmap = false; scanEntry->buffer = InvalidBuffer; - RumItemSetMin(&scanEntry->curRumKey); + RumItemSetMin(&scanEntry->curItem); + scanEntry->curKey = (Datum) 0; + scanEntry->curKeyCategory = RUM_CAT_NULL_KEY; + scanEntry->useCurKey = false; scanEntry->matchSortstate = NULL; scanEntry->stack = NULL; scanEntry->scanWithAddInfo = false; scanEntry->list = NULL; + scanEntry->gdi = NULL; + scanEntry->stack = NULL; scanEntry->nlist = 0; + scanEntry->matchSortstate = NULL; scanEntry->offset = InvalidOffsetNumber; scanEntry->isFinished = false; scanEntry->reduceResult = false; @@ -154,6 +159,7 @@ rumFillScanKey(RumScanOpaque so, OffsetNumber attnum, key->searchMode = searchMode; key->attnum = key->attnumOrig = attnum; key->useAddToColumn = false; + key->useCurKey = false; key->scanDirection = ForwardScanDirection; RumItemSetMin(&key->curItem); @@ -164,26 +170,77 @@ rumFillScanKey(RumScanOpaque so, OffsetNumber attnum, key->addInfoKeys = NULL; key->addInfoNKeys = 0; - if (key->orderBy && key->attnum == rumstate->attrnOrderByColumn) + if (key->orderBy) { - if (nQueryValues != 1) - elog(ERROR, "extractQuery should return only one value for ordering"); - if (rumstate->canOuterOrdering[attnum - 1] == false) - elog(ERROR, "doesn't support ordering as additional info"); + if (key->attnum != rumstate->attrnAttachColumn) + key->useCurKey = rumstate->canOrdering[attnum - 1] && + /* ordering function by index key value has 3 arguments */ + rumstate->orderingFn[attnum - 1].fn_nargs == 3; + + /* Add key to order by additional information... 
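
The rewritten branch here chooses, for each ORDER BY key, between ordering by the attached additional-information column (attrnAttachColumn) and ordering by the index key value itself (useCurKey, which requires a three-argument ordering support function). A compressed, hypothetical rendering of that decision with plain parameters standing in for the rumstate fields:

#include <stdio.h>

/* Condensed decision for one ORDER BY key; the real code reads
 * rumstate->attrnAttachColumn, canOrdering[] and orderingFn[].fn_nargs. */
static const char *
classify_order_key(int attnum, int attach_attnum,
                   int can_ordering, int ordering_fn_nargs)
{
    if (attnum == attach_attnum)
        return "order by the attached add-info column";
    if (can_ordering && ordering_fn_nargs == 3)
        return "order by the index key itself (useCurKey)";
    if (!can_ordering)
        return "error: operator class cannot order";
    return "ordinary ordering by extracted query value";
}

int main(void)
{
    printf("%s\n", classify_order_key(2, 2, 1, 3));
    printf("%s\n", classify_order_key(1, 2, 1, 3));
    return 0;
}
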
*/ + if (key->attnum == rumstate->attrnAttachColumn || + /* ...add key to order by index key value */ + key->useCurKey) + { + Form_pg_attribute attr = RumTupleDescAttr(rumstate->origTupdesc, + attnum - 1); - key->useAddToColumn = true; - key->attnum = rumstate->attrnAddToColumn; - key->nentries = 0; - key->nuserentries = 0; + if (nQueryValues != 1) + elog(ERROR, "extractQuery should return only one value for ordering"); + if (attr->attbyval == false) + elog(ERROR, "doesn't support order by over pass-by-reference column"); - key->outerAddInfoIsNull = true; + if (key->attnum == rumstate->attrnAttachColumn) + { + if (rumstate->canOuterOrdering[attnum - 1] == false) + elog(ERROR, "doesn't support ordering as additional info"); - key->scanEntry = NULL; - key->entryRes = NULL; - key->addInfo = NULL; - key->addInfoIsNull = NULL; + key->useAddToColumn = true; + key->outerAddInfoIsNull = true; + key->attnum = rumstate->attrnAddToColumn; + } + else if (key->useCurKey) + { + RumScanKey scanKey = NULL; - return; + for (i = 0; i < so->nkeys; i++) + { + if (so->keys[i]->orderBy == false && + so->keys[i]->attnum == key->attnum) + { + scanKey = so->keys[i]; + break; + } + } + + if (scanKey == NULL) + elog(ERROR, "cannot order without attribute %d in ORDER BY clause", + key->attnum); + else if (scanKey->nentries > 1) + elog(ERROR, "scan key should contain only one value"); + else if (scanKey->nentries == 0) /* Should not happen */ + elog(ERROR, "scan key should contain key value"); + + key->useCurKey = true; + scanKey->scanEntry[0]->useCurKey = true; + } + + key->nentries = 0; + key->nuserentries = 0; + + key->scanEntry = NULL; + key->entryRes = NULL; + key->addInfo = NULL; + key->addInfoIsNull = NULL; + + so->willSort = true; + + return; + } + else if (rumstate->canOrdering[attnum - 1] == false) + { + elog(ERROR,"doesn't support ordering, check operator class definition"); + } } key->nentries = nQueryValues; @@ -311,8 +368,7 @@ freeScanKeys(RumScanOpaque so) } static void -initScanKey(RumScanOpaque so, ScanKey skey, bool *hasNullQuery, - bool *hasPartialMatch) +initScanKey(RumScanOpaque so, ScanKey skey, bool *hasPartialMatch) { Datum *queryValues; int32 nQueryValues = 0; @@ -327,7 +383,9 @@ initScanKey(RumScanOpaque so, ScanKey skey, bool *hasNullQuery, */ if (skey->sk_flags & SK_ISNULL) { - so->isVoidRes = true; + /* Do not set isVoidRes for order keys */ + if ((skey->sk_flags & SK_ORDER_BY) == 0) + so->isVoidRes = true; return; } @@ -352,10 +410,6 @@ initScanKey(RumScanOpaque so, ScanKey skey, bool *hasNullQuery, searchMode > GIN_SEARCH_MODE_ALL) searchMode = GIN_SEARCH_MODE_ALL; - /* Non-default modes require the index to have placeholders */ - if (searchMode != GIN_SEARCH_MODE_DEFAULT) - *hasNullQuery = true; - /* * In default mode, no keys means an unsatisfiable query. 
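
Both guarded assignments above draw the same distinction: a NULL or empty *search* key proves the result set empty, while a NULL *order* key must not void the scan, because it only affects ranking. The guard in isolation, with illustrative flag values standing in for the real SK_* bits from access/skey.h:

#include <stdbool.h>
#include <stdio.h>

#define SK_ISNULL   0x0002     /* illustrative values; see access/skey.h */
#define SK_ORDER_BY 0x0020

static bool
key_voids_scan(int sk_flags)
{
    /* A NULL comparison key can match nothing, so the scan is void;
     * a NULL ordering key merely yields no useful distance. */
    return (sk_flags & SK_ISNULL) && !(sk_flags & SK_ORDER_BY);
}

int main(void)
{
    printf("%d\n", key_voids_scan(SK_ISNULL));                /* 1: void scan */
    printf("%d\n", key_voids_scan(SK_ISNULL | SK_ORDER_BY));  /* 0: keep going */
    return 0;
}
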
*/ @@ -363,7 +417,9 @@ initScanKey(RumScanOpaque so, ScanKey skey, bool *hasNullQuery, { if (searchMode == GIN_SEARCH_MODE_DEFAULT) { - so->isVoidRes = true; + /* Do not set isVoidRes for order keys */ + if ((skey->sk_flags & SK_ORDER_BY) == 0) + so->isVoidRes = true; return; } nQueryValues = 0; /* ensure sane value */ @@ -385,10 +441,7 @@ initScanKey(RumScanOpaque so, ScanKey skey, bool *hasNullQuery, for (j = 0; j < nQueryValues; j++) { if (nullFlags[j]) - { nullFlags[j] = true; /* not any other nonzero value */ - *hasNullQuery = true; - } } } /* now we can use the nullFlags as category codes */ @@ -416,12 +469,12 @@ lookupScanDirection(RumState *state, AttrNumber attno, StrategyNumber strategy) int i; RumConfig *rumConfig = state->rumConfig + attno - 1; - for(i = 0; rumConfig->strategyInfo[i].strategy != InvalidStrategy && - i < MAX_STRATEGIES; i++) + for(i = 0; i < MAX_STRATEGIES; i++) { + if (rumConfig->strategyInfo[i].strategy != InvalidStrategy) + break; if (rumConfig->strategyInfo[i].strategy == strategy) return rumConfig->strategyInfo[i].direction; - } return NoMovementScanDirection; @@ -511,7 +564,6 @@ rumNewScanKey(IndexScanDesc scan) { RumScanOpaque so = (RumScanOpaque) scan->opaque; int i; - bool hasNullQuery = false; bool checkEmptyEntry = false; bool hasPartialMatch = false; MemoryContext oldCtx; @@ -523,9 +575,9 @@ rumNewScanKey(IndexScanDesc scan) so->naturalOrder = NoMovementScanDirection; so->secondPass = false; - so->tbm = NULL; so->entriesIncrIndex = -1; so->norderbys = scan->numberOfOrderBys; + so->willSort = false; /* * Allocate all the scan key information in the key context. (If @@ -535,7 +587,7 @@ rumNewScanKey(IndexScanDesc scan) oldCtx = MemoryContextSwitchTo(so->keyCtx); /* if no scan keys provided, allocate extra EVERYTHING RumScanKey */ - so->keys = (RumScanKey*) + so->keys = (RumScanKey *) palloc((Max(scan->numberOfKeys, 1) + scan->numberOfOrderBys) * sizeof(*so->keys)); so->nkeys = 0; @@ -544,7 +596,7 @@ rumNewScanKey(IndexScanDesc scan) for (i = 0; i < scan->numberOfKeys; i++) { - initScanKey(so, &scan->keyData[i], &hasNullQuery, &hasPartialMatch); + initScanKey(so, &scan->keyData[i], &hasPartialMatch); if (so->isVoidRes) break; } @@ -555,7 +607,6 @@ rumNewScanKey(IndexScanDesc scan) */ if (so->nkeys == 0 && !so->isVoidRes) { - hasNullQuery = true; rumFillScanKey(so, FirstOffsetNumber, InvalidStrategy, GIN_SEARCH_MODE_EVERYTHING, @@ -565,11 +616,7 @@ rumNewScanKey(IndexScanDesc scan) } for (i = 0; i < scan->numberOfOrderBys; i++) - { - initScanKey(so, &scan->orderByData[i], &hasNullQuery, NULL); - if (so->isVoidRes) - break; - } + initScanKey(so, &scan->orderByData[i], NULL); /* * Fill markAddInfo if possible @@ -587,9 +634,11 @@ rumNewScanKey(IndexScanDesc scan) { if (key->attnumOrig == so->rumstate.attrnAddToColumn) hasAddOnFilter |= haofHasAddToRestriction; - if (key->attnumOrig == so->rumstate.attrnOrderByColumn) + if (key->attnumOrig == so->rumstate.attrnAttachColumn) hasAddOnFilter |= haofHasAddOnRestriction; } + + key->willSort = so->willSort; } if ((hasAddOnFilter & haofHasAddToRestriction) && @@ -605,7 +654,7 @@ rumNewScanKey(IndexScanDesc scan) RumScanKey key = so->keys[i]; if (key->orderBy == false && - key->attnumOrig == so->rumstate.attrnOrderByColumn) + key->attnumOrig == so->rumstate.attrnAttachColumn) { for(j=0; addToKey == NULL && jnkeys; j++) if (so->keys[j]->orderBy == false && @@ -654,9 +703,12 @@ rumNewScanKey(IndexScanDesc scan) repalloc(so->entries, so->allocentries * sizeof(RumScanEntry)); } - memcpy(so->entries + so->totalentries, 
- key->scanEntry, sizeof(*key->scanEntry) * key->nentries); - so->totalentries += key->nentries; + if ( key->scanEntry != NULL ) + { + memcpy(so->entries + so->totalentries, + key->scanEntry, sizeof(*key->scanEntry) * key->nentries); + so->totalentries += key->nentries; + } } /* diff --git a/src/rumsort.c b/src/rumsort.c index 4893cc4f04..0c395f03e7 100644 --- a/src/rumsort.c +++ b/src/rumsort.c @@ -1,118 +1,15 @@ /*------------------------------------------------------------------------- * - * rumsort.h + * rumsort.c * Generalized tuple sorting routines. * - * This module handles sorting of heap tuples, index tuples, or single - * Datums (and could easily support other kinds of sortable objects, - * if necessary). It works efficiently for both small and large amounts - * of data. Small amounts are sorted in-memory using qsort(). Large - * amounts are sorted using temporary files and a standard external sort - * algorithm. + * This module handles sorting of RumSortItem or RumScanItem structures. + * It contains copy of static functions from + * src/backend/utils/sort/tuplesort.c. * - * See Knuth, volume 3, for more than you want to know about the external - * sorting algorithm. Historically, we divided the input into sorted runs - * using replacement selection, in the form of a priority tree implemented - * as a heap (essentially his Algorithm 5.2.3H -- although that strategy is - * often avoided altogether), but that can now only happen first the first - * run. We merge the runs using polyphase merge, Knuth's Algorithm - * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by - * logtape.c, which avoids space wastage by recycling disk space as soon - * as each block is read from its "tape". * - * We never form the initial runs using Knuth's recommended replacement - * selection data structure (Algorithm 5.4.1R), because it uses a fixed - * number of records in memory at all times. Since we are dealing with - * tuples that may vary considerably in size, we want to be able to vary - * the number of records kept in memory to ensure full utilization of the - * allowed sort memory space. So, we keep the tuples in a variable-size - * heap, with the next record to go out at the top of the heap. Like - * Algorithm 5.4.1R, each record is stored with the run number that it - * must go into, and we use (run number, key) as the ordering key for the - * heap. When the run number at the top of the heap changes, we know that - * no more records of the prior run are left in the heap. Note that there - * are in practice only ever two distinct run numbers, due to the greatly - * reduced use of replacement selection in PostgreSQL 9.6. - * - * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small - * customizations) is only used with the aim of producing just one run, - * thereby avoiding all merging. Only the first run can use replacement - * selection, which is why there are now only two possible valid run - * numbers, and why heapification is customized to not distinguish between - * tuples in the second run (those will be quicksorted). We generally - * prefer a simple hybrid sort-merge strategy, where runs are sorted in much - * the same way as the entire input of an internal sort is sorted (using - * qsort()). The replacement_sort_tuples GUC controls the limited remaining - * use of replacement selection for the first run. - * - * There are several reasons to favor a hybrid sort-merge strategy. - * Maintaining a priority tree/heap has poor CPU cache characteristics. 
- * Furthermore, the growth in main memory sizes has greatly diminished the - * value of having runs that are larger than available memory, even in the - * case where there is partially sorted input and runs can be made far - * larger by using a heap. In most cases, a single-pass merge step is all - * that is required even when runs are no larger than available memory. - * Avoiding multiple merge passes was traditionally considered to be the - * major advantage of using replacement selection. - * - * The approximate amount of memory allowed for any one sort operation - * is specified in kilobytes by the caller (most pass work_mem). Initially, - * we absorb tuples and simply store them in an unsorted array as long as - * we haven't exceeded workMem. If we reach the end of the input without - * exceeding workMem, we sort the array using qsort() and subsequently return - * tuples just by scanning the tuple array sequentially. If we do exceed - * workMem, we begin to emit tuples into sorted runs in temporary tapes. - * When tuples are dumped in batch after quicksorting, we begin a new run - * with a new output tape (selected per Algorithm D). After the end of the - * input is reached, we dump out remaining tuples in memory into a final run - * (or two, when replacement selection is still used), then merge the runs - * using Algorithm D. - * - * When merging runs, we use a heap containing just the frontmost tuple from - * each source run; we repeatedly output the smallest tuple and insert the - * next tuple from its source tape (if any). When the heap empties, the merge - * is complete. The basic merge algorithm thus needs very little memory --- - * only M tuples for an M-way merge, and M is constrained to a small number. - * However, we can still make good use of our full workMem allocation by - * pre-reading additional tuples from each source tape. Without prereading, - * our access pattern to the temporary file would be very erratic; on average - * we'd read one block from each of M source tapes during the same time that - * we're writing M blocks to the output tape, so there is no sequentiality of - * access at all, defeating the read-ahead methods used by most Unix kernels. - * Worse, the output tape gets written into a very random sequence of blocks - * of the temp file, ensuring that things will be even worse when it comes - * time to read that tape. A straightforward merge pass thus ends up doing a - * lot of waiting for disk seeks. We can improve matters by prereading from - * each source tape sequentially, loading about workMem/M bytes from each tape - * in turn. Then we run the merge algorithm, writing but not reading until - * one of the preloaded tuple series runs out. Then we switch back to preread - * mode, fill memory again, and repeat. This approach helps to localize both - * read and write accesses. - * - * When the caller requests random access to the sort result, we form - * the final sorted run on a logical tape which is then "frozen", so - * that we can access it randomly. When the caller does not need random - * access, we return from rum_tuplesort_performsort() as soon as we are down - * to one run per logical tape. The final merge is then performed - * on-the-fly as the caller repeatedly calls rum_tuplesort_getXXX; this - * saves one cycle of writing all the data out to disk and reading it in. 
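
The paragraph above pins the preread size at roughly workMem/M bytes per input tape for an M-way merge. A toy rendering of that sizing rule, reusing the MINORDER/TAPE_BUFFER_OVERHEAD/MERGE_BUFFER_SIZE constants that are deleted further down in this file (this is an approximation for illustration, not tuplesort's exact formula):

#include <stdio.h>

#define BLCKSZ 8192
#define MINORDER 6                           /* minimum merge order */
#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)
#define MERGE_BUFFER_SIZE    (BLCKSZ * 32)

/* Roughly how many tapes can merge at once while each still gets a
 * useful preread buffer out of workMem. */
static int
merge_order(long work_mem_kb)
{
    long bytes = work_mem_kb * 1024L;
    long per_tape = MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD;
    int  m = (int) (bytes / per_tape);

    return m < MINORDER ? MINORDER : m;
}

int main(void)
{
    printf("workMem=4MB  -> %d-way merge\n", merge_order(4096));
    printf("workMem=64MB -> %d-way merge\n", merge_order(65536));
    return 0;
}
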
- * - * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the - * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according - * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that - * tape drives are expensive beasts, and in particular that there will always - * be many more runs than tape drives. In our implementation a "tape drive" - * doesn't cost much more than a few Kb of memory buffers, so we can afford - * to have lots of them. In particular, if we can have as many tape drives - * as sorted runs, we can eliminate any repeated I/O at all. In the current - * code we determine the number of tapes M on the basis of workMem: we want - * workMem/M to be large enough that we read a fair amount of data each time - * we preread from a tape, so as to maintain the locality of access described - * above. Nonetheless, with large workMem we can have many tapes. - * - * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- @@ -122,4045 +19,570 @@ #include "miscadmin.h" #include "rumsort.h" -#include "access/htup_details.h" -#include "access/nbtree.h" -#include "catalog/index.h" -#include "catalog/pg_am.h" #include "commands/tablespace.h" #include "executor/executor.h" -#include "utils/datum.h" #include "utils/logtape.h" -#include "utils/lsyscache.h" -#include "utils/memutils.h" #include "utils/pg_rusage.h" -#include "utils/probes.h" -#include "utils/rel.h" -#include "utils/sortsupport.h" - -#include "rum.h" /* RumKey */ - -/* sort-type codes for sort__start probes */ -#define HEAP_SORT 0 -#define INDEX_SORT 1 -#define DATUM_SORT 2 -#define CLUSTER_SORT 3 - -/* GUC variables */ -#ifdef TRACE_SORT -bool trace_sort = false; -#endif - -#ifdef DEBUG_BOUNDED_SORT -bool optimize_bounded_sort = true; -#endif - -#if PG_VERSION_NUM < 100000 -/* Provide fallback for old version of tape interface for 9.6 */ -#define LogicalTapeRewindForRead(x, y, z) LogicalTapeRewind((x), (y), false) -#define LogicalTapeRewindForWrite(x, y) LogicalTapeRewind((x), (y), true) +#include "utils/tuplesort.h" + +#include "rum.h" /* RumItem */ + +#if PG_VERSION_NUM >= 160000 +/* + * After allocating a public interface for Tuplesortstate, no need to include + * source code from pg-core. + */ +#elif PG_VERSION_NUM >= 150000 +#include "tuplesort15.c" +#elif PG_VERSION_NUM >= 140000 +#include "tuplesort14.c" +#elif PG_VERSION_NUM >= 130000 +#include "tuplesort13.c" +#elif PG_VERSION_NUM >= 120000 +#include "tuplesort12.c" +#elif PG_VERSION_NUM >= 110000 +#include "tuplesort11.c" +#elif PG_VERSION_NUM >= 100000 +#include "tuplesort10.c" +#elif PG_VERSION_NUM >= 90600 +#include "tuplesort96.c" #endif /* - * The objects we actually sort are SortTuple structs. These contain - * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), - * which is a separate palloc chunk --- we assume it is just one chunk and - * can be freed by a simple pfree(). SortTuples also contain the tuple's - * first key column in Datum/nullflag format, and an index integer. - * - * Storing the first key column lets us save heap_getattr or index_getattr - * calls during tuple comparisons. 
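
The #elif ladder above textually includes a per-version copy of tuplesort.c because that is the only way to reach the static helpers inside it; on 16+ the public Tuplesort API makes the inclusion unnecessary. The trick in miniature, assuming a hypothetical companion file helper16.c whose entire content is shown in the comment:

/* Suppose helper16.c contains exactly one definition:
 *
 *     static int twice(int x) { return 2 * x; }
 *
 * Including the .c file (not a header) pulls that static function into
 * this translation unit, static linkage and all. */
#include <stdio.h>
#include "helper16.c"

int main(void)
{
    printf("%d\n", twice(21));   /* prints 42 */
    return 0;
}

The cost of the trick is exactly what the ladder shows: one pinned copy of the included source per supported major version.
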
We could extract and save all the key - * columns not just the first, but this would increase code complexity and - * overhead, and wouldn't actually save any comparison cycles in the common - * case where the first key determines the comparison result. Note that - * for a pass-by-reference datatype, datum1 points into the "tuple" storage. - * - * When sorting single Datums, the data value is represented directly by - * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, - * then datum1 points to a separately palloc'd data value that is also pointed - * to by the "tuple" pointer; otherwise "tuple" is NULL. - * - * While building initial runs, tupindex holds the tuple's run number. During - * merge passes, we re-use it to hold the input tape number that each tuple in - * the heap was read from, or to hold the index of the next tuple pre-read - * from the same tape in the case of pre-read entries. tupindex goes unused - * if the sort occurs entirely in memory. + * In case of using custom compare function we should store function pointer in + * sort stare in order to use it later. */ -typedef struct -{ - void *tuple; /* the tuple proper */ - Datum datum1; /* value of first key column */ - bool isnull1; /* is first key column NULL? */ - int tupindex; /* see notes above */ -} SortTuple; - +#if PG_VERSION_NUM >= 160000 /* - * Possible states of a Tuplesort object. These denote the states that - * persist between calls of Tuplesort routines. + * After allocating a public interface for Tuplesortstate we may use + * TuplesortPublic->arg filed to store pointer to the compare function. */ -typedef enum -{ - TSS_INITIAL, /* Loading tuples; still within memory limit */ - TSS_BOUNDED, /* Loading tuples into bounded-size heap */ - TSS_BUILDRUNS, /* Loading tuples; writing to tape */ - TSS_SORTEDINMEM, /* Sort completed entirely in memory */ - TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ - TSS_FINALMERGE /* Performing final merge on-the-fly */ -} TupSortStatus; -/* - * Parameters for calculation of number of tapes to use --- see inittapes() - * and rum_tuplesort_merge_order(). - * - * In this calculation we assume that each tape will cost us about 3 blocks - * worth of buffer space (which is an underestimate for very large data - * volumes, but it's probably close enough --- see logtape.c). - * - * MERGE_BUFFER_SIZE is how much data we'd like to read from each input - * tape during a preread cycle (see discussion at top of file). - */ -#define MINORDER 6 /* minimum merge order */ -#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3) -#define MERGE_BUFFER_SIZE (BLCKSZ * 32) +/* GUC variables */ +#ifdef TRACE_SORT +extern PGDLLIMPORT bool trace_sort; +#endif -typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); +/* All memory management should be inside Tuplesortstate module. */ +#define USEMEM(state,amt) do {} while(0) +#else /* PG_VERSION_NUM >= 160000 */ /* - * Private state of a Tuplesort operation. + * We need extra field in a state structure but we should not modify struct + * RumTuplesortstate which is inherited from Tuplesortstate core function. */ -struct Tuplesortstate +typedef struct RumTuplesortstateExt { - TupSortStatus status; /* enumerated value as shown above */ - int nKeys; /* number of columns in sort key */ - bool randomAccess; /* did caller request random access? */ - bool bounded; /* did caller specify a maximum number of - * tuples to return? 
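
The RumTuplesortstateExt wrapper being introduced in this hunk extends the core sort state by embedding it as the first member, so a pointer to the extended struct can be handed to code that expects the base struct and recovered later by a cast. The layout idiom in isolation (Base and Ext are stand-ins for RumTuplesortstate and RumTuplesortstateExt):

#include <stdio.h>

typedef struct Base { int nKeys; } Base;    /* stand-in for the core state */

typedef struct Ext
{
    Base  base;      /* MUST be first: an Ext * then aliases a Base * */
    void *cmp;       /* the appended field, like RumTuplesortstateExt.cmp */
} Ext;

int main(void)
{
    Ext   ext = { {2}, NULL };
    Base *as_base = (Base *) &ext;   /* pass to code expecting the base type */
    Ext  *back = (Ext *) as_base;    /* recover the extension afterwards */

    printf("%d %p\n", as_base->nKeys, back->cmp);
    return 0;
}
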
*/ - bool boundUsed; /* true if we made use of a bounded heap */ - int bound; /* if bounded, the maximum number of tuples */ - long availMem; /* remaining memory available, in bytes */ - long allowedMem; /* total memory allowed, in bytes */ - int maxTapes; /* number of tapes (Knuth's T) */ - int tapeRange; /* maxTapes-1 (Knuth's P) */ - MemoryContext sortcontext; /* memory context holding all sort data */ - LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ - - /* - * These function pointers decouple the routines that must know what kind - * of tuple we are sorting from the routines that don't need to know it. - * They are set up by the rum_tuplesort_begin_xxx routines. - * - * Function to compare two tuples; result is per qsort() convention, ie: - * <0, 0, >0 according as ab. The API must match - * qsort_arg_comparator. - */ - SortTupleComparator comparetup; - - /* - * Function to copy a supplied input tuple into palloc'd space and set up - * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, - * state->availMem must be decreased by the amount of space used for the - * tuple copy (note the SortTuple struct itself is not counted). - */ - void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); - - /* - * Function to write a stored tuple onto tape. The representation of the - * tuple on tape need not be the same as it is in memory; requirements on - * the tape representation are given below. After writing the tuple, - * pfree() the out-of-line data (not the SortTuple struct!), and increase - * state->availMem by the amount of memory space thereby released. - */ - void (*writetup) (Tuplesortstate *state, int tapenum, - SortTuple *stup); - - /* - * Function to read a stored tuple from tape back into memory. 'len' is - * the already-read length of the stored tuple. Create a palloc'd copy, - * initialize tuple/datum1/isnull1 in the target SortTuple struct, and - * decrease state->availMem by the amount of memory space consumed. - */ - void (*readtup) (Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); - - /* - * Function to reverse the sort direction from its current state. (We - * could dispense with this if we wanted to enforce that all variants - * represent the sort key information alike.) - */ - void (*reversedirection) (Tuplesortstate *state); - - /* - * This array holds the tuples now in sort memory. If we are in state - * INITIAL, the tuples are in no particular order; if we are in state - * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS - * and FINALMERGE, the tuples are organized in "heap" order per Algorithm - * H. (Note that memtupcount only counts the tuples that are part of the - * heap --- during merge passes, memtuples[] entries beyond tapeRange are - * never in the heap and are used to hold pre-read tuples.) In state - * SORTEDONTAPE, the array is not used. - */ - SortTuple *memtuples; /* array of SortTuple structs */ - int memtupcount; /* number of tuples currently present */ - int memtupsize; /* allocated length of memtuples array */ - bool growmemtuples; /* memtuples' growth still underway? */ - - /* Buffer size to use for reading input tapes, during merge. */ - size_t read_buffer_size; - - /* - * While building initial runs, this is the current output run number - * (starting at 0). Afterwards, it is the number of initial runs we made. 
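
The struct comment deleted here documents the dispatch scheme the new code still relies on: the generic sort engine only ever calls through comparetup/copytup/writetup/readtup pointers, and each begin-variant installs its own set. The idea reduced to a single pointer (names illustrative):

#include <stdio.h>

typedef struct SortState SortState;
typedef int (*CompareTup)(const void *a, const void *b, SortState *state);

struct SortState
{
    CompareTup comparetup;   /* installed by a rum_tuplesort_begin_xxx analog */
};

static int
cmp_int(const void *a, const void *b, SortState *state)
{
    int ia = *(const int *) a, ib = *(const int *) b;

    (void) state;
    return (ia > ib) - (ia < ib);
}

int main(void)
{
    SortState state = { cmp_int };
    int x = 3, y = 5;

    printf("%d\n", state.comparetup(&x, &y, &state));   /* -1: x sorts first */
    return 0;
}
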
- */ - int currentRun; - - /* - * Unless otherwise noted, all pointer variables below are pointers to - * arrays of length maxTapes, holding per-tape data. - */ - - /* - * These variables are only used during merge passes. mergeactive[i] is - * true if we are reading an input run from (actual) tape number i and - * have not yet exhausted that run. mergenext[i] is the memtuples index - * of the next pre-read tuple (next to be loaded into the heap) for tape - * i, or 0 if we are out of pre-read tuples. mergelast[i] similarly - * points to the last pre-read tuple from each tape. mergeavailslots[i] - * is the number of unused memtuples[] slots reserved for tape i, and - * mergeavailmem[i] is the amount of unused space allocated for tape i. - * mergefreelist and mergefirstfree keep track of unused locations in the - * memtuples[] array. The memtuples[].tupindex fields link together - * pre-read tuples for each tape as well as recycled locations in - * mergefreelist. It is OK to use 0 as a null link in these lists, because - * memtuples[0] is part of the merge heap and is never a pre-read tuple. - */ - bool *mergeactive; /* active input run source? */ - int *mergenext; /* first preread tuple for each source */ - int *mergelast; /* last preread tuple for each source */ - int *mergeavailslots; /* slots left for prereading each tape */ - long *mergeavailmem; /* availMem for prereading each tape */ - int mergefreelist; /* head of freelist of recycled slots */ - int mergefirstfree; /* first slot never used in this merge */ - - /* - * Variables for Algorithm D. Note that destTape is a "logical" tape - * number, ie, an index into the tp_xxx[] arrays. Be careful to keep - * "logical" and "actual" tape numbers straight! - */ - int Level; /* Knuth's l */ - int destTape; /* current output tape (Knuth's j, less 1) */ - int *tp_fib; /* Target Fibonacci run counts (A[]) */ - int *tp_runs; /* # of real runs on each tape */ - int *tp_dummy; /* # of dummy runs for each tape (D[]) */ - int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ - int activeTapes; /* # of active input tapes in merge pass */ - - /* - * These variables are used after completion of sorting to keep track of - * the next tuple to return. (In the tape case, the tape's current read - * position is also critical state.) 
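
The merge bookkeeping fields deleted above orchestrate the textbook k-way merge the comment describes: keep only the frontmost tuple of each run in the heap and repeatedly emit the smallest. With two runs the heap degenerates to a single comparison; a minimal sketch:

#include <stdio.h>

/* Merge two sorted runs by always emitting the smaller head, the same
 * loop the removed per-tape fields coordinated across many runs. */
static void
merge2(const int *a, int na, const int *b, int nb)
{
    int i = 0, j = 0;

    while (i < na || j < nb)
    {
        if (j >= nb || (i < na && a[i] <= b[j]))
            printf("%d ", a[i++]);
        else
            printf("%d ", b[j++]);
    }
    printf("\n");
}

int main(void)
{
    int run1[] = {1, 4, 9};
    int run2[] = {2, 3, 10};

    merge2(run1, 3, run2, 3);   /* prints: 1 2 3 4 9 10 */
    return 0;
}
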
- */ - int result_tape; /* actual tape number of finished output */ - int current; /* array index (only used if SORTEDINMEM) */ - bool eof_reached; /* reached EOF (needed for cursors) */ + RumTuplesortstate ts; + FmgrInfo *cmp; +} RumTuplesortstateExt; +#endif /* PG_VERSION_NUM < 160000 */ - /* markpos_xxx holds marked position for mark and restore */ - long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ - int markpos_offset; /* saved "current", or offset in tape block */ - bool markpos_eof; /* saved "eof_reached" */ +static int comparetup_rum(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state, bool compareItemPointer); +static int comparetup_rum_true(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state); +static int comparetup_rum_false(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state); +static int comparetup_rumitem(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state); +static void copytup_rum(RumTuplesortstate *state, SortTuple *stup, void *tup); +static void copytup_rumitem(RumTuplesortstate *state, SortTuple *stup, + void *tup); +static void *rum_tuplesort_getrum_internal(RumTuplesortstate *state, + bool forward, bool *should_free); - /* - * These variables are specific to the MinimalTuple case; they are set by - * rum_tuplesort_begin_heap and used only by the MinimalTuple routines. - */ - TupleDesc tupDesc; - SortSupport sortKeys; /* array of length nKeys */ - - /* - * This variable is shared by the single-key MinimalTuple case and the - * Datum case (which both use qsort_ssup()). Otherwise it's NULL. - */ - SortSupport onlyKey; - - /* - * These variables are specific to the CLUSTER case; they are set by - * rum_tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and - * indexScanKey. - */ - IndexInfo *indexInfo; /* info about index being used for reference */ - EState *estate; /* for evaluating index expressions */ - - /* - * These variables are specific to the IndexTuple case; they are set by - * rum_tuplesort_begin_index_xxx and used only by the IndexTuple routines. - */ - Relation heapRel; /* table the index is being built on */ - Relation indexRel; /* index being built */ - - /* These are specific to the index_btree subcase: */ - ScanKey indexScanKey; - bool enforceUnique; /* complain if we find duplicate tuples */ - - /* These are specific to the index_hash subcase: */ - uint32 hash_mask; /* mask for sortable part of hash code */ - - /* - * These variables are specific to the Datum case; they are set by - * rum_tuplesort_begin_datum and used only by the DatumTuple routines. - */ - Oid datumType; - /* we need typelen and byval in order to know how to copy the Datums. */ - int datumTypeLen; - bool datumTypeByVal; - - bool reverse; - - /* Do we need ItemPointer comparison in comparetup_rum()? */ - bool compareItemPointer; - - /* compare_rumkey */ - FmgrInfo *cmp; - - /* - * Resource snapshot for time of sort start. - */ -#ifdef TRACE_SORT - PGRUsage ru_start; +/* + * Tuplesortstate handling should be done through this macro. 
+ */ +#if PG_VERSION_NUM >= 160000 +# define TSS_GET(state) TuplesortstateGetPublic((state)) +#else +# define TSS_GET(state) (state) #endif -}; - -#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) -#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) -#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) -#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) -#define REVERSEDIRECTION(state) ((*(state)->reversedirection) (state)) -#define LACKMEM(state) ((state)->availMem < 0) -#define USEMEM(state,amt) ((state)->availMem -= (amt)) -#define FREEMEM(state,amt) ((state)->availMem += (amt)) /* - * NOTES about on-tape representation of tuples: - * - * We require the first "unsigned int" of a stored tuple to be the total size - * on-tape of the tuple, including itself (so it is never zero; an all-zero - * unsigned int is used to delimit runs). The remainder of the stored tuple - * may or may not match the in-memory representation of the tuple --- - * any conversion needed is the job of the writetup and readtup routines. - * - * If state->randomAccess is true, then the stored representation of the - * tuple must be followed by another "unsigned int" that is a copy of the - * length --- so the total tape space used is actually sizeof(unsigned int) - * more than the stored length value. This allows read-backwards. When - * randomAccess is not true, the write/read routines may omit the extra - * length word. - * - * writetup is expected to write both length words as well as the tuple - * data. When readtup is called, the tape is positioned just after the - * front length word; readtup must read the tuple data and advance past - * the back length word (if present). - * - * The write/read routines can make use of the tuple description data - * stored in the Tuplesortstate record, if needed. They are also expected - * to adjust state->availMem by the amount of memory space (not tape space!) - * released or consumed. There is no error return from either writetup - * or readtup; they should ereport() on failure. - * - * - * NOTES about memory consumption calculations: - * - * We count space allocated for tuples against the workMem limit, plus - * the space used by the variable-size memtuples array. Fixed-size space - * is not counted; it's small enough to not be interesting. - * - * Note that we count actual space used (as shown by GetMemoryChunkSpace) - * rather than the originally-requested size. This is important since - * palloc can add substantial overhead. It's not a complete answer since - * we won't count any wasted space in palloc allocation blocks, but it's - * a lot better than what we were doing before 7.3. + * Logical tape handling should be done through this macro. 
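
The LT_TYPE/LT_ARG/TAPE macros introduced just below absorb the PostgreSQL 15 change of tape identity from an int tape number to a LogicalTape pointer, so each write/read routine is written once against LT_ARG. A self-contained sketch of the dispatch (LogicalTapeStub and TAPE_ID are illustrative stand-ins):

#include <stdio.h>

#define PG_VERSION_NUM 150002          /* pretend: PG 15+, tapes are pointers */

typedef struct LogicalTapeStub { int id; } LogicalTapeStub;

#if PG_VERSION_NUM >= 150000
#define LT_TYPE LogicalTapeStub *
#define LT_ARG  tape
#define TAPE_ID(t) ((t)->id)
#else
#define LT_TYPE int
#define LT_ARG  tapenum
#define TAPE_ID(t) (t)
#endif

/* One function body serves both ABIs: it only ever names LT_ARG. */
static void
writetup_stub(LT_TYPE LT_ARG)
{
    printf("writing run to tape %d\n", TAPE_ID(LT_ARG));
}

int main(void)
{
    LogicalTapeStub t = {3};

    writetup_stub(&t);
    return 0;
}
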
*/ - -/* When using this macro, beware of double evaluation of len */ -#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ - do { \ - if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ - elog(ERROR, "unexpected end of data"); \ - } while(0) - - -static Tuplesortstate *rum_tuplesort_begin_common(int workMem, bool randomAccess); -static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); -static void inittapes(Tuplesortstate *state); -static void selectnewtape(Tuplesortstate *state); -static void mergeruns(Tuplesortstate *state); -static void mergeonerun(Tuplesortstate *state); -static void beginmerge(Tuplesortstate *state); -static void mergepreread(Tuplesortstate *state); -static void mergeprereadone(Tuplesortstate *state, int srcTape); -static void dumptuples(Tuplesortstate *state, bool alltuples); -static void make_bounded_heap(Tuplesortstate *state); -static void sort_bounded_heap(Tuplesortstate *state); -static void rum_tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, - int tupleindex, bool checkIndex); -static void rum_tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex); -static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); -static void markrunend(Tuplesortstate *state, int tapenum); -static int comparetup_heap(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_heap(Tuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_heap(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_heap(Tuplesortstate *state); -static int comparetup_cluster(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_cluster(Tuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_index(Tuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_index(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_index_btree(Tuplesortstate *state); -static void reversedirection_index_hash(Tuplesortstate *state); -static int comparetup_datum(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_datum(Tuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_datum(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_datum(Tuplesortstate *state); -static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); -static int comparetup_rum(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static void copytup_rum(Tuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_rum(Tuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_rum(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void 
reversedirection_rum(Tuplesortstate *state); -static int comparetup_rumkey(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static void copytup_rumkey(Tuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_rumkey(Tuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_rumkey(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); +#if PG_VERSION_NUM >= 150000 +#define LT_TYPE LogicalTape * +#define LT_ARG tape +#define TAPE(state, LT_ARG) LT_ARG +#else +#define LT_TYPE int +#define LT_ARG tapenum +#define TAPE(state, LT_ARG) state->tapeset, LT_ARG +#endif /* - * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts - * any variant of SortTuples, using the appropriate comparetup function. - * qsort_ssup() is specialized for the case where the comparetup function - * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts - * and Datum sorts. + * Just for convenience and uniformity. */ -/* #include "qsort_tuple.c" */ - -static void -swapfunc(SortTuple *a, SortTuple *b, size_t n) -{ - do - { - SortTuple t = *a; - - *a++ = *b; - *b++ = t; - } while (--n > 0); -} - -#define cmp_ssup(a, b, ssup) \ - ApplySortComparator((a)->datum1, (a)->isnull1, \ - (b)->datum1, (b)->isnull1, ssup) - -#define swap(a, b) \ - do { \ - SortTuple t = *(a); \ - *(a) = *(b); \ - *(b) = t; \ - } while (0); - -#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n) - -static SortTuple * -med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, Tuplesortstate *state) -{ - return cmp_tuple(a, b, state) < 0 ? - (cmp_tuple(b, c, state) < 0 ? b : - (cmp_tuple(a, c, state) < 0 ? c : a)) - : (cmp_tuple(b, c, state) > 0 ? b : - (cmp_tuple(a, c, state) < 0 ? a : c)); -} - -static SortTuple * -med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup) -{ - return cmp_ssup(a, b, ssup) < 0 ? - (cmp_ssup(b, c, ssup) < 0 ? b : - (cmp_ssup(a, c, ssup) < 0 ? c : a)) - : (cmp_ssup(b, c, ssup) > 0 ? b : - (cmp_ssup(a, c, ssup) < 0 ? 
a : c)); -} - -static void -qsort_ssup(SortTuple *a, size_t n, SortSupport ssup) -{ - SortTuple *pa, - *pb, - *pc, - *pd, - *pl, - *pm, - *pn; - size_t d1, - d2; - int r, - presorted; - -loop: - CHECK_FOR_INTERRUPTS(); - if (n < 7) - { - for (pm = a + 1; pm < a + n; pm++) - for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--) - swap(pl, pl - 1); - return; - } - presorted = 1; - for (pm = a + 1; pm < a + n; pm++) - { - CHECK_FOR_INTERRUPTS(); - if (cmp_ssup(pm - 1, pm, ssup) > 0) - { - presorted = 0; - break; - } - } - if (presorted) - return; - pm = a + (n / 2); - if (n > 7) - { - pl = a; - pn = a + (n - 1); - if (n > 40) - { - size_t d = (n / 8); - - pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup); - pm = med3_ssup(pm - d, pm, pm + d, ssup); - pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup); - } - pm = med3_ssup(pl, pm, pn, ssup); - } - swap(a, pm); - pa = pb = a + 1; - pc = pd = a + (n - 1); - for (;;) - { - while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0) - { - if (r == 0) - { - swap(pa, pb); - pa++; - } - pb++; - CHECK_FOR_INTERRUPTS(); - } - while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0) - { - if (r == 0) - { - swap(pc, pd); - pd--; - } - pc--; - CHECK_FOR_INTERRUPTS(); - } - if (pb > pc) - break; - swap(pb, pc); - pb++; - pc--; - } - pn = a + n; - d1 = Min(pa - a, pb - pa); - vecswap(a, pb - d1, d1); - d1 = Min(pd - pc, pn - pd - 1); - vecswap(pb, pn - d1, d1); - d1 = pb - pa; - d2 = pd - pc; - if (d1 <= d2) - { - /* Recurse on left partition, then iterate on right partition */ - if (d1 > 1) - qsort_ssup(a, d1, ssup); - if (d2 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_ssup(pn - d2, d2, ssup); */ - a = pn - d2; - n = d2; - goto loop; - } - } - else - { - /* Recurse on right partition, then iterate on left partition */ - if (d2 > 1) - qsort_ssup(pn - d2, d2, ssup); - if (d1 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_ssup(a, d1, ssup); */ - n = d1; - goto loop; - } - } -} - -static void -qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, Tuplesortstate *state) -{ - SortTuple *pa, - *pb, - *pc, - *pd, - *pl, - *pm, - *pn; - size_t d1, - d2; - int r, - presorted; - -loop: - CHECK_FOR_INTERRUPTS(); - if (n < 7) - { - for (pm = a + 1; pm < a + n; pm++) - for (pl = pm; pl > a && cmp_tuple(pl - 1, pl, state) > 0; pl--) - swap(pl, pl - 1); - return; - } - presorted = 1; - for (pm = a + 1; pm < a + n; pm++) - { - CHECK_FOR_INTERRUPTS(); - if (cmp_tuple(pm - 1, pm, state) > 0) - { - presorted = 0; - break; - } - } - if (presorted) - return; - pm = a + (n / 2); - if (n > 7) - { - pl = a; - pn = a + (n - 1); - if (n > 40) - { - size_t d = (n / 8); - - pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state); - pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state); - pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state); - } - pm = med3_tuple(pl, pm, pn, cmp_tuple, state); - } - swap(a, pm); - pa = pb = a + 1; - pc = pd = a + (n - 1); - for (;;) - { - while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0) - { - if (r == 0) - { - swap(pa, pb); - pa++; - } - pb++; - CHECK_FOR_INTERRUPTS(); - } - while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0) - { - if (r == 0) - { - swap(pc, pd); - pd--; - } - pc--; - CHECK_FOR_INTERRUPTS(); - } - if (pb > pc) - break; - swap(pb, pc); - pb++; - pc--; - } - pn = a + n; - d1 = Min(pa - a, pb - pa); - vecswap(a, pb - d1, d1); - d1 = Min(pd - pc, pn - pd - 1); - vecswap(pb, pn - d1, d1); - d1 = pb - pa; - d2 = pd - pc; - if (d1 <= d2) - { - /* Recurse on left 
partition, then iterate on right partition */ - if (d1 > 1) - qsort_tuple(a, d1, cmp_tuple, state); - if (d2 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_tuple(pn - d2, d2, cmp_tuple, state); */ - a = pn - d2; - n = d2; - goto loop; - } - } - else - { - /* Recurse on right partition, then iterate on left partition */ - if (d2 > 1) - qsort_tuple(pn - d2, d2, cmp_tuple, state); - if (d1 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_tuple(a, d1, cmp_tuple, state); */ - n = d1; - goto loop; - } - } -} +#if PG_VERSION_NUM >= 110000 +#define tuplesort_begin_common(x,y) tuplesort_begin_common((x), NULL, (y)) +#endif /* - * rum_tuplesort_begin_xxx - * - * Initialize for a tuple sort operation. - * - * After calling rum_tuplesort_begin, the caller should call rum_tuplesort_putXXX - * zero or more times, then call rum_tuplesort_performsort when all the tuples - * have been supplied. After performsort, retrieve the tuples in sorted - * order by calling rum_tuplesort_getXXX until it returns false/NULL. (If random - * access was requested, rescan, markpos, and restorepos can also be called.) - * Call rum_tuplesort_end to terminate the operation and release memory/disk space. - * - * Each variant of rum_tuplesort_begin has a workMem parameter specifying the - * maximum number of kilobytes of RAM to use before spilling data to disk. - * (The normal value of this parameter is work_mem, but some callers use - * other values.) Each variant also has a randomAccess parameter specifying - * whether the caller needs non-sequential access to the sort result. + * Trace log wrapper. */ - -static Tuplesortstate * -rum_tuplesort_begin_common(int workMem, bool randomAccess) -{ - Tuplesortstate *state; - MemoryContext sortcontext; - MemoryContext oldcontext; - - /* - * Create a working memory context for this sort operation. All data - * needed by the sort will live inside this context. - */ - sortcontext = AllocSetContextCreate(CurrentMemoryContext, - "TupleSort", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - - /* - * Make the Tuplesortstate within the per-sort context. This way, we - * don't need a separate pfree() operation for it at shutdown. - */ - oldcontext = MemoryContextSwitchTo(sortcontext); - - state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); - #ifdef TRACE_SORT - if (trace_sort) - pg_rusage_init(&state->ru_start); +# define LOG_SORT(...) \ + if (trace_sort) \ + ereport(LOG, errmsg_internal(__VA_ARGS__)) +#else +# define LOG_SORT(...) \ + {} #endif - state->status = TSS_INITIAL; - state->randomAccess = randomAccess; - state->bounded = false; - state->boundUsed = false; - state->allowedMem = workMem * 1024L; - state->availMem = state->allowedMem; - state->sortcontext = sortcontext; - state->tapeset = NULL; - - state->memtupcount = 0; - - /* - * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; - * see comments in grow_memtuples(). 
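
comparetup_rum, added in the hunks just below, orders first by the float8 distance cached in datum1 and only then breaks ties with compare_rum_itempointer, a chain of three field comparisons over (bi_hi, bi_lo, posid). That tie-breaker as a standalone program (TidStub stands in for ItemPointerData):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for ItemPointerData: the block number is stored as two uint16
 * halves plus an offset, compared lexicographically as in the hunk below. */
typedef struct { uint16_t bi_hi, bi_lo, posid; } TidStub;

static int
tid_cmp(TidStub a, TidStub b)
{
    if (a.bi_hi != b.bi_hi) return a.bi_hi < b.bi_hi ? -1 : 1;
    if (a.bi_lo != b.bi_lo) return a.bi_lo < b.bi_lo ? -1 : 1;
    if (a.posid != b.posid) return a.posid < b.posid ? -1 : 1;
    return 0;
}

int main(void)
{
    TidStub a = {0, 10, 2}, b = {0, 10, 3};

    printf("%d\n", tid_cmp(a, b));   /* -1: same block, lower offset first */
    return 0;
}

Caching the distance in datum1 means most comparisons never touch the tuple body at all, which is the same first-key trick the deleted SortTuple comment described.
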
- */ - state->memtupsize = Max(1024, - ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); - - state->growmemtuples = true; - state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); - - USEMEM(state, GetMemoryChunkSpace(state->memtuples)); - - /* workMem must be large enough for the minimal memtuples array */ - if (LACKMEM(state)) - elog(ERROR, "insufficient memory allowed for sort"); - - state->currentRun = 0; - - /* - * maxTapes, tapeRange, and Algorithm D variables will be initialized by - * inittapes(), if needed - */ - - state->result_tape = -1; /* flag that result tape has not been formed */ - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -MemoryContext -rum_tuplesort_get_memorycontext(Tuplesortstate *state) -{ - return state->sortcontext; -} - -Tuplesortstate * -rum_tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, - bool *nullsFirstFlags, - int workMem, bool randomAccess) +static inline int +compare_rum_itempointer(ItemPointerData p1, ItemPointerData p2) { - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - int i; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - - AssertArg(nkeys > 0); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", - nkeys, workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = nkeys; - - TRACE_POSTGRESQL_SORT_START(HEAP_SORT, - false, /* no unique check */ - nkeys, - workMem, - randomAccess); - - state->comparetup = comparetup_heap; - state->copytup = copytup_heap; - state->writetup = writetup_heap; - state->readtup = readtup_heap; - state->reversedirection = reversedirection_heap; - - state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - - /* Prepare SortSupport data for each column */ - state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); - - for (i = 0; i < nkeys; i++) - { - SortSupport sortKey = state->sortKeys + i; - - AssertArg(attNums[i] != 0); - AssertArg(sortOperators[i] != 0); - - sortKey->ssup_cxt = CurrentMemoryContext; - sortKey->ssup_collation = sortCollations[i]; - sortKey->ssup_nulls_first = nullsFirstFlags[i]; - sortKey->ssup_attno = attNums[i]; - - PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); - } + if (p1.ip_blkid.bi_hi < p2.ip_blkid.bi_hi) + return -1; + else if (p1.ip_blkid.bi_hi > p2.ip_blkid.bi_hi) + return 1; - if (nkeys == 1) - state->onlyKey = state->sortKeys; + if (p1.ip_blkid.bi_lo < p2.ip_blkid.bi_lo) + return -1; + else if (p1.ip_blkid.bi_lo > p2.ip_blkid.bi_lo) + return 1; - MemoryContextSwitchTo(oldcontext); + if (p1.ip_posid < p2.ip_posid) + return -1; + else if (p1.ip_posid > p2.ip_posid) + return 1; - return state; + return 0; } -Tuplesortstate * -rum_tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess) +static int +comparetup_rum(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state, bool compareItemPointer) { - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - Assert(indexRel->rd_rel->relam == BTREE_AM_OID); - - oldcontext = MemoryContextSwitchTo(state->sortcontext); + RumSortItem *i1, + *i2; + float8 v1 = DatumGetFloat8(a->datum1); + float8 v2 = DatumGetFloat8(b->datum1); + int i; -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", - 
RelationGetNumberOfAttributes(indexRel), - workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = RelationGetNumberOfAttributes(indexRel); - - TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, - false, /* no unique check */ - state->nKeys, - workMem, - randomAccess); - - state->comparetup = comparetup_cluster; - state->copytup = copytup_cluster; - state->writetup = writetup_cluster; - state->readtup = readtup_cluster; - state->reversedirection = reversedirection_index_btree; - - state->indexInfo = BuildIndexInfo(indexRel); - state->indexScanKey = _bt_mkscankey_nodata(indexRel); - - state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - - if (state->indexInfo->ii_Expressions != NULL) - { - TupleTableSlot *slot; - ExprContext *econtext; - - /* - * We will need to use FormIndexDatum to evaluate the index - * expressions. To do that, we need an EState, as well as a - * TupleTableSlot to put the table tuples into. The econtext's - * scantuple has to point to that slot, too. - */ - state->estate = CreateExecutorState(); - slot = MakeSingleTupleTableSlot(tupDesc); - econtext = GetPerTupleExprContext(state->estate); - econtext->ecxt_scantuple = slot; - } - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -Tuplesortstate * -rum_tuplesort_begin_index_btree(Relation heapRel, - Relation indexRel, - bool enforceUnique, - int workMem, bool randomAccess) -{ - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin index sort: unique = %c, workMem = %d, randomAccess = %c", - enforceUnique ? 't' : 'f', - workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = RelationGetNumberOfAttributes(indexRel); - - TRACE_POSTGRESQL_SORT_START(INDEX_SORT, - enforceUnique, - state->nKeys, - workMem, - randomAccess); - - state->comparetup = comparetup_index_btree; - state->copytup = copytup_index; - state->writetup = writetup_index; - state->readtup = readtup_index; - state->reversedirection = reversedirection_index_btree; - - state->heapRel = heapRel; - state->indexRel = indexRel; - state->indexScanKey = _bt_mkscankey_nodata(indexRel); - state->enforceUnique = enforceUnique; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -Tuplesortstate * -rum_tuplesort_begin_index_hash(Relation heapRel, - Relation indexRel, - uint32 hash_mask, - int workMem, bool randomAccess) -{ - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c", - hash_mask, - workMem, randomAccess ? 
't' : 'f'); -#endif - - state->nKeys = 1; /* Only one sort column, the hash code */ - - state->comparetup = comparetup_index_hash; - state->copytup = copytup_index; - state->writetup = writetup_index; - state->readtup = readtup_index; - state->reversedirection = reversedirection_index_hash; - - state->heapRel = heapRel; - state->indexRel = indexRel; - - state->hash_mask = hash_mask; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -Tuplesortstate * -rum_tuplesort_begin_rum(int workMem, int nKeys, bool randomAccess, - bool compareItemPointer) -{ - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin rum sort: nKeys = %d, workMem = %d, randomAccess = %c", - nKeys, workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = nKeys; - - TRACE_POSTGRESQL_SORT_START(INDEX_SORT, - false, /* no unique check */ - state->nKeys, - workMem, - randomAccess); - - state->comparetup = comparetup_rum; - state->copytup = copytup_rum; - state->writetup = writetup_rum; - state->readtup = readtup_rum; - state->reversedirection = reversedirection_rum; - state->reverse = false; - state->compareItemPointer = compareItemPointer; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -Tuplesortstate * -rum_tuplesort_begin_rumkey(int workMem, FmgrInfo *cmp) -{ - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, false); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin rumkey sort: workMem = %d", workMem); -#endif - - TRACE_POSTGRESQL_SORT_START(INDEX_SORT, - false, /* no unique check */ - 2, - workMem, - false); - - state->cmp = cmp; - state->comparetup = comparetup_rumkey; - state->copytup = copytup_rumkey; - state->writetup = writetup_rumkey; - state->readtup = readtup_rumkey; - state->reversedirection = reversedirection_rum; - state->reverse = false; - state->compareItemPointer = false; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -Tuplesortstate * -rum_tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, - bool nullsFirstFlag, - int workMem, bool randomAccess) -{ - Tuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - int16 typlen; - bool typbyval; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin datum sort: workMem = %d, randomAccess = %c", - workMem, randomAccess ? 
't' : 'f'); -#endif - - state->nKeys = 1; /* always a one-column sort */ - - TRACE_POSTGRESQL_SORT_START(DATUM_SORT, - false, /* no unique check */ - 1, - workMem, - randomAccess); - - state->comparetup = comparetup_datum; - state->copytup = copytup_datum; - state->writetup = writetup_datum; - state->readtup = readtup_datum; - state->reversedirection = reversedirection_datum; - - state->datumType = datumType; - - /* Prepare SortSupport data */ - state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData)); - - state->onlyKey->ssup_cxt = CurrentMemoryContext; - state->onlyKey->ssup_collation = sortCollation; - state->onlyKey->ssup_nulls_first = nullsFirstFlag; - - PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey); - - /* lookup necessary attributes of the datum type */ - get_typlenbyval(datumType, &typlen, &typbyval); - state->datumTypeLen = typlen; - state->datumTypeByVal = typbyval; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -/* - * rum_tuplesort_set_bound - * - * Advise tuplesort that at most the first N result tuples are required. - * - * Must be called before inserting any tuples. (Actually, we could allow it - * as long as the sort hasn't spilled to disk, but there seems no need for - * delayed calls at the moment.) - * - * This is a hint only. The tuplesort may still return more tuples than - * requested. - */ -void -rum_tuplesort_set_bound(Tuplesortstate *state, int64 bound) -{ - /* Assert we're called before loading any tuples */ - Assert(state->status == TSS_INITIAL); - Assert(state->memtupcount == 0); - Assert(!state->bounded); - -#ifdef DEBUG_BOUNDED_SORT - /* Honor GUC setting that disables the feature (for easy testing) */ - if (!optimize_bounded_sort) - return; -#endif - - /* We want to be able to compute bound * 2, so limit the setting */ - if (bound > (int64) (INT_MAX / 2)) - return; - - state->bounded = true; - state->bound = (int) bound; -} - -/* - * rum_tuplesort_end - * - * Release resources and clean up. - * - * NOTE: after calling this, any pointers returned by rum_tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! - */ -void -rum_tuplesort_end(Tuplesortstate *state) -{ - /* context swap probably not needed, but let's be safe */ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - long spaceUsed; - - if (state->tapeset) - spaceUsed = LogicalTapeSetBlocks(state->tapeset); - else - spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; -#endif - - /* - * Delete temporary "tape" files, if any. - * - * Note: want to include this in reported total cost of sort, hence need - * for two #ifdef TRACE_SORT sections. - */ - if (state->tapeset) - LogicalTapeSetClose(state->tapeset); - -#ifdef TRACE_SORT - if (trace_sort) - { - if (state->tapeset) - elog(LOG, "external sort ended, %ld disk blocks used: %s", - spaceUsed, pg_rusage_show(&state->ru_start)); - else - elog(LOG, "internal sort ended, %ld KB used: %s", - spaceUsed, pg_rusage_show(&state->ru_start)); - } - - TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); -#else - - /* - * If you disabled TRACE_SORT, you can still probe sort__done, but you - * ain't getting space-used stats. 
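rum_tuplesort_end above reports in-memory sort space as (allowedMem - availMem + 1023) / 1024, i.e. bytes used rounded up to whole kilobytes. A tiny worked check of that rounding, with invented numbers:

    #include <stdio.h>

    /* Round a byte count up to whole kilobytes, as the space report does. */
    static long bytes_to_kb_roundup(long used_bytes)
    {
        return (used_bytes + 1023) / 1024;
    }

    int main(void)
    {
        long allowedMem = 4L * 1024 * 1024;  /* 4 MB budget, invented */
        long availMem   = allowedMem - 5000; /* pretend 5000 bytes are in use */

        /* 5000 bytes -> 5 KB, not 4: the +1023 forces rounding up. */
        printf("%ld KB\n", bytes_to_kb_roundup(allowedMem - availMem));
        return 0;
    }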
- */ - TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); -#endif - - /* Free any execution state created for CLUSTER case */ - if (state->estate != NULL) - { - ExprContext *econtext = GetPerTupleExprContext(state->estate); - - ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); - FreeExecutorState(state->estate); - } - - MemoryContextSwitchTo(oldcontext); - - /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. - */ - MemoryContextDelete(state->sortcontext); -} - -/* - * Grow the memtuples[] array, if possible within our memory constraint. - * Return TRUE if we were able to enlarge the array, FALSE if not. - * - * Normally, at each increment we double the size of the array. When we no - * longer have enough memory to do that, we attempt one last, smaller increase - * (and then clear the growmemtuples flag so we don't try any more). That - * allows us to use allowedMem as fully as possible; sticking to the pure - * doubling rule could result in almost half of allowedMem going unused. - * Because availMem moves around with tuple addition/removal, we need some - * rule to prevent making repeated small increases in memtupsize, which would - * just be useless thrashing. The growmemtuples flag accomplishes that and - * also prevents useless recalculations in this function. - */ -static bool -grow_memtuples(Tuplesortstate *state) -{ - int newmemtupsize; - int memtupsize = state->memtupsize; - long memNowUsed = state->allowedMem - state->availMem; - - /* Forget it if we've already maxed out memtuples, per comment above */ - if (!state->growmemtuples) - return false; - - /* Select new value of memtupsize */ - if (memNowUsed <= state->availMem) - { - /* - * It is surely safe to double memtupsize if we've used no more than - * half of allowedMem. - * - * Note: it might seem that we need to worry about memtupsize * 2 - * overflowing an int, but the MaxAllocSize clamp applied below - * ensures the existing memtupsize can't be large enough for that. - */ - newmemtupsize = memtupsize * 2; - } - else - { - /* - * This will be the last increment of memtupsize. Abandon doubling - * strategy and instead increase as much as we safely can. - * - * To stay within allowedMem, we can't increase memtupsize by more - * than availMem / sizeof(SortTuple) elements. In practice, we want - * to increase it by considerably less, because we need to leave some - * space for the tuples to which the new array slots will refer. We - * assume the new tuples will be about the same size as the tuples - * we've already seen, and thus we can extrapolate from the space - * consumption so far to estimate an appropriate new size for the - * memtuples array. The optimal value might be higher or lower than - * this estimate, but it's hard to know that in advance. - * - * This calculation is safe against enlarging the array so much that - * LACKMEM becomes true, because the memory currently used includes - * the present array; thus, there would be enough allowedMem for the - * new array elements even if no other memory were currently used. - * - * We do the arithmetic in float8, because otherwise the product of - * memtupsize and allowedMem could overflow. (A little algebra shows - * that grow_ratio must be less than 2 here, so we are not risking - * integer overflow this way.) Any inaccuracy in the result should be - * insignificant; but even if we computed a completely insane result, - * the checks below will prevent anything really bad from happening. 
- */ - double grow_ratio; - - grow_ratio = (double) state->allowedMem / (double) memNowUsed; - newmemtupsize = (int) (memtupsize * grow_ratio); - - /* We won't make any further enlargement attempts */ - state->growmemtuples = false; - } - - /* Must enlarge array by at least one element, else report failure */ - if (newmemtupsize <= memtupsize) - goto noalloc; - - /* - * On a 64-bit machine, allowedMem could be more than MaxAllocSize. Clamp - * to ensure our request won't be rejected by palloc. - */ - if ((Size) newmemtupsize >= MaxAllocSize / sizeof(SortTuple)) - { - newmemtupsize = (int) (MaxAllocSize / sizeof(SortTuple)); - state->growmemtuples = false; /* can't grow any more */ - } - - /* - * We need to be sure that we do not cause LACKMEM to become true, else - * the space management algorithm will go nuts. The code above should - * never generate a dangerous request, but to be safe, check explicitly - * that the array growth fits within availMem. (We could still cause - * LACKMEM if the memory chunk overhead associated with the memtuples - * array were to increase. That shouldn't happen because we chose the - * initial array size large enough to ensure that palloc will be treating - * both old and new arrays as separate chunks. But we'll check LACKMEM - * explicitly below just in case.) - */ - if (state->availMem < (long) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) - goto noalloc; - - /* OK, do it */ - FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); - state->memtupsize = newmemtupsize; - state->memtuples = (SortTuple *) - repalloc(state->memtuples, - state->memtupsize * sizeof(SortTuple)); - USEMEM(state, GetMemoryChunkSpace(state->memtuples)); - if (LACKMEM(state)) - elog(ERROR, "unexpected out-of-memory situation in tuplesort"); - return true; - -noalloc: - /* If for any reason we didn't realloc, shut off future attempts */ - state->growmemtuples = false; - return false; -} - -/* - * Accept one tuple while collecting input data for sort. - * - * Note that the input data is always copied; the caller need not save it. - */ -void -rum_tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) slot); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Accept one tuple while collecting input data for sort. - * - * Note that the input data is always copied; the caller need not save it. - */ -void -rum_tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) tup); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Accept one index tuple while collecting input data for sort. - * - * Note that the input tuple is always copied; the caller need not save it. - */ -void -rum_tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. 
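grow_memtuples above doubles the array while no more than half the budget is consumed, then permits one final proportional enlargement sized by allowedMem / memNowUsed before giving up. A compact sketch of just that policy (allocator overhead and the MaxAllocSize clamp omitted; numbers invented):

    #include <stdio.h>
    #include <stdbool.h>

    /* One step of the growth policy: double while cheap, else one last
     * extrapolated bump sized from consumption so far, then stop growing. */
    static int grow_step(int size, long used, long avail, long allowed,
                         bool *can_grow)
    {
        if (!*can_grow)
            return size;
        if (used <= avail)       /* used no more than half of allowed */
            return size * 2;
        *can_grow = false;       /* final, proportional increment */
        return (int) (size * ((double) allowed / (double) used));
    }

    int main(void)
    {
        bool can_grow = true;
        long allowed = 1024 * 1024;
        long used = 700 * 1024L; /* past the halfway point: no more doubling */
        int newsize = grow_step(1024, used, allowed - used, allowed, &can_grow);

        printf("new size %d, can_grow=%d\n", newsize, can_grow);
        return 0;
    }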
- */ - COPYTUP(state, &stup, (void *) tuple); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Accept one Datum while collecting input data for sort. - * - * If the Datum is pass-by-ref type, the value will be copied. - */ -void -rum_tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * If it's a pass-by-reference value, copy it into memory we control, and - * decrease availMem. Then call the common code. - */ - if (isNull || state->datumTypeByVal) - { - stup.datum1 = val; - stup.isnull1 = isNull; - stup.tuple = NULL; /* no separate storage */ - } - else - { - stup.datum1 = datumCopy(val, false, state->datumTypeLen); - stup.isnull1 = false; - stup.tuple = DatumGetPointer(stup.datum1); - USEMEM(state, GetMemoryChunkSpace(stup.tuple)); - } - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -void -rum_tuplesort_putrum(Tuplesortstate *state, RumSortItem * item) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) item); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -void -rum_tuplesort_putrumkey(Tuplesortstate *state, RumKey * item) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) item); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Shared code for tuple and datum cases. - */ -static void -puttuple_common(Tuplesortstate *state, SortTuple *tuple) -{ - switch (state->status) - { - case TSS_INITIAL: - - /* - * Save the tuple into the unsorted array. First, grow the array - * as needed. Note that we try to grow the array when there is - * still one free slot remaining --- if we fail, there'll still be - * room to store the incoming tuple, and then we'll switch to - * tape-based operation. - */ - if (state->memtupcount >= state->memtupsize - 1) - { - (void) grow_memtuples(state); - Assert(state->memtupcount < state->memtupsize); - } - state->memtuples[state->memtupcount++] = *tuple; - - /* - * Check if it's time to switch over to a bounded heapsort. We do - * so if the input tuple count exceeds twice the desired tuple - * count (this is a heuristic for where heapsort becomes cheaper - * than a quicksort), or if we've just filled workMem and have - * enough tuples to meet the bound. - * - * Note that once we enter TSS_BOUNDED state we will always try to - * complete the sort that way. In the worst case, if later input - * tuples are larger than earlier ones, this might cause us to - * exceed workMem significantly. - */ - if (state->bounded && - (state->memtupcount > state->bound * 2 || - (state->memtupcount > state->bound && LACKMEM(state)))) - { -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "switching to bounded heapsort at %d tuples: %s", - state->memtupcount, - pg_rusage_show(&state->ru_start)); -#endif - make_bounded_heap(state); - return; - } - - /* - * Done if we still fit in available memory and have array slots. - */ - if (state->memtupcount < state->memtupsize && !LACKMEM(state)) - return; - - /* - * Nope; time to switch to tape-based operation. 
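The TSS_INITIAL branch above switches to a bounded heapsort once the input exceeds twice the requested bound, or exceeds the bound while memory is exhausted. The heuristic in isolation, with LACKMEM reduced to a boolean parameter (a sketch, not the real predicate's environment):

    #include <stdio.h>
    #include <stdbool.h>

    /* Heuristic from puttuple_common: heapsort beats quicksort once input
     * is > 2x the bound, or we are past the bound and out of memory. */
    static bool should_switch_to_bounded_heap(int memtupcount, int bound,
                                              bool lackmem)
    {
        return memtupcount > bound * 2 ||
               (memtupcount > bound && lackmem);
    }

    int main(void)
    {
        printf("%d\n", should_switch_to_bounded_heap(201, 100, false)); /* 1 */
        printf("%d\n", should_switch_to_bounded_heap(150, 100, true));  /* 1 */
        printf("%d\n", should_switch_to_bounded_heap(150, 100, false)); /* 0 */
        return 0;
    }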
- */ - inittapes(state); - - /* - * Dump tuples until we are back under the limit. - */ - dumptuples(state, false); - break; - - case TSS_BOUNDED: - - /* - * We don't want to grow the array here, so check whether the new - * tuple can be discarded before putting it in. This should be a - * good speed optimization, too, since when there are many more - * input tuples than the bound, most input tuples can be discarded - * with just this one comparison. Note that because we currently - * have the sort direction reversed, we must check for <= not >=. - */ - if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) - { - /* new tuple <= top of the heap, so we can discard it */ - free_sort_tuple(state, tuple); - CHECK_FOR_INTERRUPTS(); - } - else - { - /* discard top of heap, sift up, insert new tuple */ - free_sort_tuple(state, &state->memtuples[0]); - rum_tuplesort_heap_siftup(state, false); - rum_tuplesort_heap_insert(state, tuple, 0, false); - } - break; - - case TSS_BUILDRUNS: - - /* - * Insert the tuple into the heap, with run number currentRun if - * it can go into the current run, else run number currentRun+1. - * The tuple can go into the current run if it is >= the first - * not-yet-output tuple. (Actually, it could go into the current - * run if it is >= the most recently output tuple ... but that - * would require keeping around the tuple we last output, and it's - * simplest to let writetup free each tuple as soon as it's - * written.) - * - * Note there will always be at least one tuple in the heap at - * this point; see dumptuples. - */ - Assert(state->memtupcount > 0); - if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) - rum_tuplesort_heap_insert(state, tuple, state->currentRun, true); - else - rum_tuplesort_heap_insert(state, tuple, state->currentRun + 1, true); - - /* - * If we are over the memory limit, dump tuples till we're under. - */ - dumptuples(state, false); - break; - - default: - elog(ERROR, "invalid tuplesort state"); - break; - } -} - -/* - * All tuples have been provided; finish the sort. - */ -void -rum_tuplesort_performsort(Tuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "performsort starting: %s", - pg_rusage_show(&state->ru_start)); -#endif - - switch (state->status) - { - case TSS_INITIAL: - - /* - * We were able to accumulate all the tuples within the allowed - * amount of memory. Just qsort 'em and we're done. - */ - if (state->memtupcount > 1) - { - /* Can we use the single-key sort function? */ - if (state->onlyKey != NULL) - qsort_ssup(state->memtuples, state->memtupcount, - state->onlyKey); - else - qsort_tuple(state->memtuples, - state->memtupcount, - state->comparetup, - state); - } - state->current = 0; - state->eof_reached = false; - state->markpos_offset = 0; - state->markpos_eof = false; - state->status = TSS_SORTEDINMEM; - break; - - case TSS_BOUNDED: - - /* - * We were able to accumulate all the tuples required for output - * in memory, using a heap to eliminate excess tuples. Now we - * have to transform the heap to a properly-sorted array. - */ - sort_bounded_heap(state); - state->current = 0; - state->eof_reached = false; - state->markpos_offset = 0; - state->markpos_eof = false; - state->status = TSS_SORTEDINMEM; - break; - - case TSS_BUILDRUNS: - - /* - * Finish tape-based sort. First, flush all tuples remaining in - * memory out to tape; then merge until we have a single remaining - * run (or, if !randomAccess, one run per tape). 
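The TSS_BUILDRUNS branch above is replacement selection: an incoming tuple may extend the current run only if it does not sort before the heap top (the smallest not-yet-written value); otherwise it is tagged for the next run. The assignment rule sketched with plain ints:

    #include <stdio.h>

    /* Replacement selection, reduced to the run-assignment rule: a new
     * value extends the current run only if it is >= the smallest
     * not-yet-output value (the heap top). */
    static int assign_run(int newval, int heap_top, int current_run)
    {
        return newval >= heap_top ? current_run : current_run + 1;
    }

    int main(void)
    {
        int current_run = 0, heap_top = 40;

        printf("run %d\n", assign_run(55, heap_top, current_run)); /* run 0 */
        printf("run %d\n", assign_run(12, heap_top, current_run)); /* run 1 */
        return 0;
    }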
Note that - * mergeruns sets the correct state->status. - */ - dumptuples(state, true); - mergeruns(state); - state->eof_reached = false; - state->markpos_block = 0L; - state->markpos_offset = 0; - state->markpos_eof = false; - break; - - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - -#ifdef TRACE_SORT - if (trace_sort) - { - if (state->status == TSS_FINALMERGE) - elog(LOG, "performsort done (except %d-way final merge): %s", - state->activeTapes, - pg_rusage_show(&state->ru_start)); - else - elog(LOG, "performsort done: %s", - pg_rusage_show(&state->ru_start)); - } -#endif - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Internal routine to fetch the next tuple in either forward or back - * direction into *stup. Returns FALSE if no more tuples. - * If *should_free is set, the caller must pfree stup.tuple when done with it. - */ -static bool -rum_tuplesort_gettuple_common(Tuplesortstate *state, bool forward, - SortTuple *stup, bool *should_free) -{ - unsigned int tuplen; - - switch (state->status) - { - case TSS_SORTEDINMEM: - Assert(forward || state->randomAccess); - *should_free = false; - if (forward) - { - if (state->current < state->memtupcount) - { - *stup = state->memtuples[state->current++]; - return true; - } - state->eof_reached = true; - - /* - * Complain if caller tries to retrieve more tuples than - * originally asked for in a bounded sort. This is because - * returning EOF here might be the wrong thing. - */ - if (state->bounded && state->current >= state->bound) - elog(ERROR, "retrieved too many tuples in a bounded sort"); - - return false; - } - else - { - if (state->current <= 0) - return false; - - /* - * if all tuples are fetched already then we return last - * tuple, else - tuple before last returned. - */ - if (state->eof_reached) - state->eof_reached = false; - else - { - state->current--; /* last returned tuple */ - if (state->current <= 0) - return false; - } - *stup = state->memtuples[state->current - 1]; - return true; - } - break; - - case TSS_SORTEDONTAPE: - Assert(forward || state->randomAccess); - *should_free = true; - if (forward) - { - if (state->eof_reached) - return false; - if ((tuplen = getlen(state, state->result_tape, true)) != 0) - { - READTUP(state, stup, state->result_tape, tuplen); - return true; - } - else - { - state->eof_reached = true; - return false; - } - } - - /* - * Backward. - * - * if all tuples are fetched already then we return last tuple, - * else - tuple before last returned. - */ - if (state->eof_reached) - { - /* - * Seek position is pointing just past the zero tuplen at the - * end of file; back up to fetch last tuple's ending length - * word. If seek fails we must have a completely empty file. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - 2 * sizeof(unsigned int))) - return false; - state->eof_reached = false; - } - else - { - /* - * Back up and fetch previously-returned tuple's ending length - * word. If seek fails, assume we are at start of file. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - sizeof(unsigned int))) - return false; - tuplen = getlen(state, state->result_tape, false); - - /* - * Back up to get ending length word of tuple before it. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - tuplen + 2 * sizeof(unsigned int))) - { - /* - * If that fails, presumably the prev tuple is the first - * in the file. 
Back up so that it becomes next to read - * in forward direction (not obviously right, but that is - * what in-memory case does). - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - tuplen + sizeof(unsigned int))) - elog(ERROR, "bogus tuple length in backward scan"); - return false; - } - } - - tuplen = getlen(state, state->result_tape, false); - - /* - * Now we have the length of the prior tuple, back up and read it. - * Note: READTUP expects we are positioned after the initial - * length word of the tuple, so back up to that point. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - tuplen)) - elog(ERROR, "bogus tuple length in backward scan"); - READTUP(state, stup, state->result_tape, tuplen); - return true; - - case TSS_FINALMERGE: - Assert(forward); - *should_free = true; - - /* - * This code should match the inner loop of mergeonerun(). - */ - if (state->memtupcount > 0) - { - int srcTape = state->memtuples[0].tupindex; - Size tuplen; - int tupIndex; - SortTuple *newtup; - - *stup = state->memtuples[0]; - /* returned tuple is no longer counted in our memory space */ - if (stup->tuple) - { - tuplen = GetMemoryChunkSpace(stup->tuple); - state->availMem += tuplen; - state->mergeavailmem[srcTape] += tuplen; - } - rum_tuplesort_heap_siftup(state, false); - if ((tupIndex = state->mergenext[srcTape]) == 0) - { - /* - * out of preloaded data on this tape, try to read more - * - * Unlike mergeonerun(), we only preload from the single - * tape that's run dry. See mergepreread() comments. - */ - mergeprereadone(state, srcTape); - - /* - * if still no data, we've reached end of run on this tape - */ - if ((tupIndex = state->mergenext[srcTape]) == 0) - return true; - } - /* pull next preread tuple from list, insert in heap */ - newtup = &state->memtuples[tupIndex]; - state->mergenext[srcTape] = newtup->tupindex; - if (state->mergenext[srcTape] == 0) - state->mergelast[srcTape] = 0; - rum_tuplesort_heap_insert(state, newtup, srcTape, false); - /* put the now-unused memtuples entry on the freelist */ - newtup->tupindex = state->mergefreelist; - state->mergefreelist = tupIndex; - state->mergeavailslots[srcTape]++; - return true; - } - return false; - - default: - elog(ERROR, "invalid tuplesort state"); - return false; /* keep compiler quiet */ - } -} - -/* - * Fetch the next tuple in either forward or back direction. - * If successful, put tuple in slot and return TRUE; else, clear the slot - * and return FALSE. - */ -bool -rum_tuplesort_gettupleslot(Tuplesortstate *state, bool forward, - TupleTableSlot *slot) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - bool should_free; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, &should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - if (stup.tuple) - { - ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, should_free); - return true; - } - else - { - ExecClearTuple(slot); - return false; - } -} - -/* - * Fetch the next tuple in either forward or back direction. - * Returns NULL if no more tuples. If *should_free is set, the - * caller must pfree the returned tuple when done with it. 
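The backward-scan logic above relies on each tuple being written to tape as [length][data][length], so a reader can back up over whole records. A file-based sketch of that framing, with stdio standing in for logtape.c:

    #include <stdio.h>
    #include <string.h>

    /* Write one record framed as [len][payload][len], the layout that
     * makes reading the tape backwards possible. */
    static void write_framed(FILE *f, const char *payload)
    {
        unsigned int len = (unsigned int) strlen(payload);

        fwrite(&len, sizeof(len), 1, f);
        fwrite(payload, 1, len, f);
        fwrite(&len, sizeof(len), 1, f);
    }

    int main(void)
    {
        FILE *f = tmpfile();
        unsigned int len = 0;
        char buf[64];

        if (!f)
            return 1;
        write_framed(f, "alpha");
        write_framed(f, "bravo");

        /* Backward read: back up over the trailing length word of the last
         * record, learn its size, then back up over the payload and read it. */
        fseek(f, -(long) sizeof(len), SEEK_END);
        fread(&len, sizeof(len), 1, f);
        fseek(f, -(long) (sizeof(len) + len), SEEK_CUR);
        fread(buf, 1, len, f);
        buf[len] = '\0';
        printf("last record: %s\n", buf);   /* prints "bravo" */
        fclose(f);
        return 0;
    }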
- */ -HeapTuple -rum_tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return stup.tuple; -} - -/* - * Fetch the next index tuple in either forward or back direction. - * Returns NULL if no more tuples. If *should_free is set, the - * caller must pfree the returned tuple when done with it. - */ -IndexTuple -rum_tuplesort_getindextuple(Tuplesortstate *state, bool forward, - bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return (IndexTuple) stup.tuple; -} - -/* - * Fetch the next Datum in either forward or back direction. - * Returns FALSE if no more datums. - * - * If the Datum is pass-by-ref type, the returned value is freshly palloc'd - * and is now owned by the caller. - */ -bool -rum_tuplesort_getdatum(Tuplesortstate *state, bool forward, - Datum *val, bool *isNull) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - bool should_free; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, &should_free)) - { - MemoryContextSwitchTo(oldcontext); - return false; - } - - if (stup.isnull1 || state->datumTypeByVal) - { - *val = stup.datum1; - *isNull = stup.isnull1; - } - else - { - if (should_free) - *val = stup.datum1; - else - *val = datumCopy(stup.datum1, false, state->datumTypeLen); - *isNull = false; - } - - MemoryContextSwitchTo(oldcontext); - - return true; -} - -RumSortItem * -rum_tuplesort_getrum(Tuplesortstate *state, bool forward, bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return (RumSortItem *) stup.tuple; -} - -RumKey * -rum_tuplesort_getrumkey(Tuplesortstate *state, bool forward, bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return (RumKey *) stup.tuple; -} - -/* - * rum_tuplesort_merge_order - report merge order we'll use for given memory - * (note: "merge order" just means the number of input tapes in the merge). - * - * This is exported for use by the planner. allowedMem is in bytes. - */ -int -rum_tuplesort_merge_order(long allowedMem) -{ - int mOrder; - - /* - * We need one tape for each merge input, plus another one for the output, - * and each of these tapes needs buffer space. In addition we want - * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't - * count). - * - * Note: you might be thinking we need to account for the memtuples[] - * array in this calculation, but we effectively treat that as part of the - * MERGE_BUFFER_SIZE workspace. - */ - mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / - (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); - - /* Even in minimum memory, use at least a MINORDER merge */ - mOrder = Max(mOrder, MINORDER); - - return mOrder; -} - -/* - * inittapes - initialize for tape sorting. 
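rum_tuplesort_merge_order above charges the budget one buffered output tape, then as many buffered input tapes as fit. A worked sketch; the constants are assumed here to match contemporaneous PostgreSQL sources (8 kB blocks, 3-block tape overhead, 32-block merge buffer per input tape, MINORDER = 6) and are not spelled out in this hunk:

    #include <stdio.h>

    #define BLCKSZ 8192
    #define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)   /* assumed value */
    #define MERGE_BUFFER_SIZE    (BLCKSZ * 32)  /* assumed value */
    #define MINORDER 6                          /* assumed value */

    /* Same shape as rum_tuplesort_merge_order: one buffered output tape,
     * then as many buffered input tapes as the remaining budget allows. */
    static int merge_order(long allowedMem)
    {
        int m = (int) ((allowedMem - TAPE_BUFFER_OVERHEAD) /
                       (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD));

        return m > MINORDER ? m : MINORDER;
    }

    int main(void)
    {
        /* 4 MB budget: (4194304 - 24576) / 286720 = 14 input tapes */
        printf("merge order at 4 MB: %d\n", merge_order(4L * 1024 * 1024));
        /* tiny budget: formula yields 0, clamped up to MINORDER */
        printf("merge order at 64 kB: %d\n", merge_order(64L * 1024));
        return 0;
    }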
- * - * This is called only if we have found we don't have room to sort in memory. - */ -static void -inittapes(Tuplesortstate *state) -{ - int maxTapes, - ntuples, - j; - long tapeSpace; - - /* Compute number of tapes to use: merge order plus 1 */ - maxTapes = rum_tuplesort_merge_order(state->allowedMem) + 1; - - /* - * We must have at least 2*maxTapes slots in the memtuples[] array, else - * we'd not have room for merge heap plus preread. It seems unlikely that - * this case would ever occur, but be safe. - */ - maxTapes = Min(maxTapes, state->memtupsize / 2); - - state->maxTapes = maxTapes; - state->tapeRange = maxTapes - 1; - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "switching to external sort with %d tapes: %s", - maxTapes, pg_rusage_show(&state->ru_start)); -#endif - - /* - * Decrease availMem to reflect the space needed for tape buffers; but - * don't decrease it to the point that we have no room for tuples. (That - * case is only likely to occur if sorting pass-by-value Datums; in all - * other scenarios the memtuples[] array is unlikely to occupy more than - * half of allowedMem. In the pass-by-value case it's not important to - * account for tuple space, so we don't care if LACKMEM becomes - * inaccurate.) - */ - tapeSpace = (long) maxTapes *TAPE_BUFFER_OVERHEAD; - - if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) - USEMEM(state, tapeSpace); - - /* - * Make sure that the temp file(s) underlying the tape set are created in - * suitable temp tablespaces. - */ - PrepareTempTablespaces(); - - /* - * Create the tape set and allocate the per-tape data arrays. - */ - state->tapeset = LogicalTapeSetCreate(maxTapes); - - state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); - state->mergenext = (int *) palloc0(maxTapes * sizeof(int)); - state->mergelast = (int *) palloc0(maxTapes * sizeof(int)); - state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int)); - state->mergeavailmem = (long *) palloc0(maxTapes * sizeof(long)); - state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); - state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); - state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); - state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); - - /* - * Convert the unsorted contents of memtuples[] into a heap. Each tuple is - * marked as belonging to run number zero. - * - * NOTE: we pass false for checkIndex since there's no point in comparing - * indexes in this step, even though we do intend the indexes to be part - * of the sort key... - */ - ntuples = state->memtupcount; - state->memtupcount = 0; /* make the heap empty */ - for (j = 0; j < ntuples; j++) - { - /* Must copy source tuple to avoid possible overwrite */ - SortTuple stup = state->memtuples[j]; - - rum_tuplesort_heap_insert(state, &stup, 0, false); - } - Assert(state->memtupcount == ntuples); - - state->currentRun = 0; - - /* - * Initialize variables of Algorithm D (step D1). - */ - for (j = 0; j < maxTapes; j++) - { - state->tp_fib[j] = 1; - state->tp_runs[j] = 0; - state->tp_dummy[j] = 1; - state->tp_tapenum[j] = j; - } - state->tp_fib[state->tapeRange] = 0; - state->tp_dummy[state->tapeRange] = 0; - - state->Level = 1; - state->destTape = 0; - - state->status = TSS_BUILDRUNS; -} - -/* - * selectnewtape -- select new tape for new initial run. - * - * This is called after finishing a run when we know another run - * must be started. This implements steps D3, D4 of Algorithm D. 
- */ -static void -selectnewtape(Tuplesortstate *state) -{ - int j; - int a; - - /* Step D3: advance j (destTape) */ - if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) - { - state->destTape++; - return; - } - if (state->tp_dummy[state->destTape] != 0) - { - state->destTape = 0; - return; - } - - /* Step D4: increase level */ - state->Level++; - a = state->tp_fib[0]; - for (j = 0; j < state->tapeRange; j++) - { - state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; - state->tp_fib[j] = a + state->tp_fib[j + 1]; - } - state->destTape = 0; -} - -/* - * mergeruns -- merge all the completed initial runs. - * - * This implements steps D5, D6 of Algorithm D. All input data has - * already been written to initial runs on tape (see dumptuples). - */ -static void -mergeruns(Tuplesortstate *state) -{ - int tapenum, - svTape, - svRuns, - svDummy; - int numTapes; - int numInputTapes; - - Assert(state->status == TSS_BUILDRUNS); - Assert(state->memtupcount == 0); - - /* - * If we produced only one initial run (quite likely if the total data - * volume is between 1X and 2X workMem), we can just use that tape as the - * finished output, rather than doing a useless merge. (This obvious - * optimization is not in Knuth's algorithm.) - */ - if (state->currentRun == 1) - { - state->result_tape = state->tp_tapenum[state->destTape]; - /* must freeze and rewind the finished output tape */ - LogicalTapeFreeze(state->tapeset, state->result_tape); - state->status = TSS_SORTEDONTAPE; - return; - } - - /* - * If we had fewer runs than tapes, refund the memory that we imagined we - * would need for the tape buffers of the unused tapes. - * - * numTapes and numInputTapes reflect the actual number of tapes we will - * use. Note that the output tape's tape number is maxTapes - 1, so the - * tape numbers of the used tapes are not consecutive, and you cannot just - * loop from 0 to numTapes to visit all used tapes! - */ - if (state->Level == 1) - { - numInputTapes = state->currentRun; - numTapes = numInputTapes + 1; - FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); - } - else - { - numInputTapes = state->tapeRange; - numTapes = state->maxTapes; - } - - state->read_buffer_size = Max(state->availMem / numInputTapes, 0); - USEMEM(state, state->read_buffer_size * numInputTapes); - - /* End of step D2: rewind all output tapes to prepare for merging */ - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); - - for (;;) - { - /* - * At this point we know that tape[T] is empty. If there's just one - * (real or dummy) run left on each input tape, then only one merge - * pass remains. If we don't have to produce a materialized sorted - * tape, we can stop at this point and do the final merge on-the-fly. 
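Step D4 in selectnewtape above advances the per-tape target run counts along generalized Fibonacci numbers. A sketch that replays the update for three input tapes and prints the targets per level (the run totals for this tape count go 3, 5, 9, 17):

    #include <stdio.h>

    #define TAPERANGE 3 /* illustrative: 3 input tapes plus 1 output tape */

    int main(void)
    {
        int fib[TAPERANGE + 1] = {1, 1, 1, 0}; /* step D1 initialization */
        int level, j;

        for (level = 1; level <= 4; level++)
        {
            printf("level %d targets:", level);
            for (j = 0; j < TAPERANGE; j++)
                printf(" %d", fib[j]);
            printf("\n");

            /* Step D4 update: each target becomes a + fib[j+1], where a is
             * the old first target: the generalized Fibonacci recurrence. */
            {
                int a = fib[0];

                for (j = 0; j < TAPERANGE; j++)
                    fib[j] = a + fib[j + 1];
            }
        }
        return 0;
    }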
- */ - if (!state->randomAccess) - { - bool allOneRun = true; - - Assert(state->tp_runs[state->tapeRange] == 0); - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - { - if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) - { - allOneRun = false; - break; - } - } - if (allOneRun) - { - /* Tell logtape.c we won't be writing anymore */ - LogicalTapeSetForgetFreeSpace(state->tapeset); - /* Initialize for the final merge pass */ - beginmerge(state); - state->status = TSS_FINALMERGE; - return; - } - } - - /* Step D5: merge runs onto tape[T] until tape[P] is empty */ - while (state->tp_runs[state->tapeRange - 1] || - state->tp_dummy[state->tapeRange - 1]) - { - bool allDummy = true; - - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - { - if (state->tp_dummy[tapenum] == 0) - { - allDummy = false; - break; - } - } - - if (allDummy) - { - state->tp_dummy[state->tapeRange]++; - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - state->tp_dummy[tapenum]--; - } - else - mergeonerun(state); - } - - /* Step D6: decrease level */ - if (--state->Level == 0) - break; - /* rewind output tape T to use as new input */ - LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], - state->read_buffer_size); - /* rewind used-up input tape P, and prepare it for write pass */ - LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); - state->tp_runs[state->tapeRange - 1] = 0; - - /* - * reassign tape units per step D6; note we no longer care about A[] - */ - svTape = state->tp_tapenum[state->tapeRange]; - svDummy = state->tp_dummy[state->tapeRange]; - svRuns = state->tp_runs[state->tapeRange]; - for (tapenum = state->tapeRange; tapenum > 0; tapenum--) - { - state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; - state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; - state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; - } - state->tp_tapenum[0] = svTape; - state->tp_dummy[0] = svDummy; - state->tp_runs[0] = svRuns; - } - - /* - * Done. Knuth says that the result is on TAPE[1], but since we exited - * the loop without performing the last iteration of step D6, we have not - * rearranged the tape unit assignment, and therefore the result is on - * TAPE[T]. We need to do it this way so that we can freeze the final - * output tape while rewinding it. The last iteration of step D6 would be - * a waste of cycles anyway... - */ - state->result_tape = state->tp_tapenum[state->tapeRange]; - LogicalTapeFreeze(state->tapeset, state->result_tape); - state->status = TSS_SORTEDONTAPE; -} - -/* - * Merge one run from each input tape, except ones with dummy runs. - * - * This is the inner loop of Algorithm D step D5. We know that the - * output tape is TAPE[T]. - */ -static void -mergeonerun(Tuplesortstate *state) -{ - int destTape = state->tp_tapenum[state->tapeRange]; - int srcTape; - int tupIndex; - SortTuple *tup; - long priorAvail, - spaceFreed; - - /* - * Start the merge by loading one tuple from each active source tape into - * the heap. We can also decrease the input run/dummy run counts. - */ - beginmerge(state); - - /* - * Execute merge by repeatedly extracting lowest tuple in heap, writing it - * out, and replacing it with next tuple from same tape (if there is - * another one). 
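The merge loop just described pops the smallest element, writes it out, and refills from the same source. A toy three-way merge over in-memory runs showing that pop-and-refill shape; the real code keeps a binary heap keyed by source tape rather than scanning linearly:

    #include <stdio.h>

    int main(void)
    {
        /* three pre-sorted "runs", as if read back from three tapes */
        int runs[3][4] = {{1, 5, 9, 12}, {2, 6, 10, 13}, {3, 4, 11, 14}};
        int pos[3] = {0, 0, 0};
        int n = 3 * 4, written;

        for (written = 0; written < n; written++)
        {
            int best = -1, t;

            /* pick the source whose current element is smallest */
            for (t = 0; t < 3; t++)
                if (pos[t] < 4 &&
                    (best < 0 || runs[t][pos[t]] < runs[best][pos[best]]))
                    best = t;

            /* "write" it to the output, then advance that source */
            printf("%d ", runs[best][pos[best]++]);
        }
        printf("\n"); /* prints all twelve values in sorted order */
        return 0;
    }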
- */ - while (state->memtupcount > 0) - { - /* write the tuple to destTape */ - priorAvail = state->availMem; - srcTape = state->memtuples[0].tupindex; - WRITETUP(state, destTape, &state->memtuples[0]); - /* writetup adjusted total free space, now fix per-tape space */ - spaceFreed = state->availMem - priorAvail; - state->mergeavailmem[srcTape] += spaceFreed; - /* compact the heap */ - rum_tuplesort_heap_siftup(state, false); - if ((tupIndex = state->mergenext[srcTape]) == 0) - { - /* out of preloaded data on this tape, try to read more */ - mergepreread(state); - /* if still no data, we've reached end of run on this tape */ - if ((tupIndex = state->mergenext[srcTape]) == 0) - continue; - } - /* pull next preread tuple from list, insert in heap */ - tup = &state->memtuples[tupIndex]; - state->mergenext[srcTape] = tup->tupindex; - if (state->mergenext[srcTape] == 0) - state->mergelast[srcTape] = 0; - rum_tuplesort_heap_insert(state, tup, srcTape, false); - /* put the now-unused memtuples entry on the freelist */ - tup->tupindex = state->mergefreelist; - state->mergefreelist = tupIndex; - state->mergeavailslots[srcTape]++; - } - - /* - * When the heap empties, we're done. Write an end-of-run marker on the - * output tape, and increment its count of real runs. - */ - markrunend(state, destTape); - state->tp_runs[state->tapeRange]++; - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "finished %d-way merge step: %s", state->activeTapes, - pg_rusage_show(&state->ru_start)); -#endif -} - -/* - * beginmerge - initialize for a merge pass - * - * We decrease the counts of real and dummy runs for each tape, and mark - * which tapes contain active input runs in mergeactive[]. Then, load - * as many tuples as we can from each active input tape, and finally - * fill the merge heap with the first tuple from each active tape. - */ -static void -beginmerge(Tuplesortstate *state) -{ - int activeTapes; - int tapenum; - int srcTape; - int slotsPerTape; - long spacePerTape; - - /* Heap should be empty here */ - Assert(state->memtupcount == 0); - - /* Adjust run counts and mark the active tapes */ - memset(state->mergeactive, 0, - state->maxTapes * sizeof(*state->mergeactive)); - activeTapes = 0; - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - { - if (state->tp_dummy[tapenum] > 0) - state->tp_dummy[tapenum]--; - else - { - Assert(state->tp_runs[tapenum] > 0); - state->tp_runs[tapenum]--; - srcTape = state->tp_tapenum[tapenum]; - state->mergeactive[srcTape] = true; - activeTapes++; - } - } - state->activeTapes = activeTapes; - - /* Clear merge-pass state variables */ - memset(state->mergenext, 0, - state->maxTapes * sizeof(*state->mergenext)); - memset(state->mergelast, 0, - state->maxTapes * sizeof(*state->mergelast)); - state->mergefreelist = 0; /* nothing in the freelist */ - state->mergefirstfree = activeTapes; /* 1st slot avail for preread */ - - /* - * Initialize space allocation to let each active input tape have an equal - * share of preread space. 
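beginmerge, continuing below, gives each active input tape an equal share of the remaining memtuples slots and of availMem for preread. The division by itself, with invented numbers:

    #include <stdio.h>

    int main(void)
    {
        int  memtupsize = 1024, mergefirstfree = 5, activeTapes = 5;
        long availMem = 2L * 1024 * 1024;

        /* Equal shares per active input tape, as beginmerge computes them. */
        int  slotsPerTape = (memtupsize - mergefirstfree) / activeTapes;
        long spacePerTape = availMem / activeTapes;

        printf("%d slots and %ld bytes of preread budget per tape\n",
               slotsPerTape, spacePerTape);
        return 0;
    }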
- */ - Assert(activeTapes > 0); - slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes; - Assert(slotsPerTape > 0); - spacePerTape = state->availMem / activeTapes; - for (srcTape = 0; srcTape < state->maxTapes; srcTape++) - { - if (state->mergeactive[srcTape]) - { - state->mergeavailslots[srcTape] = slotsPerTape; - state->mergeavailmem[srcTape] = spacePerTape; - } - } - - /* - * Preread as many tuples as possible (and at least one) from each active - * tape - */ - mergepreread(state); - - /* Load the merge heap with the first tuple from each input tape */ - for (srcTape = 0; srcTape < state->maxTapes; srcTape++) - { - int tupIndex = state->mergenext[srcTape]; - SortTuple *tup; - - if (tupIndex) - { - tup = &state->memtuples[tupIndex]; - state->mergenext[srcTape] = tup->tupindex; - if (state->mergenext[srcTape] == 0) - state->mergelast[srcTape] = 0; - rum_tuplesort_heap_insert(state, tup, srcTape, false); - /* put the now-unused memtuples entry on the freelist */ - tup->tupindex = state->mergefreelist; - state->mergefreelist = tupIndex; - state->mergeavailslots[srcTape]++; - } - } -} - -/* - * mergepreread - load tuples from merge input tapes - * - * This routine exists to improve sequentiality of reads during a merge pass, - * as explained in the header comments of this file. Load tuples from each - * active source tape until the tape's run is exhausted or it has used up - * its fair share of available memory. In any case, we guarantee that there - * is at least one preread tuple available from each unexhausted input tape. - * - * We invoke this routine at the start of a merge pass for initial load, - * and then whenever any tape's preread data runs out. Note that we load - * as much data as possible from all tapes, not just the one that ran out. - * This is because logtape.c works best with a usage pattern that alternates - * between reading a lot of data and writing a lot of data, so whenever we - * are forced to read, we should fill working memory completely. - * - * In FINALMERGE state, we *don't* use this routine, but instead just preread - * from the single tape that ran dry. There's no read/write alternation in - * that state and so no point in scanning through all the tapes to fix one. - * (Moreover, there may be quite a lot of inactive tapes in that state, since - * we might have had many fewer runs than tapes. In a regular tape-to-tape - * merge we can expect most of the tapes to be active.) - */ -static void -mergepreread(Tuplesortstate *state) -{ - int srcTape; - - for (srcTape = 0; srcTape < state->maxTapes; srcTape++) - mergeprereadone(state, srcTape); -} - -/* - * mergeprereadone - load tuples from one merge input tape - * - * Read tuples from the specified tape until it has used up its free memory - * or array slots; but ensure that we have at least one tuple, if any are - * to be had. 
- */ -static void -mergeprereadone(Tuplesortstate *state, int srcTape) -{ - unsigned int tuplen; - SortTuple stup; - int tupIndex; - long priorAvail, - spaceUsed; - - if (!state->mergeactive[srcTape]) - return; /* tape's run is already exhausted */ - priorAvail = state->availMem; - state->availMem = state->mergeavailmem[srcTape]; - while ((state->mergeavailslots[srcTape] > 0 && !LACKMEM(state)) || - state->mergenext[srcTape] == 0) - { - /* read next tuple, if any */ - if ((tuplen = getlen(state, srcTape, true)) == 0) - { - state->mergeactive[srcTape] = false; - break; - } - READTUP(state, &stup, srcTape, tuplen); - /* find a free slot in memtuples[] for it */ - tupIndex = state->mergefreelist; - if (tupIndex) - state->mergefreelist = state->memtuples[tupIndex].tupindex; - else - { - tupIndex = state->mergefirstfree++; - Assert(tupIndex < state->memtupsize); - } - state->mergeavailslots[srcTape]--; - /* store tuple, append to list for its tape */ - stup.tupindex = 0; - state->memtuples[tupIndex] = stup; - if (state->mergelast[srcTape]) - state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex; - else - state->mergenext[srcTape] = tupIndex; - state->mergelast[srcTape] = tupIndex; - } - /* update per-tape and global availmem counts */ - spaceUsed = state->mergeavailmem[srcTape] - state->availMem; - state->mergeavailmem[srcTape] = state->availMem; - state->availMem = priorAvail - spaceUsed; -} - -/* - * dumptuples - remove tuples from heap and write to tape - * - * This is used during initial-run building, but not during merging. - * - * When alltuples = false, dump only enough tuples to get under the - * availMem limit (and leave at least one tuple in the heap in any case, - * since puttuple assumes it always has a tuple to compare to). We also - * insist there be at least one free slot in the memtuples[] array. - * - * When alltuples = true, dump everything currently in memory. - * (This case is only used at end of input data.) - * - * If we empty the heap, close out the current run and return (this should - * only happen at end of input data). If we see that the tuple run number - * at the top of the heap has changed, start a new run. - */ -static void -dumptuples(Tuplesortstate *state, bool alltuples) -{ - while (alltuples || - (LACKMEM(state) && state->memtupcount > 1) || - state->memtupcount >= state->memtupsize) - { - /* - * Dump the heap's frontmost entry, and sift up to remove it from the - * heap. - */ - Assert(state->memtupcount > 0); - WRITETUP(state, state->tp_tapenum[state->destTape], - &state->memtuples[0]); - rum_tuplesort_heap_siftup(state, true); - - /* - * If the heap is empty *or* top run number has changed, we've - * finished the current run. - */ - if (state->memtupcount == 0 || - state->currentRun != state->memtuples[0].tupindex) - { - markrunend(state, state->tp_tapenum[state->destTape]); - state->currentRun++; - state->tp_runs[state->destTape]++; - state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "finished writing%s run %d to tape %d: %s", - (state->memtupcount == 0) ? " final" : "", - state->currentRun, state->destTape, - pg_rusage_show(&state->ru_start)); -#endif - - /* - * Done if heap is empty, else prepare for new run. 
- */ - if (state->memtupcount == 0) - break; - Assert(state->currentRun == state->memtuples[0].tupindex); - selectnewtape(state); - } - } -} - -/* - * rum_tuplesort_rescan - rewind and replay the scan - */ -void -rum_tuplesort_rescan(Tuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - - Assert(state->randomAccess); - - switch (state->status) - { - case TSS_SORTEDINMEM: - state->current = 0; - state->eof_reached = false; - state->markpos_offset = 0; - state->markpos_eof = false; - break; - case TSS_SORTEDONTAPE: - LogicalTapeRewindForRead(state->tapeset, - state->result_tape, - state->read_buffer_size); - state->eof_reached = false; - state->markpos_block = 0L; - state->markpos_offset = 0; - state->markpos_eof = false; - break; - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - - MemoryContextSwitchTo(oldcontext); -} - -/* - * rum_tuplesort_markpos - saves current position in the merged sort file - */ -void -rum_tuplesort_markpos(Tuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - - Assert(state->randomAccess); - - switch (state->status) - { - case TSS_SORTEDINMEM: - state->markpos_offset = state->current; - state->markpos_eof = state->eof_reached; - break; - case TSS_SORTEDONTAPE: - LogicalTapeTell(state->tapeset, - state->result_tape, - &state->markpos_block, - &state->markpos_offset); - state->markpos_eof = state->eof_reached; - break; - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - - MemoryContextSwitchTo(oldcontext); -} - -/* - * rum_tuplesort_restorepos - restores current position in merged sort file to - * last saved position - */ -void -rum_tuplesort_restorepos(Tuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - - Assert(state->randomAccess); - - switch (state->status) - { - case TSS_SORTEDINMEM: - state->current = state->markpos_offset; - state->eof_reached = state->markpos_eof; - break; - case TSS_SORTEDONTAPE: -#if PG_VERSION_NUM < 100000 - if (!LogicalTapeSeek(state->tapeset, - state->result_tape, - state->markpos_block, - state->markpos_offset)) - elog(ERROR, "rum_tuplesort_restorepos failed"); -#else - LogicalTapeSeek(state->tapeset, - state->result_tape, - state->markpos_block, - state->markpos_offset); -#endif - state->eof_reached = state->markpos_eof; - break; - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - - MemoryContextSwitchTo(oldcontext); -} - -/* - * rum_tuplesort_get_stats - extract summary statistics - * - * This can be called after rum_tuplesort_performsort() finishes to obtain - * printable summary information about how the sort was performed. - * spaceUsed is measured in kilobytes. - */ -void -rum_tuplesort_get_stats(Tuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed) -{ - /* - * Note: it might seem we should provide both memory and disk usage for a - * disk-based sort. However, the current code doesn't track memory space - * accurately once we have begun to return tuples to the caller (since we - * don't account for pfree's the caller is expected to do), so we cannot - * rely on availMem in a disk sort. This does not seem worth the overhead - * to fix. Is it worth creating an API for the memory context code to - * tell us how much is actually used in sortcontext? 
- */ - if (state->tapeset) - { - *spaceType = "Disk"; - *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); - } - else - { - *spaceType = "Memory"; - *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; - } - - switch (state->status) - { - case TSS_SORTEDINMEM: - if (state->boundUsed) - *sortMethod = "top-N heapsort"; - else - *sortMethod = "quicksort"; - break; - case TSS_SORTEDONTAPE: - *sortMethod = "external sort"; - break; - case TSS_FINALMERGE: - *sortMethod = "external merge"; - break; - default: - *sortMethod = "still in progress"; - break; - } -} - - -/* - * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. - * - * Compare two SortTuples. If checkIndex is true, use the tuple index - * as the front of the sort key; otherwise, no. - */ - -#define HEAPCOMPARE(tup1,tup2) \ - (checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \ - ((tup1)->tupindex) - ((tup2)->tupindex) : \ - COMPARETUP(state, tup1, tup2)) - -/* - * Convert the existing unordered array of SortTuples to a bounded heap, - * discarding all but the smallest "state->bound" tuples. - * - * When working with a bounded heap, we want to keep the largest entry - * at the root (array entry zero), instead of the smallest as in the normal - * sort case. This allows us to discard the largest entry cheaply. - * Therefore, we temporarily reverse the sort direction. - * - * We assume that all entries in a bounded heap will always have tupindex - * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse - * the direction of comparison for tupindexes. - */ -static void -make_bounded_heap(Tuplesortstate *state) -{ - int tupcount = state->memtupcount; - int i; - - Assert(state->status == TSS_INITIAL); - Assert(state->bounded); - Assert(tupcount >= state->bound); - - /* Reverse sort direction so largest entry will be at root */ - REVERSEDIRECTION(state); - - state->memtupcount = 0; /* make the heap empty */ - for (i = 0; i < tupcount; i++) - { - if (state->memtupcount >= state->bound && - COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) - { - /* New tuple would just get thrown out, so skip it */ - free_sort_tuple(state, &state->memtuples[i]); - CHECK_FOR_INTERRUPTS(); - } - else - { - /* Insert next tuple into heap */ - /* Must copy source tuple to avoid possible overwrite */ - SortTuple stup = state->memtuples[i]; - - rum_tuplesort_heap_insert(state, &stup, 0, false); - - /* If heap too full, discard largest entry */ - if (state->memtupcount > state->bound) - { - free_sort_tuple(state, &state->memtuples[0]); - rum_tuplesort_heap_siftup(state, false); - } - } - } - - Assert(state->memtupcount == state->bound); - state->status = TSS_BOUNDED; -} - -/* - * Convert the bounded heap to a properly-sorted array - */ -static void -sort_bounded_heap(Tuplesortstate *state) -{ - int tupcount = state->memtupcount; - - Assert(state->status == TSS_BOUNDED); - Assert(state->bounded); - Assert(tupcount == state->bound); - - /* - * We can unheapify in place because each sift-up will remove the largest - * entry, which we can promptly store in the newly freed slot at the end. - * Once we're down to a single-entry heap, we're done. - */ - while (state->memtupcount > 1) - { - SortTuple stup = state->memtuples[0]; - - /* this sifts-up the next-largest entry and decreases memtupcount */ - rum_tuplesort_heap_siftup(state, false); - state->memtuples[state->memtupcount] = stup; - } - state->memtupcount = tupcount; - - /* - * Reverse sort direction back to the original state. 
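make_bounded_heap above reverses the sort direction so the largest kept tuple sits at the root, where it can be evicted cheaply. A self-contained top-N sketch using the same trick: a max-heap of the N smallest ints, with the sift steps written out by hand:

    #include <stdio.h>

    #define BOUND 3

    /* Sift the root down in a max-heap of n ints (0-based; children at
     * 2i+1 and 2i+2). */
    static void sift_down(int *h, int n)
    {
        int i = 0;

        for (;;)
        {
            int j = 2 * i + 1, t;

            if (j >= n)
                break;
            if (j + 1 < n && h[j + 1] > h[j])
                j++;
            if (h[i] >= h[j])
                break;
            t = h[i]; h[i] = h[j]; h[j] = t;
            i = j;
        }
    }

    int main(void)
    {
        int input[] = {9, 1, 8, 2, 7, 3, 6};
        int heap[BOUND];
        int i, k = 0;

        for (i = 0; i < 7; i++)
        {
            if (k < BOUND)
            {
                /* fill phase: append and sift the new entry up */
                int j = k;

                heap[k++] = input[i];
                while (j > 0 && heap[j] > heap[(j - 1) / 2])
                {
                    int t = heap[j];

                    heap[j] = heap[(j - 1) / 2];
                    heap[(j - 1) / 2] = t;
                    j = (j - 1) / 2;
                }
            }
            else if (input[i] < heap[0])
            {
                /* new value beats the current worst kept value: evict root */
                heap[0] = input[i];
                sift_down(heap, BOUND);
            }
        }

        for (i = 0; i < BOUND; i++)
            printf("%d ", heap[i]);   /* the 3 smallest, in heap order */
        printf("\n");
        return 0;
    }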
This is not - * actually necessary but seems like a good idea for tidiness. - */ - REVERSEDIRECTION(state); - - state->status = TSS_SORTEDINMEM; - state->boundUsed = true; -} - -/* - * Insert a new tuple into an empty or existing heap, maintaining the - * heap invariant. Caller is responsible for ensuring there's room. - * - * Note: we assume *tuple is a temporary variable that can be scribbled on. - * For some callers, tuple actually points to a memtuples[] entry above the - * end of the heap. This is safe as long as it's not immediately adjacent - * to the end of the heap (ie, in the [memtupcount] array entry) --- if it - * is, it might get overwritten before being moved into the heap! - */ -static void -rum_tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, - int tupleindex, bool checkIndex) -{ - SortTuple *memtuples; - int j; - - /* - * Save the tupleindex --- see notes above about writing on *tuple. It's a - * historical artifact that tupleindex is passed as a separate argument - * and not in *tuple, but it's notationally convenient so let's leave it - * that way. - */ - tuple->tupindex = tupleindex; - - memtuples = state->memtuples; - Assert(state->memtupcount < state->memtupsize); - - CHECK_FOR_INTERRUPTS(); - - /* - * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is - * using 1-based array indexes, not 0-based. - */ - j = state->memtupcount++; - while (j > 0) - { - int i = (j - 1) >> 1; - - if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0) - break; - memtuples[j] = memtuples[i]; - j = i; - } - memtuples[j] = *tuple; -} - -/* - * The tuple at state->memtuples[0] has been removed from the heap. - * Decrement memtupcount, and sift up to maintain the heap invariant. - */ -static void -rum_tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex) -{ - SortTuple *memtuples = state->memtuples; - SortTuple *tuple; - int i, - n; - - if (--state->memtupcount <= 0) - return; - - CHECK_FOR_INTERRUPTS(); - - n = state->memtupcount; - tuple = &memtuples[n]; /* tuple that must be reinserted */ - i = 0; /* i is where the "hole" is */ - for (;;) - { - int j = 2 * i + 1; - - if (j >= n) - break; - if (j + 1 < n && - HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0) - j++; - if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0) - break; - memtuples[i] = memtuples[j]; - i = j; - } - memtuples[i] = *tuple; -} - - -/* - * Tape interface routines - */ - -static unsigned int -getlen(Tuplesortstate *state, int tapenum, bool eofOK) -{ - unsigned int len; - - if (LogicalTapeRead(state->tapeset, tapenum, - &len, sizeof(len)) != sizeof(len)) - elog(ERROR, "unexpected end of tape"); - if (len == 0 && !eofOK) - elog(ERROR, "unexpected end of data"); - return len; -} - -static void -markrunend(Tuplesortstate *state, int tapenum) -{ - unsigned int len = 0; - - LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); -} - - -/* - * Inline-able copy of FunctionCall2Coll() to save some cycles in sorting. 
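getlen and markrunend define the run framing on the logical tapes: every tuple is preceded by an unsigned int length word, and a zero length word terminates the run. A sketch of the reading side, assuming the READTUP dispatch macro this file inherits from upstream tuplesort.c:

    static void
    drain_run(Tuplesortstate *state, int tapenum)
    {
        SortTuple   stup;

        for (;;)
        {
            unsigned int len = getlen(state, tapenum, true);   /* eofOK */

            if (len == 0)
                break;          /* the sentinel written by markrunend */
            READTUP(state, &stup, tapenum, len);
        }
    }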
- */ -static inline Datum -myFunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2) -{ - FunctionCallInfoData fcinfo; - Datum result; - - InitFunctionCallInfoData(fcinfo, flinfo, 2, collation, NULL, NULL); - - fcinfo.arg[0] = arg1; - fcinfo.arg[1] = arg2; - fcinfo.argnull[0] = false; - fcinfo.argnull[1] = false; - - result = FunctionCallInvoke(&fcinfo); - - /* Check for null result, since caller is clearly not expecting one */ - if (fcinfo.isnull) - elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid); - - return result; -} - -/* - * Apply a sort function (by now converted to fmgr lookup form) - * and return a 3-way comparison result. This takes care of handling - * reverse-sort and NULLs-ordering properly. We assume that DESC and - * NULLS_FIRST options are encoded in sk_flags the same way btree does it. - */ -static inline int32 -inlineApplySortFunction(FmgrInfo *sortFunction, int sk_flags, Oid collation, - Datum datum1, bool isNull1, - Datum datum2, bool isNull2) -{ - int32 compare; - - if (isNull1) - { - if (isNull2) - compare = 0; /* NULL "=" NULL */ - else if (sk_flags & SK_BT_NULLS_FIRST) - compare = -1; /* NULL "<" NOT_NULL */ - else - compare = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull2) - { - if (sk_flags & SK_BT_NULLS_FIRST) - compare = 1; /* NOT_NULL ">" NULL */ - else - compare = -1; /* NOT_NULL "<" NULL */ - } - else - { - compare = DatumGetInt32(myFunctionCall2Coll(sortFunction, collation, - datum1, datum2)); - - if (sk_flags & SK_BT_DESC) - compare = -compare; - } - - return compare; -} - - -/* - * Routines specialized for HeapTuple (actually MinimalTuple) case - */ - -static int -comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) -{ - SortSupport sortKey = state->sortKeys; - HeapTupleData ltup; - HeapTupleData rtup; - TupleDesc tupDesc; - int nkey; - int32 compare; - - /* Compare the leading sort key */ - compare = ApplySortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - sortKey); - if (compare != 0) - return compare; - - /* Compare additional sort keys */ - ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; - ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); - rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; - rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); - tupDesc = state->tupDesc; - sortKey++; - for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) - { - AttrNumber attno = sortKey->ssup_attno; - Datum datum1, - datum2; - bool isnull1, - isnull2; - - datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); - datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); - - compare = ApplySortComparator(datum1, isnull1, - datum2, isnull2, - sortKey); - if (compare != 0) - return compare; - } - - return 0; -} - -static void -copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) -{ - /* - * We expect the passed "tup" to be a TupleTableSlot, and form a - * MinimalTuple using the exported interface for that. 
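A worked example of the flag handling in inlineApplySortFunction (illustration only; assume flinfo was set up with fmgr_info for btint4cmp): SK_BT_DESC negates only the datum-versus-datum result, while NULL placement is governed solely by SK_BT_NULLS_FIRST:

    int32 c1 = inlineApplySortFunction(&flinfo, SK_BT_DESC, InvalidOid,
                                       Int32GetDatum(1), false,
                                       Int32GetDatum(2), false);
    /* c1 == 1: ascending order says -1, DESC flips it */

    int32 c2 = inlineApplySortFunction(&flinfo, SK_BT_DESC, InvalidOid,
                                       (Datum) 0, true,
                                       Int32GetDatum(2), false);
    /* c2 == 1: without SK_BT_NULLS_FIRST a NULL sorts after non-NULLs,
     * and DESC does not affect that */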
- */ - TupleTableSlot *slot = (TupleTableSlot *) tup; - MinimalTuple tuple; - HeapTupleData htup; - - /* copy the tuple into sort storage */ - tuple = ExecCopySlotMinimalTuple(slot); - stup->tuple = (void *) tuple; - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* set up first-column key value */ - htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; - htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); - stup->datum1 = heap_getattr(&htup, - state->sortKeys[0].ssup_attno, - state->tupDesc, - &stup->isnull1); -} - -static void -writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) -{ - MinimalTuple tuple = (MinimalTuple) stup->tuple; - - /* the part of the MinimalTuple we'll write: */ - char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; - unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; - - /* total on-disk footprint: */ - unsigned int tuplen = tupbodylen + sizeof(int); - - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) tupbody, tupbodylen); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - - FREEMEM(state, GetMemoryChunkSpace(tuple)); - heap_free_minimal_tuple(tuple); -} - -static void -readtup_heap(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) -{ - unsigned int tupbodylen = len - sizeof(int); - unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; - MinimalTuple tuple = (MinimalTuple) palloc(tuplen); - char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; - HeapTupleData htup; - - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* read in the tuple proper */ - tuple->t_len = tuplen; - LogicalTapeReadExact(state->tapeset, tapenum, - tupbody, tupbodylen); - if (state->randomAccess) /* need trailing length word? 
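For reference, the on-tape record layout that writetup_heap and readtup_heap agree on; the trailing copy of the length word is written only for randomAccess sorts, so that a backward scan can locate the start of the preceding tuple:

    /*
     *   randomAccess = false:   [tuplen][tuple body ...]
     *   randomAccess = true:    [tuplen][tuple body ...][tuplen]
     *
     * where tuplen = body length + sizeof(unsigned int), i.e. the
     * length word itself is counted in the stored length.
     */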
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - stup->tuple = (void *) tuple; - /* set up first-column key value */ - htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; - htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); - stup->datum1 = heap_getattr(&htup, - state->sortKeys[0].ssup_attno, - state->tupDesc, - &stup->isnull1); -} - -static void -reversedirection_heap(Tuplesortstate *state) -{ - SortSupport sortKey = state->sortKeys; - int nkey; - - for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) - { - sortKey->ssup_reverse = !sortKey->ssup_reverse; - sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; - } -} - - -/* - * Routines specialized for the CLUSTER case (HeapTuple data, with - * comparisons per a btree index definition) - */ - -static int -comparetup_cluster(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state) -{ - ScanKey scanKey = state->indexScanKey; - HeapTuple ltup; - HeapTuple rtup; - TupleDesc tupDesc; - int nkey; - int32 compare; - - /* Compare the leading sort key, if it's simple */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) - { - compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, - scanKey->sk_collation, - a->datum1, a->isnull1, - b->datum1, b->isnull1); - if (compare != 0 || state->nKeys == 1) - return compare; - /* Compare additional columns the hard way */ - scanKey++; - nkey = 1; - } - else - { - /* Must compare all keys the hard way */ - nkey = 0; - } - - /* Compare additional sort keys */ - ltup = (HeapTuple) a->tuple; - rtup = (HeapTuple) b->tuple; - - if (state->indexInfo->ii_Expressions == NULL) - { - /* If not expression index, just compare the proper heap attrs */ - tupDesc = state->tupDesc; - - for (; nkey < state->nKeys; nkey++, scanKey++) - { - AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; - Datum datum1, - datum2; - bool isnull1, - isnull2; - - datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); - datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); - - compare = inlineApplySortFunction(&scanKey->sk_func, - scanKey->sk_flags, - scanKey->sk_collation, - datum1, isnull1, - datum2, isnull2); - if (compare != 0) - return compare; - } - } - else - { - /* - * In the expression index case, compute the whole index tuple and - * then compare values. It would perhaps be faster to compute only as - * many columns as we need to compare, but that would require - * duplicating all the logic in FormIndexDatum. 
- */ - Datum l_index_values[INDEX_MAX_KEYS]; - bool l_index_isnull[INDEX_MAX_KEYS]; - Datum r_index_values[INDEX_MAX_KEYS]; - bool r_index_isnull[INDEX_MAX_KEYS]; - TupleTableSlot *ecxt_scantuple; - - /* Reset context each time to prevent memory leakage */ - ResetPerTupleExprContext(state->estate); - - ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; - - ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); - FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, - l_index_values, l_index_isnull); - - ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); - FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, - r_index_values, r_index_isnull); - - for (; nkey < state->nKeys; nkey++, scanKey++) - { - compare = inlineApplySortFunction(&scanKey->sk_func, - scanKey->sk_flags, - scanKey->sk_collation, - l_index_values[nkey], - l_index_isnull[nkey], - r_index_values[nkey], - r_index_isnull[nkey]); - if (compare != 0) - return compare; - } - } - - return 0; -} - -static void -copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) -{ - HeapTuple tuple = (HeapTuple) tup; - - /* copy the tuple into sort storage */ - tuple = heap_copytuple(tuple); - stup->tuple = (void *) tuple; - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* set up first-column key value, if it's a simple column */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) - stup->datum1 = heap_getattr(tuple, - state->indexInfo->ii_KeyAttrNumbers[0], - state->tupDesc, - &stup->isnull1); -} - -static void -writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) -{ - HeapTuple tuple = (HeapTuple) stup->tuple; - unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); - - /* We need to store t_self, but not other fields of HeapTupleData */ - LogicalTapeWrite(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - LogicalTapeWrite(state->tapeset, tapenum, - &tuple->t_self, sizeof(ItemPointerData)); - LogicalTapeWrite(state->tapeset, tapenum, - tuple->t_data, tuple->t_len); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - - FREEMEM(state, GetMemoryChunkSpace(tuple)); - heap_freetuple(tuple); -} - -static void -readtup_cluster(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int tuplen) -{ - unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); - HeapTuple tuple = (HeapTuple) palloc(t_len + HEAPTUPLESIZE); - - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* Reconstruct the HeapTupleData header */ - tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); - tuple->t_len = t_len; - LogicalTapeReadExact(state->tapeset, tapenum, - &tuple->t_self, sizeof(ItemPointerData)); - /* We don't currently bother to reconstruct t_tableOid */ - tuple->t_tableOid = InvalidOid; - /* Read in the tuple body */ - LogicalTapeReadExact(state->tapeset, tapenum, - tuple->t_data, tuple->t_len); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - stup->tuple = (void *) tuple; - /* set up first-column key value, if it's a simple column */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) - stup->datum1 = heap_getattr(tuple, - state->indexInfo->ii_KeyAttrNumbers[0], - state->tupDesc, - &stup->isnull1); -} - - -/* - * Routines specialized for IndexTuple case - * - * The btree and hash cases require separate comparison functions, but the - * IndexTuple representation is the same so the copy/write/read support - * functions can be shared. - */ - -static int -comparetup_index_btree(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state) -{ - /* - * This is similar to _bt_tuplecompare(), but we have already done the - * index_getattr calls for the first column, and we need to keep track of - * whether any null fields are present. Also see the special treatment - * for equal keys at the end. - */ - ScanKey scanKey = state->indexScanKey; - IndexTuple tuple1; - IndexTuple tuple2; - int keysz; - TupleDesc tupDes; - bool equal_hasnull = false; - int nkey; - int32 compare; - - /* Compare the leading sort key */ - compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, - scanKey->sk_collation, - a->datum1, a->isnull1, - b->datum1, b->isnull1); - if (compare != 0) - return compare; - - /* they are equal, so we only need to examine one null flag */ - if (a->isnull1) - equal_hasnull = true; - - /* Compare additional sort keys */ - tuple1 = (IndexTuple) a->tuple; - tuple2 = (IndexTuple) b->tuple; - keysz = state->nKeys; - tupDes = RelationGetDescr(state->indexRel); - scanKey++; - for (nkey = 2; nkey <= keysz; nkey++, scanKey++) - { - Datum datum1, - datum2; - bool isnull1, - isnull2; - - datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); - datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); - - compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, - scanKey->sk_collation, - datum1, isnull1, - datum2, isnull2); - if (compare != 0) - return compare; /* done when we find unequal attributes */ - - /* they are equal, so we only need to examine one null flag */ - if (isnull1) - equal_hasnull = true; - } - - /* - * If btree has asked us to enforce uniqueness, complain if two equal - * tuples are detected (unless there was at least one NULL field). - * - * It is sufficient to make the test here, because if two tuples are equal - * they *must* get compared at some stage of the sort --- otherwise the - * sort algorithm wouldn't have checked whether one must appear before the - * other. - */ - if (state->enforceUnique && !equal_hasnull) - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; - - /* - * Some rather brain-dead implementations of qsort (such as the one in - * QNX 4) will sometimes call the comparison routine to compare a - * value to itself, but we always use our own implementation, which - * does not. - */ - Assert(tuple1 != tuple2); - - index_deform_tuple(tuple1, tupDes, values, isnull); + if (v1 < v2) + return -1; + else if (v1 > v2) + return 1; - key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + i1 = (RumSortItem *) a->tuple; + i2 = (RumSortItem *) b->tuple; - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(state->indexRel)), - key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(state->heapRel, - RelationGetRelationName(state->indexRel)))); + for (i = 1; i < TSS_GET(state)->nKeys; i++) + { + if (i1->data[i] < i2->data[i]) + return -1; + else if (i1->data[i] > i2->data[i]) + return 1; } + if (!compareItemPointer) + return 0; + /* - * If key values are equal, we sort on ItemPointer. This does not affect - * validity of the finished index, but it may be useful to have index - * scans in physical order. + * If key values are equal, we sort on ItemPointer. */ - { - BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); - BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); - - if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; - } - { - OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); - OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); - - if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; - } + return compare_rum_itempointer(i1->iptr, i2->iptr); +} - return 0; +static int +comparetup_rum_true(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state) +{ + return comparetup_rum(a, b, state, true); } static int -comparetup_index_hash(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state) +comparetup_rum_false(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state) { - uint32 hash1; - uint32 hash2; - IndexTuple tuple1; - IndexTuple tuple2; + return comparetup_rum(a, b, state, false); +} - /* - * Fetch hash keys and mask off bits we don't want to sort by. We know - * that the first column of the index tuple is the hash key. - */ - Assert(!a->isnull1); - hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; - Assert(!b->isnull1); - hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; +static inline FmgrInfo * +comparetup_rumitem_custom_fun(RumTuplesortstate *state) +{ +#if PG_VERSION_NUM >= 160000 + return (FmgrInfo *) TSS_GET(state)->arg; +#else + return ((RumTuplesortstateExt *) state)->cmp; +#endif +} - if (hash1 > hash2) - return 1; - else if (hash1 < hash2) - return -1; +static int +comparetup_rumitem(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state) +{ + RumItem *i1, + *i2; + FmgrInfo *cmp; - /* - * If hash values are equal, we sort on ItemPointer. This does not affect - * validity of the finished index, but it may be useful to have index - * scans in physical order. - */ - tuple1 = (IndexTuple) a->tuple; - tuple2 = (IndexTuple) b->tuple; + /* Extract RumItem from RumScanItem */ + i1 = (RumItem *) a->tuple; + i2 = (RumItem *) b->tuple; + cmp = comparetup_rumitem_custom_fun(state); + if (cmp != NULL) { - BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); - BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + if (i1->addInfoIsNull || i2->addInfoIsNull) + { + if (!(i1->addInfoIsNull && i2->addInfoIsNull)) + return (i1->addInfoIsNull) ? 1 : -1; + /* go to itempointer compare */ + } + else + { + int r; - if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; - } - { - OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); - OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + r = DatumGetInt32(FunctionCall2(cmp, + i1->addInfo, + i2->addInfo)); - if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; + if (r != 0) + return r; + } } - return 0; + /* + * If key values are equal, we sort on ItemPointer. 
+ */ + return compare_rum_itempointer(i1->iptr, i2->iptr); } static void -copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +copytup_rum(RumTuplesortstate *state, SortTuple *stup, void *tup) { - IndexTuple tuple = (IndexTuple) tup; - unsigned int tuplen = IndexTupleSize(tuple); - IndexTuple newtuple; + RumSortItem *item = (RumSortItem *) tup; + int nKeys = TSS_GET(state)->nKeys; - /* copy the tuple into sort storage */ - newtuple = (IndexTuple) palloc(tuplen); - memcpy(newtuple, tuple, tuplen); - USEMEM(state, GetMemoryChunkSpace(newtuple)); - stup->tuple = (void *) newtuple; - /* set up first-column key value */ - stup->datum1 = index_getattr(newtuple, - 1, - RelationGetDescr(state->indexRel), - &stup->isnull1); + stup->datum1 = Float8GetDatum(nKeys > 0 ? item->data[0] : 0); + stup->isnull1 = false; + stup->tuple = tup; + USEMEM(state, GetMemoryChunkSpace(tup)); } static void -writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +copytup_rumitem(RumTuplesortstate *state, SortTuple *stup, void *tup) { - IndexTuple tuple = (IndexTuple) stup->tuple; - unsigned int tuplen; - - tuplen = IndexTupleSize(tuple) + sizeof(tuplen); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) tuple, IndexTupleSize(tuple)); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - - FREEMEM(state, GetMemoryChunkSpace(tuple)); - pfree(tuple); + stup->isnull1 = true; + stup->tuple = palloc(sizeof(RumScanItem)); + memcpy(stup->tuple, tup, sizeof(RumScanItem)); + USEMEM(state, GetMemoryChunkSpace(stup->tuple)); } -static void -readtup_index(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) -{ - unsigned int tuplen = len - sizeof(unsigned int); - IndexTuple tuple = (IndexTuple) palloc(tuplen); +static void readtup_rum(RumTuplesortstate *state, SortTuple *stup, + LT_TYPE LT_ARG, unsigned int len); - USEMEM(state, GetMemoryChunkSpace(tuple)); - LogicalTapeReadExact(state->tapeset, tapenum, - tuple, tuplen); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - stup->tuple = (void *) tuple; - /* set up first-column key value */ - stup->datum1 = index_getattr(tuple, - 1, - RelationGetDescr(state->indexRel), - &stup->isnull1); -} +static void readtup_rumitem(RumTuplesortstate *state, SortTuple *stup, + LT_TYPE LT_ARG, unsigned int len); -static void -reversedirection_index_btree(Tuplesortstate *state) +static Size +rum_item_size(RumTuplesortstate *state) { - ScanKey scanKey = state->indexScanKey; - int nkey; + if (TSS_GET(state)->readtup == readtup_rum) + return RumSortItemSize(TSS_GET(state)->nKeys); + else if (TSS_GET(state)->readtup == readtup_rumitem) + return sizeof(RumScanItem); - for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++) - { - scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST); - } + elog (FATAL, "Unknown RUM state"); + return 0; /* keep compiler quiet */ } static void -reversedirection_index_hash(Tuplesortstate *state) +writetup_rum_internal(RumTuplesortstate *state, LT_TYPE LT_ARG, + SortTuple *stup) { - /* We don't support reversing direction in a hash index sort */ - elog(ERROR, "reversedirection_index_hash is not implemented"); -} + void *item = stup->tuple; + size_t size = rum_item_size(state); + unsigned int writtenlen = size + sizeof(unsigned int); + bool randomAccess; + LogicalTapeWrite(TAPE(state, LT_ARG), + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(TAPE(state, LT_ARG), + (void *) item, size); -/* - * Routines specialized for DatumTuple case - */ + randomAccess = +# if PG_VERSION_NUM >= 150000 + (TSS_GET(state)->sortopt & TUPLESORT_RANDOMACCESS) != 0; +# else + TSS_GET(state)->randomAccess; +# endif -static int -comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) -{ - return ApplySortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - state->onlyKey); + if (randomAccess) + LogicalTapeWrite(TAPE(TSS_GET(state), LT_ARG), (void *) &writtenlen, + sizeof(writtenlen)); } static void -copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +writetup_rum(RumTuplesortstate *state, LT_TYPE LT_ARG, SortTuple *stup) { - /* Not currently needed */ - elog(ERROR, "copytup_datum() should not be called"); + writetup_rum_internal(state, LT_ARG, stup); } static void -writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +writetup_rumitem(RumTuplesortstate *state, LT_TYPE LT_ARG, SortTuple *stup) { - void *waddr; - unsigned int tuplen; - unsigned int writtenlen; - - if (stup->isnull1) - { - waddr = NULL; - tuplen = 0; - } - else if (state->datumTypeByVal) - { - waddr = &stup->datum1; - tuplen = sizeof(Datum); - } - else - { - waddr = DatumGetPointer(stup->datum1); - tuplen = datumGetSize(stup->datum1, false, state->datumTypeLen); - Assert(tuplen != 0); - } - - writtenlen = tuplen + sizeof(unsigned int); - - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - LogicalTapeWrite(state->tapeset, tapenum, - waddr, tuplen); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - - if (stup->tuple) - { - FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); - pfree(stup->tuple); - } + writetup_rum_internal(state, LT_ARG, stup); } static void -readtup_datum(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) +readtup_rum_internal(RumTuplesortstate *state, SortTuple *stup, + LT_TYPE LT_ARG, unsigned int len, bool is_item) { unsigned int tuplen = len - sizeof(unsigned int); + size_t size = rum_item_size(state); + void *item = palloc(size); - if (tuplen == 0) - { - /* it's NULL */ - stup->datum1 = (Datum) 0; - stup->isnull1 = true; - stup->tuple = NULL; - } - else if (state->datumTypeByVal) - { - Assert(tuplen == sizeof(Datum)); - LogicalTapeReadExact(state->tapeset, tapenum, - &stup->datum1, tuplen); - stup->isnull1 = false; - stup->tuple = NULL; - } - else - { - void *raddr = palloc(tuplen); + Assert(tuplen == size); - LogicalTapeReadExact(state->tapeset, tapenum, - raddr, tuplen); - stup->datum1 = PointerGetDatum(raddr); - stup->isnull1 = false; - stup->tuple = raddr; - USEMEM(state, GetMemoryChunkSpace(raddr)); - } + USEMEM(state, GetMemoryChunkSpace(item)); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); +#if PG_VERSION_NUM >= 150000 + LogicalTapeReadExact(LT_ARG, item, size); +#else + LogicalTapeReadExact(TSS_GET(state)->tapeset, LT_ARG, item, size); +#endif + stup->tuple = item; + stup->isnull1 = is_item; + + if (!is_item) + stup->datum1 = Float8GetDatum(TSS_GET(state)->nKeys > 0 ? + ((RumSortItem *) item)->data[0] : 0); +#if PG_VERSION_NUM >= 150000 + if (TSS_GET(state)->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing + * length word? */ + LogicalTapeReadExact(LT_ARG, &tuplen, sizeof(tuplen)); +#else + if (TSS_GET(state)->randomAccess) + LogicalTapeReadExact(TSS_GET(state)->tapeset, LT_ARG, &tuplen, + sizeof(tuplen)); +#endif } static void -reversedirection_datum(Tuplesortstate *state) +readtup_rum(RumTuplesortstate *state, SortTuple *stup, LT_TYPE LT_ARG, + unsigned int len) { - state->onlyKey->ssup_reverse = !state->onlyKey->ssup_reverse; - state->onlyKey->ssup_nulls_first = !state->onlyKey->ssup_nulls_first; + readtup_rum_internal(state, stup, LT_ARG, len, false); } -/* - * Convenience routine to free a tuple previously loaded into sort memory - */ static void -free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +readtup_rumitem(RumTuplesortstate *state, SortTuple *stup, LT_TYPE LT_ARG, + unsigned int len) { - FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); - pfree(stup->tuple); + readtup_rum_internal(state, stup, LT_ARG, len, true); } -static int -comparetup_rum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +RumTuplesortstate * +rum_tuplesort_begin_rum(int workMem, int nKeys, bool randomAccess, + bool compareItemPointer) { - RumSortItem *i1, - *i2; - float8 v1 = DatumGetFloat8(a->datum1); - float8 v2 = DatumGetFloat8(b->datum1); - int i; - - if (v1 < v2) - return -1; - else if (v1 > v2) - return 1; - - i1 = (RumSortItem *) a->tuple; - i2 = (RumSortItem *) b->tuple; - for (i = 1; i < state->nKeys; i++) - { - if (i1->data[i] < i2->data[i]) - return -1; - else if (i1->data[i] > i2->data[i]) - return 1; - } +#if PG_VERSION_NUM >= 150000 + RumTuplesortstate *state = tuplesort_begin_common(workMem, + randomAccess ? 
+ TUPLESORT_RANDOMACCESS : + TUPLESORT_NONE); +#else + RumTuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); +#endif + MemoryContext oldcontext; - if (!state->compareItemPointer) - return 0; + oldcontext = MemoryContextSwitchTo(TSS_GET(state)->sortcontext); - /* - * If key values are equal, we sort on ItemPointer. - */ - if (i1->iptr.ip_blkid.bi_hi < i2->iptr.ip_blkid.bi_hi) - return -1; - else if (i1->iptr.ip_blkid.bi_hi > i2->iptr.ip_blkid.bi_hi) - return 1; + LOG_SORT("begin rum sort: nKeys = %d, workMem = %d, randomAccess = %c", + nKeys, workMem, randomAccess ? 't' : 'f'); - if (i1->iptr.ip_blkid.bi_lo < i2->iptr.ip_blkid.bi_lo) - return -1; - else if (i1->iptr.ip_blkid.bi_lo > i2->iptr.ip_blkid.bi_lo) - return 1; + TSS_GET(state)->nKeys = nKeys; + TSS_GET(state)->comparetup = compareItemPointer ? comparetup_rum_true : + comparetup_rum_false; + TSS_GET(state)->writetup = writetup_rum; + TSS_GET(state)->readtup = readtup_rum; - if (i1->iptr.ip_posid < i2->iptr.ip_posid) - return -1; - else if (i1->iptr.ip_posid > i2->iptr.ip_posid) - return 1; + MemoryContextSwitchTo(oldcontext); - return 0; + return state; } -static void -copytup_rum(Tuplesortstate *state, SortTuple *stup, void *tup) +RumTuplesortstate * +rum_tuplesort_begin_rumitem(int workMem, FmgrInfo *cmp) { - RumSortItem *item = (RumSortItem *) tup; +#if PG_VERSION_NUM >= 160000 + RumTuplesortstate *state = tuplesort_begin_common(workMem, false); + MemoryContext oldcontext; - stup->datum1 = Float8GetDatum(state->nKeys > 0 ? item->data[0] : 0); - stup->isnull1 = false; - stup->tuple = tup; - USEMEM(state, GetMemoryChunkSpace(tup)); -} + oldcontext = MemoryContextSwitchTo(TSS_GET(state)->sortcontext); -static void -writetup_rum(Tuplesortstate *state, int tapenum, SortTuple *stup) -{ - RumSortItem *item = (RumSortItem *) stup->tuple; - unsigned int writtenlen = RumSortItemSize(state->nKeys) + sizeof(unsigned int); + LOG_SORT("begin rumitem sort: workMem = %d", workMem); + TSS_GET(state)->comparetup = comparetup_rumitem; + TSS_GET(state)->writetup = writetup_rumitem; + TSS_GET(state)->readtup = readtup_rumitem; + TSS_GET(state)->arg = cmp; - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) item, RumSortItemSize(state->nKeys)); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); + MemoryContextSwitchTo(oldcontext); - FREEMEM(state, GetMemoryChunkSpace(item)); - pfree(item); -} + return state; +#else + RumTuplesortstate *state = tuplesort_begin_common(workMem, false); + RumTuplesortstateExt *rs; + MemoryContext oldcontext; -static void -readtup_rum(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) -{ - unsigned int tuplen = len - sizeof(unsigned int); - RumSortItem *item = (RumSortItem *) palloc(RumSortItemSize(state->nKeys)); + oldcontext = MemoryContextSwitchTo(TSS_GET(state)->sortcontext); - Assert(tuplen == RumSortItemSize(state->nKeys)); + /* Allocate extended state in the same context as state */ + rs = palloc(sizeof(*rs)); - USEMEM(state, GetMemoryChunkSpace(item)); - LogicalTapeReadExact(state->tapeset, tapenum, - (void *) item, RumSortItemSize(state->nKeys)); - stup->datum1 = Float8GetDatum(state->nKeys > 0 ? item->data[0] : 0); - stup->isnull1 = false; - stup->tuple = item; + LOG_SORT("begin rumitem sort: workMem = %d", workMem); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); + rs->cmp = cmp; + TSS_GET(state)->comparetup = comparetup_rumitem; + TSS_GET(state)->writetup = writetup_rumitem; + TSS_GET(state)->readtup = readtup_rumitem; + memcpy(&rs->ts, state, sizeof(RumTuplesortstate)); + pfree(state); /* just to be sure *state isn't used anywhere + * else */ + + MemoryContextSwitchTo(oldcontext); + + return (RumTuplesortstate *) rs; +#endif } -static void -reversedirection_rum(Tuplesortstate *state) +/* + * rum_tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by rum_tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +rum_tuplesort_end(RumTuplesortstate *state) { - state->reverse = !state->reverse; +#if PG_VERSION_NUM < 160000 && PG_VERSION_NUM >= 130000 + tuplesort_free(state); +#else + tuplesort_end(state); +#endif } -static int -comparetup_rumkey(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +/* + * Get sort state memory context. Currently it is used only to allocate + * RumSortItem. + */ +MemoryContext +rum_tuplesort_get_memorycontext(RumTuplesortstate *state) { - RumKey *i1, *i2; + return TSS_GET(state)->sortcontext; +} - i1 = (RumKey *) a->tuple; - i2 = (RumKey *) b->tuple; +void +rum_tuplesort_putrum(RumTuplesortstate *state, RumSortItem *item) +{ + MemoryContext oldcontext; + SortTuple stup; +#if PG_VERSION_NUM >= 170000 + MinimalTuple tuple = (MinimalTuple)item; + Size tuplen; + TuplesortPublic *base = TuplesortstateGetPublic((TuplesortPublic *)state); +#endif - if (state->cmp) - { - if (i1->addInfoIsNull || i2->addInfoIsNull) - { - if (!(i1->addInfoIsNull && i2->addInfoIsNull)) - return (i1->addInfoIsNull) ? 1 : -1; - /* go to itempointer compare */ - } - else - { - int r; + oldcontext = MemoryContextSwitchTo(rum_tuplesort_get_memorycontext(state)); + copytup_rum(state, &stup, item); - r = DatumGetInt32(FunctionCall2(state->cmp, - i1->addInfo, - i2->addInfo)); +#if PG_VERSION_NUM >= 170000 + /* GetMemoryChunkSpace is not supported for bump contexts */ + if (TupleSortUseBumpTupleCxt(base->sortopt)) + tuplen = MAXALIGN(tuple->t_len); + else + tuplen = GetMemoryChunkSpace(tuple); + tuplesort_puttuple_common(state, &stup, false, tuplen); +#elif PG_VERSION_NUM >= 160000 + tuplesort_puttuple_common(state, &stup, false); +#else + puttuple_common(state, &stup); +#endif - if (r != 0) - return r; - } - } + MemoryContextSwitchTo(oldcontext); +} - /* - * If key values are equal, we sort on ItemPointer. 
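The pre-PG16 branch above uses a struct-embedding trick: tuplesort_begin_common returns an opaque Tuplesortstate, whose contents are copied into the first member of a larger struct so that the extra addInfo comparator can travel with the state. The layout implied by the accessors in this patch (the real definition lives in rumsort.c; the member order is inferred from the casts):

    typedef struct RumTuplesortstateExt
    {
        RumTuplesortstate ts;   /* must stay first: the pointer is cast
                                 * back to RumTuplesortstate * */
        FmgrInfo   *cmp;        /* addInfo comparator for rumitem sorts */
    } RumTuplesortstateExt;

comparetup_rumitem_custom_fun then recovers cmp with a plain cast of the state pointer.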
- */ - if (i1->iptr.ip_blkid.bi_hi < i2->iptr.ip_blkid.bi_hi) - return -1; - else if (i1->iptr.ip_blkid.bi_hi > i2->iptr.ip_blkid.bi_hi) - return 1; +void +rum_tuplesort_putrumitem(RumTuplesortstate *state, RumScanItem *item) +{ + MemoryContext oldcontext; + SortTuple stup; +#if PG_VERSION_NUM >= 170000 + MinimalTuple tuple = (MinimalTuple)item; + Size tuplen; + TuplesortPublic *base = TuplesortstateGetPublic((TuplesortPublic *)state); +#endif - if (i1->iptr.ip_blkid.bi_lo < i2->iptr.ip_blkid.bi_lo) - return -1; - else if (i1->iptr.ip_blkid.bi_lo > i2->iptr.ip_blkid.bi_lo) - return 1; + oldcontext = MemoryContextSwitchTo(rum_tuplesort_get_memorycontext(state)); + copytup_rumitem(state, &stup, item); - if (i1->iptr.ip_posid < i2->iptr.ip_posid) - return -1; - else if (i1->iptr.ip_posid > i2->iptr.ip_posid) - return 1; +#if PG_VERSION_NUM >= 170000 + /* GetMemoryChunkSpace is not supported for bump contexts */ + if (TupleSortUseBumpTupleCxt(base->sortopt)) + tuplen = MAXALIGN(tuple->t_len); + else + tuplen = GetMemoryChunkSpace(tuple); + tuplesort_puttuple_common(state, &stup, false, tuplen); +#elif PG_VERSION_NUM >= 160000 + tuplesort_puttuple_common(state, &stup, false); +#else + puttuple_common(state, &stup); +#endif - return 0; + MemoryContextSwitchTo(oldcontext); } -static void -copytup_rumkey(Tuplesortstate *state, SortTuple *stup, void *tup) +void +rum_tuplesort_performsort(RumTuplesortstate *state) { - stup->isnull1 = true; - stup->tuple = palloc(sizeof(RumKey)); - memcpy(stup->tuple, tup, sizeof(RumKey)); - USEMEM(state, GetMemoryChunkSpace(stup->tuple)); + tuplesort_performsort(state); } -static void -writetup_rumkey(Tuplesortstate *state, int tapenum, SortTuple *stup) +/* + * Internal routine to fetch the next index tuple in either forward or back + * direction. Returns NULL if no more tuples. Returned tuple belongs to + * tuplesort memory context. Caller may not rely on tuple remaining valid after + * any further manipulation of tuplesort. + * + * If *should_free is set, the caller must pfree stup.tuple when done with it. + * + * NOTE: in PG 10 and newer tuple is always allocated tuple in tuplesort context + * and should not be freed by caller. + */ +static void * +rum_tuplesort_getrum_internal(RumTuplesortstate *state, bool forward, + bool *should_free) { - RumKey *item = (RumKey *) stup->tuple; - unsigned int writtenlen = sizeof(*item) + sizeof(unsigned int); - - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) item, sizeof(*item)); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - - FREEMEM(state, GetMemoryChunkSpace(item)); - pfree(item); +#if PG_VERSION_NUM >= 100000 + *should_free = false; + return (RumSortItem *)tuplesort_getindextuple(state, forward); +#else + return (RumSortItem *)tuplesort_getindextuple(state, forward, should_free); +#endif } -static void -readtup_rumkey(Tuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) +RumSortItem * +rum_tuplesort_getrum(RumTuplesortstate *state, bool forward, bool *should_free) { - unsigned int tuplen = len - sizeof(unsigned int); - RumKey *item = (RumKey *) palloc(sizeof(RumKey)); - - Assert(tuplen == sizeof(RumKey)); - - USEMEM(state, GetMemoryChunkSpace(item)); - LogicalTapeReadExact(state->tapeset, tapenum, - (void *) item, tuplen); - stup->isnull1 = true; - stup->tuple = item; - - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); + return (RumSortItem *) rum_tuplesort_getrum_internal(state, forward, + should_free); } +RumScanItem * +rum_tuplesort_getrumitem(RumTuplesortstate *state, bool forward, + bool *should_free) +{ + return (RumScanItem *) rum_tuplesort_getrum_internal(state, forward, + should_free); +} diff --git a/src/rumsort.h b/src/rumsort.h index 6d65449741..160aa5c8da 100644 --- a/src/rumsort.h +++ b/src/rumsort.h @@ -3,62 +3,30 @@ * rumsort.h * Generalized tuple sorting routines. * - * This module handles sorting of heap tuples, index tuples, or single - * Datums (and could easily support other kinds of sortable objects, - * if necessary). It works efficiently for both small and large amounts - * of data. Small amounts are sorted in-memory using qsort(). Large - * amounts are sorted using temporary files and a standard external sort - * algorithm. + * This module handles sorting of RumSortItem or RumScanItem structures. + * It contains copy of static functions from + * src/backend/utils/sort/tuplesort.c. * + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- */ -#ifndef TUPLESORT_H -/* Hide tuplesort.h and tuplesort.c */ -#define TUPLESORT_H +#ifndef RUMSORT_H +#define RUMSORT_H #include "postgres.h" #include "fmgr.h" -#include "access/itup.h" #include "executor/tuptable.h" -#include "utils/relcache.h" -/* Tuplesortstate is an opaque type whose details are not known outside - * tuplesort.c. - */ -typedef struct Tuplesortstate Tuplesortstate; -struct RumKey; - -/* - * We provide multiple interfaces to what is essentially the same code, - * since different callers have different data to be sorted and want to - * specify the sort key information differently. There are two APIs for - * sorting HeapTuples and two more for sorting IndexTuples. Yet another - * API supports sorting bare Datums. - * - * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't - * preserve the system columns (tuple identity and transaction visibility - * info). The sort keys are specified by column numbers within the tuples - * and sort operator OIDs. We save some cycles by passing and returning the - * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd - * have to be converted to MinimalTuples). This API works well for sorts - * executed as parts of plan trees. 
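A caller-side sketch (not from the patch) tying the exported entry points together; work_mem is the usual backend GUC, and per the comment on rum_tuplesort_get_memorycontext the items are assumed to be allocated in the sort's own context:

    static void
    sort_items_example(RumSortItem **items, int nitems, int nKeys)
    {
        RumTuplesortstate *sortstate;
        RumSortItem *item;
        bool        should_free;
        int         i;

        sortstate = rum_tuplesort_begin_rum(work_mem, nKeys,
                                            false,  /* no random access */
                                            true);  /* break ties on iptr */
        for (i = 0; i < nitems; i++)
            rum_tuplesort_putrum(sortstate, items[i]);
        rum_tuplesort_performsort(sortstate);

        while ((item = rum_tuplesort_getrum(sortstate, true,
                                            &should_free)) != NULL)
        {
            /* consume item; on PG 10+ should_free is always false */
            if (should_free)
                pfree(item);
        }
        rum_tuplesort_end(sortstate);
    }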
- * - * The "cluster" API stores/sorts full HeapTuples including all visibility - * info. The sort keys are specified by reference to a btree index that is - * defined on the relation to be sorted. Note that putheaptuple/getheaptuple - * go with this API, not the "begin_heap" one! - * - * The "index_btree" API stores/sorts IndexTuples (preserving all their - * header fields). The sort keys are specified by a btree index definition. - * - * The "index_hash" API is similar to index_btree, but the tuples are - * actually sorted by their hash codes not the raw data. +/* RumTuplesortstate is an opaque type whose details are not known outside + * rumsort.c. */ +typedef struct Tuplesortstate RumTuplesortstate; +struct RumScanItem; typedef struct { @@ -69,75 +37,22 @@ typedef struct #define RumSortItemSize(nKeys) (offsetof(RumSortItem,data)+(nKeys)*sizeof(float8)) -extern MemoryContext rum_tuplesort_get_memorycontext(Tuplesortstate *state); -extern Tuplesortstate *rum_tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, - bool *nullsFirstFlags, - int workMem, bool randomAccess); -extern Tuplesortstate *rum_tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess); -extern Tuplesortstate *rum_tuplesort_begin_index_btree(Relation heapRel, - Relation indexRel, - bool enforceUnique, - int workMem, bool randomAccess); -extern Tuplesortstate *rum_tuplesort_begin_index_hash(Relation heapRel, - Relation indexRel, - uint32 hash_mask, - int workMem, bool randomAccess); -extern Tuplesortstate *rum_tuplesort_begin_datum(Oid datumType, - Oid sortOperator, Oid sortCollation, - bool nullsFirstFlag, - int workMem, bool randomAccess); -extern Tuplesortstate *rum_tuplesort_begin_rum(int workMem, +extern MemoryContext rum_tuplesort_get_memorycontext(RumTuplesortstate *state); +extern RumTuplesortstate *rum_tuplesort_begin_rum(int workMem, int nKeys, bool randomAccess, bool compareItemPointer); -extern Tuplesortstate *rum_tuplesort_begin_rumkey(int workMem, +extern RumTuplesortstate *rum_tuplesort_begin_rumitem(int workMem, FmgrInfo *cmp); -extern void rum_tuplesort_set_bound(Tuplesortstate *state, int64 bound); +extern void rum_tuplesort_putrum(RumTuplesortstate *state, RumSortItem * item); +extern void rum_tuplesort_putrumitem(RumTuplesortstate *state, struct RumScanItem * item); -extern void rum_tuplesort_puttupleslot(Tuplesortstate *state, - TupleTableSlot *slot); -extern void rum_tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup); -extern void rum_tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple); -extern void rum_tuplesort_putdatum(Tuplesortstate *state, Datum val, - bool isNull); -extern void rum_tuplesort_putrum(Tuplesortstate *state, RumSortItem * item); -extern void rum_tuplesort_putrumkey(Tuplesortstate *state, struct RumKey * item); +extern void rum_tuplesort_performsort(RumTuplesortstate *state); -extern void rum_tuplesort_performsort(Tuplesortstate *state); - -extern bool rum_tuplesort_gettupleslot(Tuplesortstate *state, bool forward, - TupleTableSlot *slot); -extern HeapTuple rum_tuplesort_getheaptuple(Tuplesortstate *state, bool forward, - bool *should_free); -extern IndexTuple rum_tuplesort_getindextuple(Tuplesortstate *state, bool forward, - bool *should_free); -extern bool rum_tuplesort_getdatum(Tuplesortstate *state, bool forward, - Datum *val, bool *isNull); -extern RumSortItem *rum_tuplesort_getrum(Tuplesortstate *state, bool forward, +extern RumSortItem 
*rum_tuplesort_getrum(RumTuplesortstate *state, bool forward, bool *should_free); -extern struct RumKey *rum_tuplesort_getrumkey(Tuplesortstate *state, bool forward, +extern struct RumScanItem *rum_tuplesort_getrumitem(RumTuplesortstate *state, bool forward, bool *should_free); -extern void rum_tuplesort_end(Tuplesortstate *state); - -extern void rum_tuplesort_get_stats(Tuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed); - -extern int rum_tuplesort_merge_order(long allowedMem); - -/* - * These routines may only be called if randomAccess was specified 'true'. - * Likewise, backwards scan in gettuple/getdatum is only allowed if - * randomAccess was specified. - */ - -extern void rum_tuplesort_rescan(Tuplesortstate *state); -extern void rum_tuplesort_markpos(Tuplesortstate *state); -extern void rum_tuplesort_restorepos(Tuplesortstate *state); +extern void rum_tuplesort_end(RumTuplesortstate *state); -#endif /* TUPLESORT_H */ +#endif /* RUMSORT_H */ diff --git a/src/rumtsquery.c b/src/rumtsquery.c index 242360346a..6c6b3c86d0 100644 --- a/src/rumtsquery.c +++ b/src/rumtsquery.c @@ -3,7 +3,7 @@ * rumtsquery.c * Inverted fulltext search: indexing tsqueries. * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * *------------------------------------------------------------------------- @@ -20,14 +20,15 @@ #include "rum.h" +/* + * A "wrapper" over a tsquery item, in a representation more suitable for processing. + */ typedef struct QueryItemWrap { QueryItemType type; int8 oper; bool not; - int operandsCount, - operandsAllocated; - struct QueryItemWrap *operands; + List *operands; struct QueryItemWrap *parent; int distance, length; @@ -35,40 +36,29 @@ typedef struct QueryItemWrap int num; } QueryItemWrap; +/* + * Add a child to a tsquery item wrap. + */ static QueryItemWrap * -add_child(QueryItemWrap * parent) +add_child(QueryItemWrap *parent) { QueryItemWrap *result; - if (!parent) - { - result = (QueryItemWrap *) palloc0(sizeof(QueryItemWrap)); - } - else + result = (QueryItemWrap *) palloc0(sizeof(QueryItemWrap)); + + if (parent) { - parent->operandsCount++; - while (parent->operandsCount > parent->operandsAllocated) - { - if (parent->operandsAllocated > 0) - { - parent->operandsAllocated *= 2; - parent->operands = (QueryItemWrap *) repalloc(parent->operands, parent->operandsAllocated * sizeof(*parent->operands)); - } - else - { - parent->operandsAllocated = 4; - parent->operands = (QueryItemWrap *) palloc(parent->operandsAllocated * sizeof(*parent->operands)); - } - } - result = &parent->operands[parent->operandsCount - 1]; - memset(result, 0, sizeof(*result)); result->parent = parent; + parent->operands = lappend(parent->operands, result); } return result; } +/* + * Make a wrapper over a tsquery item. Flatten the tree if needed. + */ static QueryItemWrap * -make_query_item_wrap(QueryItem *item, QueryItemWrap * parent, bool not) +make_query_item_wrap(QueryItem *item, QueryItemWrap *parent, bool not) { if (item->type == QI_VAL) { @@ -118,6 +108,7 @@ make_query_item_wrap(QueryItem *item, QueryItemWrap * parent, bool not) } case OP_PHRASE: elog(ERROR, "Indexing of phrase tsqueries isn't supported yet"); + break; default: elog(ERROR, "Invalid tsquery operator"); } @@ -126,16 +117,21 @@ make_query_item_wrap(QueryItem *item, QueryItemWrap * parent, bool not) return NULL; } +/* + * Recursively calculate "sum" for tsquery item wraps.
+ */ static int -calc_wraps(QueryItemWrap * wrap, int *num) +calc_wraps(QueryItemWrap *wrap, int *num) { - int i, - notCount = 0, + int notCount = 0, result; + ListCell *lc; - for (i = 0; i < wrap->operandsCount; i++) + foreach(lc, wrap->operands) { - if (wrap->operands[i].not) + QueryItemWrap *item = (QueryItemWrap *) lfirst(lc); + + if (item->not) notCount++; } @@ -143,7 +139,7 @@ calc_wraps(QueryItemWrap * wrap, int *num) { wrap->num = (*num)++; if (wrap->oper == OP_AND) - wrap->sum = notCount + 1 - wrap->operandsCount; + wrap->sum = notCount + 1 - list_length(wrap->operands); if (wrap->oper == OP_OR) wrap->sum = notCount; } @@ -153,11 +149,19 @@ calc_wraps(QueryItemWrap * wrap, int *num) } result = 0; - for (i = 0; i < wrap->operandsCount; i++) - result += calc_wraps(&wrap->operands[i], num); + foreach(lc, wrap->operands) + { + QueryItemWrap *item = (QueryItemWrap *) lfirst(lc); + + result += calc_wraps(item, num); + } return result; } +/* + * Check if the tsquery doesn't need any positive lexeme occurrence for satisfaction. + * That is, this function returns true when the tsquery matches an empty tsvector. + */ static bool check_allnegative(QueryItemWrap * wrap) { @@ -167,22 +171,26 @@ check_allnegative(QueryItemWrap * wrap) } else if (wrap->oper == OP_AND) { - int i; + ListCell *lc; - for (i = 0; i < wrap->operandsCount; i++) + foreach(lc, wrap->operands) { - if (!check_allnegative(&wrap->operands[i])) + QueryItemWrap *item = (QueryItemWrap *) lfirst(lc); + + if (!check_allnegative(item)) return false; } return true; } else if (wrap->oper == OP_OR) { - int i; + ListCell *lc; - for (i = 0; i < wrap->operandsCount; i++) + foreach(lc, wrap->operands) { - if (check_allnegative(&wrap->operands[i])) + QueryItemWrap *item = (QueryItemWrap *) lfirst(lc); + + if (check_allnegative(item)) return true; } return false; @@ -195,6 +203,7 @@ check_allnegative(QueryItemWrap * wrap) } +/* Max length of variable-length encoded 32-bit integer */ #define MAX_ENCODED_LEN 5 /* @@ -262,8 +271,11 @@ typedef struct char *operand; } ExtractContext; +/* + * Recursively extract entries from tsquery wraps. Encode paths into addInfos.
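MAX_ENCODED_LEN above matches the varbyte coding used by encode_varbyte/decode_varbyte in this file, the same scheme as GIN's posting-list compression: seven payload bits per byte, with the high bit set on every byte except the last, so a 32-bit value takes at most ceil(32 / 7) = 5 bytes. A sketch of the encoder under that assumption (the _sketch name is hypothetical):

    static void
    encode_varbyte_sketch(uint32 val, unsigned char **ptr)
    {
        unsigned char *p = *ptr;

        while (val > 0x7F)
        {
            *(p++) = 0x80 | (val & 0x7F);   /* continuation bit + 7 bits */
            val >>= 7;
        }
        *(p++) = (unsigned char) val;       /* final byte: high bit clear */
        *ptr = p;
    }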
+ */ static void -extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level) +extract_wraps(QueryItemWrap *wrap, ExtractContext *context, int level) { if (wrap->type == QI_VAL) { @@ -271,7 +283,7 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level) unsigned char *ptr; int index; - + /* Check if given lexeme was already extracted */ for (index = 0; index < context->index; index++) { text *entry; @@ -282,6 +294,7 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level) break; } + /* Either allocate new addInfo or extend existing addInfo */ if (index >= context->index) { index = context->index; @@ -291,11 +304,6 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level) context->addInfo[index] = PointerGetDatum(addinfo); context->addInfoIsNull[index] = false; context->index++; - - /* - * ptrEnd = (unsigned char *) VARDATA(addinfo) + VARHDRSZ + 2 * - * Max(level, 1) * MAX_ENCODED_LEN; - */ } else { @@ -304,25 +312,14 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level) VARSIZE(addinfo) + 2 * Max(level, 1) * MAX_ENCODED_LEN); context->addInfo[index] = PointerGetDatum(addinfo); ptr = (unsigned char *) VARDATA(addinfo) + VARSIZE_ANY_EXHDR(addinfo); - - /* - * ptrEnd = (unsigned char *) VARDATA(addinfo) + - * VARSIZE_ANY_EXHDR(addinfo) + 2 * Max(level, 1) * - * MAX_ENCODED_LEN; - */ } - /* - * elog(NOTICE, "%s", - * text_to_cstring(DatumGetTextP(context->entries[index]))); - */ - + /* Encode path into addInfo */ while (wrap->parent) { QueryItemWrap *parent = wrap->parent; uint32 sum; - /* elog(NOTICE, "%d %d %d", parent->num, parent->sum, wrap->not); */ encode_varbyte((uint32) parent->num, &ptr); sum = (uint32) abs(parent->sum); sum <<= 2; @@ -338,38 +335,21 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level) encode_varbyte(1, &ptr); encode_varbyte(4 | 1, &ptr); } - /* Assert(ptr <= ptrEnd); */ SET_VARSIZE(addinfo, ptr - (unsigned char *) addinfo); - - /* - * elog(NOTICE, "%s", DatumGetPointer(DirectFunctionCall1(byteaout, - * PointerGetDatum(addinfo)))); - */ } else if (wrap->type == QI_OPR) { - int i; + ListCell *lc; - for (i = 0; i < wrap->operandsCount; i++) - extract_wraps(&wrap->operands[i], context, level + 1); + foreach(lc, wrap->operands) + { + QueryItemWrap *item = (QueryItemWrap *) lfirst(lc); + + extract_wraps(item, context, level + 1); + } } } -/*PG_FUNCTION_INFO_V1(rum_process_tsquery); -Datum -rum_process_tsquery(PG_FUNCTION_ARGS) -{ - TSQuery query = PG_GETARG_TSQUERY(0); - QueryItem *item = GETQUERY(query); - QueryItemWrap *wrap = make_query_item_wrap(item, NULL, false); - int num = 1; - - calc_wraps(wrap, &num); - print_wraps(wrap, , 0); - - PG_RETURN_VOID(); -}*/ - PG_FUNCTION_INFO_V1(ruminv_extract_tsquery); Datum ruminv_extract_tsquery(PG_FUNCTION_ARGS) @@ -419,12 +399,6 @@ ruminv_extract_tsquery(PG_FUNCTION_ARGS) } *nentries = count; -/* elog(NOTICE, "%d", *nentries); - for (i = 0; i < *nentries; i++) - { - elog(NOTICE, "%s", text_to_cstring(DatumGetPointer((entries)[i]))); - }*/ - PG_FREE_IF_COPY(query, 0); PG_RETURN_POINTER(entries); } @@ -520,14 +494,10 @@ ruminv_tsvector_consistent(PG_FUNCTION_ARGS) if (addInfoIsNull[i]) elog(ERROR, "Unexpected addInfoIsNull"); + /* Iterate path making corresponding calculation */ ptr = (unsigned char *) VARDATA_ANY(DatumGetPointer(addInfo[i])); size = VARSIZE_ANY_EXHDR(DatumGetPointer(addInfo[i])); - /* - * elog(NOTICE, "%d %s", i, - * DatumGetPointer(DirectFunctionCall1(byteaout, addInfo[i]))); - */ - if (size == 0) { 
res = true; @@ -549,8 +519,6 @@ ruminv_tsvector_consistent(PG_FUNCTION_ARGS) index = num - 1; - /* elog(NOTICE, "a %d %d %d %d", i, index, sum, not); */ - if (child) { child->parent = index; @@ -584,17 +552,13 @@ ruminv_tsvector_consistent(PG_FUNCTION_ARGS) } } + /* Iterate over nodes */ if (allFalse && check[nkeys - 1]) { res = true; } else { - /* - * for (i = 0; i < lastIndex; i++) { elog(NOTICE, "s %d %d %d %d", i, - * nodes[i].sum, nodes[i].parent, nodes[i].not); } - */ - for (i = lastIndex - 1; i >= 0; i--) { if (nodes[i].parent != -2) @@ -617,8 +581,6 @@ ruminv_tsvector_consistent(PG_FUNCTION_ARGS) } } -/* elog(NOTICE, "%d", res);*/ - PG_RETURN_BOOL(res); } diff --git a/src/rumutil.c b/src/rumutil.c index 93aec428c1..4a239c85c7 100644 --- a/src/rumutil.c +++ b/src/rumutil.c @@ -4,7 +4,7 @@ * utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -13,15 +13,20 @@ #include "postgres.h" +#include "access/htup_details.h" #include "access/reloptions.h" #include "catalog/pg_collation.h" +#include "catalog/pg_opclass.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/guc.h" #include "utils/index_selfuncs.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" +#include "utils/typcache.h" #include "rum.h" @@ -34,6 +39,14 @@ PG_FUNCTION_INFO_V1(rumhandler); /* Kind of relation optioms for rum index */ static relopt_kind rum_relopt_kind; +static const struct config_enum_entry rum_array_similarity_function_opts[] = +{ + { "cosine", SMT_COSINE, false }, + { "jaccard", SMT_JACCARD, false }, + { "overlap", SMT_OVERLAP, false }, + { NULL, 0, false } +}; + /* * Module load callback */ @@ -49,17 +62,46 @@ _PG_init(void) PGC_USERSET, 0, NULL, NULL, NULL); + DefineCustomRealVariable("rum.array_similarity_threshold", + "Sets the array similarity threshold.", + NULL, + &RumArraySimilarityThreshold, + RUM_SIMILARITY_THRESHOLD_DEFAULT, 0.0, 1.0, + PGC_USERSET, 0, + NULL, NULL, NULL); + + DefineCustomEnumVariable("rum.array_similarity_function", + "Sets the array similarity function.", + NULL, + &RumArraySimilarityFunction, + RUM_SIMILARITY_FUNCTION_DEFAULT, + rum_array_similarity_function_opts, + PGC_USERSET, 0, + NULL, NULL, NULL); + rum_relopt_kind = add_reloption_kind(); add_string_reloption(rum_relopt_kind, "attach", "Column name to attach as additional info", - NULL, NULL); + NULL, NULL +#if PG_VERSION_NUM >= 130000 + , AccessExclusiveLock +#endif + ); add_string_reloption(rum_relopt_kind, "to", "Column name to add a order by column", - NULL, NULL); + NULL, NULL +#if PG_VERSION_NUM >= 130000 + , AccessExclusiveLock +#endif + ); add_bool_reloption(rum_relopt_kind, "order_by_attach", "Use (addinfo, itempointer) order instead of just itempointer", - false); + false +#if PG_VERSION_NUM >= 130000 + , AccessExclusiveLock +#endif + ); } /* @@ -83,7 +125,10 @@ rumhandler(PG_FUNCTION_ARGS) amroutine->amsearchnulls = false; amroutine->amstorage = true; amroutine->amclusterable = false; - amroutine->ampredlocks = false; + amroutine->ampredlocks = true; +#if PG_VERSION_NUM >= 100000 + amroutine->amcanparallel = false; +#endif amroutine->amkeytype = InvalidOid; amroutine->ambuild = rumbuild; @@ -94,6 +139,7 @@ 
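Note that both settings registered earlier in this file are plain USERSET GUCs, so once the module is loaded they can be tuned per session, e.g. SET rum.array_similarity_function = 'jaccard'; or SET rum.array_similarity_threshold = 0.6; (values outside [0.0, 1.0] are rejected by the bounds given in the definition).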
 	amroutine->amcanreturn = NULL;
 	amroutine->amcostestimate = gincostestimate;
 	amroutine->amoptions = rumoptions;
+	amroutine->amproperty = rumproperty;
 	amroutine->amvalidate = rumvalidate;
 	amroutine->ambeginscan = rumbeginscan;
 	amroutine->amrescan = rumrescan;
@@ -102,6 +148,11 @@ rumhandler(PG_FUNCTION_ARGS)
 	amroutine->amendscan = rumendscan;
 	amroutine->ammarkpos = NULL;
 	amroutine->amrestrpos = NULL;
+#if PG_VERSION_NUM >= 100000
+	amroutine->amestimateparallelscan = NULL;
+	amroutine->aminitparallelscan = NULL;
+	amroutine->amparallelrescan = NULL;
+#endif
 
 	PG_RETURN_POINTER(amroutine);
 }
@@ -124,15 +175,15 @@ initRumState(RumState * state, Relation index)
 	state->oneCol = (origTupdesc->natts == 1) ? true : false;
 	state->origTupdesc = origTupdesc;
 
-	state->attrnOrderByColumn = InvalidAttrNumber;
+	state->attrnAttachColumn = InvalidAttrNumber;
 	state->attrnAddToColumn = InvalidAttrNumber;
 	if (index->rd_options)
 	{
 		RumOptions *options = (RumOptions *) index->rd_options;
 
-		if (options->orderByColumn > 0)
+		if (options->attachColumn > 0)
 		{
-			char	   *colname = (char *) options + options->orderByColumn;
+			char	   *colname = (char *) options + options->attachColumn;
 			AttrNumber	attrnOrderByHeapColumn;
 
 			attrnOrderByHeapColumn = get_attnum(index->rd_index->indrelid, colname);
@@ -140,9 +191,9 @@ initRumState(RumState * state, Relation index)
 			if (!AttributeNumberIsValid(attrnOrderByHeapColumn))
 				elog(ERROR, "attribute \"%s\" is not found in table", colname);
 
-			state->attrnOrderByColumn = get_attnum(index->rd_id, colname);
+			state->attrnAttachColumn = get_attnum(index->rd_id, colname);
 
-			if (!AttributeNumberIsValid(state->attrnOrderByColumn))
+			if (!AttributeNumberIsValid(state->attrnAttachColumn))
 				elog(ERROR, "attribute \"%s\" is not found in index", colname);
 		}
 
@@ -160,15 +211,18 @@ initRumState(RumState * state, Relation index)
 			if (!AttributeNumberIsValid(state->attrnAddToColumn))
 				elog(ERROR, "attribute \"%s\" is not found in index", colname);
+
+			if (state->attrnAddToColumn == state->attrnAttachColumn)
+				elog(ERROR, "column \"%s\" and attached column cannot be the same", colname);
 		}
 
-		if (!(AttributeNumberIsValid(state->attrnOrderByColumn) &&
+		if (!(AttributeNumberIsValid(state->attrnAttachColumn) &&
 			  AttributeNumberIsValid(state->attrnAddToColumn)))
 			elog(ERROR, "AddTo and OrderBy columns should both be defined");
 
 		if (options->useAlternativeOrder)
 		{
-			if (!(AttributeNumberIsValid(state->attrnOrderByColumn) &&
+			if (!(AttributeNumberIsValid(state->attrnAttachColumn) &&
 				  AttributeNumberIsValid(state->attrnAddToColumn)))
 				elog(ERROR, "to use alternative ordering, both AddTo and OrderBy should be defined");
 
@@ -179,6 +233,7 @@ initRumState(RumState * state, Relation index)
 	for (i = 0; i < origTupdesc->natts; i++)
 	{
 		RumConfig  *rumConfig = state->rumConfig + i;
+		Form_pg_attribute origAttr = RumTupleDescAttr(origTupdesc, i);
 
 		rumConfig->addInfoTypeOid = InvalidOid;
 
@@ -193,28 +248,37 @@ initRumState(RumState * state, Relation index)
 
 		if (state->attrnAddToColumn == i + 1)
 		{
+			Form_pg_attribute origAddAttr = RumTupleDescAttr(origTupdesc,
+											state->attrnAttachColumn - 1);
+
 			if (OidIsValid(rumConfig->addInfoTypeOid))
 				elog(ERROR, "AddTo column should not have AddInfo");
 
-			rumConfig->addInfoTypeOid = origTupdesc->attrs[
-				state->attrnOrderByColumn - 1]->atttypid;
+			if (state->useAlternativeOrder && origAddAttr->attbyval == false)
+				elog(ERROR, "RUM doesn't support an ordered index over a pass-by-reference column");
+
+			rumConfig->addInfoTypeOid = origAddAttr->atttypid;
 		}
 
 		if (state->oneCol)
 		{
 			state->tupdesc[i] =
CreateTemplateTupleDesc( +#if PG_VERSION_NUM >= 120000 + OidIsValid(rumConfig->addInfoTypeOid) ? 2 : 1); +#else OidIsValid(rumConfig->addInfoTypeOid) ? 2 : 1, false); +#endif TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, - origTupdesc->attrs[i]->atttypid, - origTupdesc->attrs[i]->atttypmod, - origTupdesc->attrs[i]->attndims); + origAttr->atttypid, + origAttr->atttypmod, + origAttr->attndims); TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 1, - origTupdesc->attrs[i]->attcollation); + origAttr->attcollation); if (OidIsValid(rumConfig->addInfoTypeOid)) { TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, rumConfig->addInfoTypeOid, -1, 0); - state->addAttrs[i] = state->tupdesc[i]->attrs[1]; + state->addAttrs[i] = RumTupleDescAttr(state->tupdesc[i], 1); } else { @@ -224,20 +288,24 @@ initRumState(RumState * state, Relation index) else { state->tupdesc[i] = CreateTemplateTupleDesc( +#if PG_VERSION_NUM >= 120000 + OidIsValid(rumConfig->addInfoTypeOid) ? 3 : 2); +#else OidIsValid(rumConfig->addInfoTypeOid) ? 3 : 2, false); +#endif TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, INT2OID, -1, 0); TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, - origTupdesc->attrs[i]->atttypid, - origTupdesc->attrs[i]->atttypmod, - origTupdesc->attrs[i]->attndims); + origAttr->atttypid, + origAttr->atttypmod, + origAttr->attndims); TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2, - origTupdesc->attrs[i]->attcollation); + origAttr->attcollation); if (OidIsValid(rumConfig->addInfoTypeOid)) { TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 3, NULL, rumConfig->addInfoTypeOid, -1, 0); - state->addAttrs[i] = state->tupdesc[i]->attrs[2]; + state->addAttrs[i] = RumTupleDescAttr(state->tupdesc[i], 2); } else { @@ -245,9 +313,38 @@ initRumState(RumState * state, Relation index) } } - fmgr_info_copy(&(state->compareFn[i]), - index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), - CurrentMemoryContext); + /* + * If the compare proc isn't specified in the opclass definition, look + * up the index key type's default btree comparator. 
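+		 * (This presumably mirrors GIN, where the compare support function
+		 * is likewise optional and the type cache supplies the type's
+		 * default btree comparator when the opclass omits it.)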
+ */ + if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->compareFn[i]), + index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), + CurrentMemoryContext); + } + else + { + TypeCacheEntry *typentry; + +#if PG_VERSION_NUM < 100000 + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array indexing is only available on PostgreSQL 10+"))); +#endif + + typentry = lookup_type_cache(origAttr->atttypid, + TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(origAttr->atttypid)))); + fmgr_info_copy(&(state->compareFn[i]), + &(typentry->cmp_proc_finfo), + CurrentMemoryContext); + } + fmgr_info_copy(&(state->extractValueFn[i]), index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC), CurrentMemoryContext); @@ -297,7 +394,6 @@ initRumState(RumState * state, Relation index) index_getprocinfo(index, i + 1, RUM_ORDERING_PROC), CurrentMemoryContext); state->canOrdering[i] = true; - } else { @@ -345,13 +441,6 @@ initRumState(RumState * state, Relation index) else state->supportCollation[i] = DEFAULT_COLLATION_OID; } - - if (AttributeNumberIsValid(state->attrnOrderByColumn)) - { - /* Follow FIXME comment(s) to understand */ - if (origTupdesc->attrs[state->attrnOrderByColumn - 1]->attbyval == false) - elog(ERROR, "currently, RUM doesn't support order by over pass-by-reference column"); - } } /* @@ -494,6 +583,7 @@ RumInitPage(Page page, uint32 f, Size pageSize) opaque->flags = f; opaque->leftlink = InvalidBlockNumber; opaque->rightlink = InvalidBlockNumber; + RumItemSetMin(RumDataPageGetRightBound(page)); } void @@ -800,15 +890,15 @@ rumExtractEntries(RumState * rumstate, OffsetNumber attnum, bytea * rumoptions(Datum reloptions, bool validate) { - relopt_value *options; - RumOptions *rdopts; - int numoptions; static const relopt_parse_elt tab[] = { - {"fastupdate", RELOPT_TYPE_BOOL, offsetof(RumOptions, useFastUpdate)}, - {"attach", RELOPT_TYPE_STRING, offsetof(RumOptions, orderByColumn)}, + {"attach", RELOPT_TYPE_STRING, offsetof(RumOptions, attachColumn)}, {"to", RELOPT_TYPE_STRING, offsetof(RumOptions, addToColumn)}, {"order_by_attach", RELOPT_TYPE_BOOL, offsetof(RumOptions, useAlternativeOrder)} }; +#if PG_VERSION_NUM < 130000 + relopt_value *options; + RumOptions *rdopts; + int numoptions; options = parseRelOptions(reloptions, validate, rum_relopt_kind, &numoptions); @@ -825,6 +915,86 @@ rumoptions(Datum reloptions, bool validate) pfree(options); return (bytea *) rdopts; +#else + return (bytea *) build_reloptions(reloptions, validate, rum_relopt_kind, + sizeof(RumOptions), tab, lengthof(tab)); +#endif +} + +bool +rumproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull) +{ + HeapTuple tuple; + Form_pg_index rd_index PG_USED_FOR_ASSERTS_ONLY; + Form_pg_opclass rd_opclass; + Datum datum; + bool disnull; + oidvector *indclass; + Oid opclass, + opfamily, + opcintype; + int16 procno; + + /* Only answer column-level inquiries */ + if (attno == 0) + return false; + + switch (prop) + { + case AMPROP_DISTANCE_ORDERABLE: + procno = RUM_ORDERING_PROC; + break; + default: + return false; + } + + /* First we need to know the column's opclass. 
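+	 * The lookup chain below (index -> opclass -> opfamily -> pg_amproc)
+	 * follows the same pattern as core's gistproperty(); only the support
+	 * procedure number we probe for differs.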
*/ + + tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(index_oid)); + if (!HeapTupleIsValid(tuple)) + { + *isnull = true; + return true; + } + rd_index = (Form_pg_index) GETSTRUCT(tuple); + + /* caller is supposed to guarantee this */ + Assert(attno > 0 && attno <= rd_index->indnatts); + + datum = SysCacheGetAttr(INDEXRELID, tuple, + Anum_pg_index_indclass, &disnull); + Assert(!disnull); + + indclass = ((oidvector *) DatumGetPointer(datum)); + opclass = indclass->values[attno - 1]; + + ReleaseSysCache(tuple); + + /* Now look up the opclass family and input datatype. */ + + tuple = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclass)); + if (!HeapTupleIsValid(tuple)) + { + *isnull = true; + return true; + } + rd_opclass = (Form_pg_opclass) GETSTRUCT(tuple); + + opfamily = rd_opclass->opcfamily; + opcintype = rd_opclass->opcintype; + + ReleaseSysCache(tuple); + + /* And now we can check whether the function is provided. */ + + *res = SearchSysCacheExists4(AMPROCNUM, + ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(opcintype), + ObjectIdGetDatum(opcintype), + Int16GetDatum(procno)); + return true; } /* @@ -907,8 +1077,40 @@ FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg6, Datum arg7, Datum arg8, Datum arg9, Datum arg10) { - FunctionCallInfoData fcinfo; Datum result; +#if PG_VERSION_NUM >= 120000 + LOCAL_FCINFO(fcinfo, 10); + + InitFunctionCallInfoData(*fcinfo, flinfo, 10, collation, NULL, NULL); + + fcinfo->args[0].value = arg1; + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = arg2; + fcinfo->args[1].isnull = false; + fcinfo->args[2].value = arg3; + fcinfo->args[2].isnull = false; + fcinfo->args[3].value = arg4; + fcinfo->args[3].isnull = false; + fcinfo->args[4].value = arg5; + fcinfo->args[4].isnull = false; + fcinfo->args[5].value = arg6; + fcinfo->args[5].isnull = false; + fcinfo->args[6].value = arg7; + fcinfo->args[6].isnull = false; + fcinfo->args[7].value = arg8; + fcinfo->args[7].isnull = false; + fcinfo->args[8].value = arg9; + fcinfo->args[8].isnull = false; + fcinfo->args[9].value = arg10; + fcinfo->args[9].isnull = false; + + result = FunctionCallInvoke(fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (fcinfo->isnull) + elog(ERROR, "function %u returned NULL", fcinfo->flinfo->fn_oid); +#else + FunctionCallInfoData fcinfo; InitFunctionCallInfoData(fcinfo, flinfo, 10, collation, NULL, NULL); @@ -938,6 +1140,7 @@ FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, /* Check for null result, since caller is clearly not expecting one */ if (fcinfo.isnull) elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid); +#endif return result; } diff --git a/src/rumvacuum.c b/src/rumvacuum.c index 3d9aa746f4..fd5e4206b4 100644 --- a/src/rumvacuum.c +++ b/src/rumvacuum.c @@ -4,7 +4,7 @@ * delete & vacuum routines for the postgres RUM * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -17,6 +17,7 @@ #include "postmaster/autovacuum.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "rum.h" @@ -46,12 +47,13 @@ rumVacuumPostingList(RumVacuumState * gvs, OffsetNumber attnum, Pointer src, { OffsetNumber i, j = 0; - RumKey item; + RumItem item; ItemPointerData prevIptr; Pointer dst = NULL, prev, ptr = src; + *newSize 
= 0; ItemPointerSetMin(&item.iptr); /* @@ -62,7 +64,7 @@ rumVacuumPostingList(RumVacuumState * gvs, OffsetNumber attnum, Pointer src, for (i = 0; i < nitem; i++) { prev = ptr; - ptr = rumDataPageLeafRead(ptr, attnum, &item, &gvs->rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item, false, &gvs->rumstate); if (gvs->callback(&item.iptr, gvs->callback_state)) { gvs->result->tuples_removed += 1; @@ -181,6 +183,8 @@ RumFormTuple(RumState * rumstate, { itup = repalloc(itup, newsize); + memset((char *) itup + IndexTupleSize(itup), + 0, newsize - IndexTupleSize(itup)); /* set new size in tuple header */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; @@ -213,7 +217,7 @@ rumVacuumPostingTreeLeaves(RumVacuumState * gvs, OffsetNumber attnum, { Buffer buffer; Page page; - bool hasVoidPage = FALSE; + bool hasVoidPage = false; buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); @@ -239,10 +243,6 @@ rumVacuumPostingTreeLeaves(RumVacuumState * gvs, OffsetNumber attnum, oldMaxOff = RumPageGetOpaque(page)->maxoff; Pointer cleaned = NULL; Size newSize; - GenericXLogState *state; - - state = GenericXLogStart(gvs->index); - page = GenericXLogRegisterBuffer(state, buffer, 0); newMaxOff = rumVacuumPostingList(gvs, attnum, RumDataPageGetData(page), oldMaxOff, &cleaned, @@ -251,38 +251,43 @@ rumVacuumPostingTreeLeaves(RumVacuumState * gvs, OffsetNumber attnum, /* saves changes about deleted tuple ... */ if (oldMaxOff != newMaxOff) { + GenericXLogState *state; + Page newPage; + + state = GenericXLogStart(gvs->index); + + newPage = GenericXLogRegisterBuffer(state, buffer, 0); + if (newMaxOff > 0) - memcpy(RumDataPageGetData(page), cleaned, newSize); + memcpy(RumDataPageGetData(newPage), cleaned, newSize); pfree(cleaned); - RumPageGetOpaque(page)->maxoff = newMaxOff; - updateItemIndexes(page, attnum, &gvs->rumstate); + RumPageGetOpaque(newPage)->maxoff = newMaxOff; + updateItemIndexes(newPage, attnum, &gvs->rumstate); /* if root is a leaf page, we don't desire further processing */ - if (!isRoot && RumPageGetOpaque(page)->maxoff < FirstOffsetNumber) - hasVoidPage = TRUE; + if (!isRoot && RumPageGetOpaque(newPage)->maxoff < FirstOffsetNumber) + hasVoidPage = true; GenericXLogFinish(state); } - else - GenericXLogAbort(state); } else { OffsetNumber i; - bool isChildHasVoid = FALSE; + bool isChildHasVoid = false; for (i = FirstOffsetNumber; i <= RumPageGetOpaque(page)->maxoff; i++) { PostingItem *pitem = (PostingItem *) RumDataPageGetItem(page, i); if (rumVacuumPostingTreeLeaves(gvs, attnum, - PostingItemGetBlockNumber(pitem), FALSE, NULL)) - isChildHasVoid = TRUE; + PostingItemGetBlockNumber(pitem), false, NULL)) + isChildHasVoid = true; } if (isChildHasVoid) - hasVoidPage = TRUE; + hasVoidPage = true; } /* @@ -341,8 +346,6 @@ rumDeletePage(RumVacuumState * gvs, BlockNumber deleteBlkno, LockBuffer(dBuffer, RUM_UNLOCK); - state = GenericXLogStart(gvs->index); - /* * Lock the pages in the same order as an insertion would, to avoid * deadlocks: left, then right, then parent. 
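 * If the cleanup lock on the page being deleted cannot be obtained
 * immediately, every pin and lock is released and the whole attempt
 * restarts from scratch (the "goto restart" paths below), rather than
 * waiting while other locks are held.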
@@ -360,6 +363,7 @@ rumDeletePage(RumVacuumState * gvs, BlockNumber deleteBlkno, UnlockReleaseBuffer(lBuffer); ReleaseBuffer(dBuffer); ReleaseBuffer(rBuffer); + ReleaseBuffer(pBuffer); goto restart; } LockBuffer(rBuffer, RUM_EXCLUSIVE); @@ -367,9 +371,8 @@ rumDeletePage(RumVacuumState * gvs, BlockNumber deleteBlkno, * LockBufferForCleanup() */ LockBuffer(pBuffer, RUM_EXCLUSIVE); - dPage = GenericXLogRegisterBuffer(state, dBuffer, 0); - lPage = GenericXLogRegisterBuffer(state, lBuffer, 0); - rPage = GenericXLogRegisterBuffer(state, rBuffer, 0); + lPage = BufferGetPage(lBuffer); + rPage = BufferGetPage(rBuffer); /* * last chance to check @@ -378,6 +381,8 @@ rumDeletePage(RumVacuumState * gvs, BlockNumber deleteBlkno, RumPageGetOpaque(rPage)->leftlink == deleteBlkno && RumPageGetOpaque(dPage)->maxoff < FirstOffsetNumber)) { + OffsetNumber dMaxoff = RumPageGetOpaque(dPage)->maxoff; + if (!isParentRoot) LockBuffer(pBuffer, RUM_UNLOCK); ReleaseBuffer(pBuffer); @@ -385,15 +390,29 @@ rumDeletePage(RumVacuumState * gvs, BlockNumber deleteBlkno, UnlockReleaseBuffer(dBuffer); UnlockReleaseBuffer(rBuffer); - if (RumPageGetOpaque(dPage)->maxoff >= FirstOffsetNumber) + if (dMaxoff >= FirstOffsetNumber) return false; goto restart; } + /* At least make the WAL record */ + + state = GenericXLogStart(gvs->index); + + dPage = GenericXLogRegisterBuffer(state, dBuffer, 0); + lPage = GenericXLogRegisterBuffer(state, lBuffer, 0); + rPage = GenericXLogRegisterBuffer(state, rBuffer, 0); + RumPageGetOpaque(lPage)->rightlink = rightBlkno; RumPageGetOpaque(rPage)->leftlink = leftBlkno; + /* + * Any insert which would have gone on the leaf block will now go to its + * right sibling. + */ + PredicateLockPageCombine(gvs->index, deleteBlkno, rightBlkno); + /* Delete downlink from parent */ parentPage = GenericXLogRegisterBuffer(state, pBuffer, 0); #ifdef USE_ASSERT_CHECKING @@ -478,15 +497,22 @@ rumScanToDelete(RumVacuumState * gvs, BlockNumber blkno, bool isRoot, { PostingItem *pitem = (PostingItem *) RumDataPageGetItem(page, i); - if (rumScanToDelete(gvs, PostingItemGetBlockNumber(pitem), FALSE, me, i)) + if (rumScanToDelete(gvs, PostingItemGetBlockNumber(pitem), false, me, i)) i--; } } if (RumPageGetOpaque(page)->maxoff < FirstOffsetNumber && !isRoot) + { + /* + * Release the buffer because in rumDeletePage() we need to pin it again + * and call ConditionalLockBufferForCleanup(). + */ + ReleaseBuffer(buffer); meDelete = rumDeletePage(gvs, blkno, me->parent->blkno, myoff, me->parent->isRoot); - - ReleaseBuffer(buffer); + } + else + ReleaseBuffer(buffer); return meDelete; } @@ -499,18 +525,18 @@ rumVacuumPostingTree(RumVacuumState * gvs, OffsetNumber attnum, BlockNumber root *ptr, *tmp; - if (rumVacuumPostingTreeLeaves(gvs, attnum, rootBlkno, TRUE, &rootBuffer) == FALSE) + if (rumVacuumPostingTreeLeaves(gvs, attnum, rootBlkno, true, &rootBuffer) == false) { Assert(rootBuffer == InvalidBuffer); return; } memset(&root, 0, sizeof(DataPageDeleteStack)); - root.isRoot = TRUE; + root.isRoot = true; vacuum_delay_point(); - rumScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber); + rumScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber); ptr = root.child; while (ptr) diff --git a/src/rumvalidate.c b/src/rumvalidate.c index ddcbbe3ac9..0adbb10ac7 100644 --- a/src/rumvalidate.c +++ b/src/rumvalidate.c @@ -3,7 +3,7 @@ * rumvalidate.c * Opclass validator for RUM. 
* - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -39,7 +39,9 @@ rumvalidate(Oid opclassoid) Form_pg_opclass classform; Oid opfamilyoid; Oid opcintype; + Oid opcintype_overload; /* used for timestamptz */ Oid opckeytype; + Oid opckeytype_overload; /* used for timestamptz */ char *opclassname; HeapTuple familytup; Form_pg_opfamily familyform; @@ -58,12 +60,18 @@ rumvalidate(Oid opclassoid) classform = (Form_pg_opclass) GETSTRUCT(classtup); opfamilyoid = classform->opcfamily; - opcintype = classform->opcintype; - opckeytype = classform->opckeytype; + opcintype = opcintype_overload = classform->opcintype; + opckeytype = opckeytype_overload = classform->opckeytype; if (!OidIsValid(opckeytype)) - opckeytype = opcintype; + opckeytype = opckeytype_overload = opcintype; opclassname = NameStr(classform->opcname); + /* Fix type Oid for timestamptz */ + if (opcintype == TIMESTAMPTZOID) + opcintype_overload = TIMESTAMPOID; + if (opckeytype == TIMESTAMPTZOID) + opckeytype_overload = TIMESTAMPOID; + /* Fetch opfamily information */ familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); if (!HeapTupleIsValid(familytup)) @@ -114,28 +122,56 @@ rumvalidate(Oid opclassoid) case GIN_EXTRACTVALUE_PROC: /* Some opclasses omit nullFlags */ ok = check_amproc_signature(procform->amproc, INTERNALOID, false, - 5, 5, opcintype, INTERNALOID, + 2, 5, opcintype_overload, INTERNALOID, INTERNALOID, INTERNALOID, INTERNALOID); break; case GIN_EXTRACTQUERY_PROC: /* Some opclasses omit nullFlags and searchMode */ - ok = check_amproc_signature(procform->amproc, INTERNALOID, false, - 7, 7, opcintype, INTERNALOID, - INT2OID, INTERNALOID, INTERNALOID, - INTERNALOID, INTERNALOID); + if (opcintype == TSVECTOROID) + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 5, 7, TSQUERYOID, INTERNALOID, + INT2OID, INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + else if (opcintype == TSQUERYOID) + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 5, 7, TSVECTOROID, INTERNALOID, + INT2OID, INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + else + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 5, 7, opcintype_overload, INTERNALOID, + INT2OID, INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); break; case GIN_CONSISTENT_PROC: /* Some opclasses omit queryKeys and nullFlags */ - ok = check_amproc_signature(procform->amproc, BOOLOID, false, - 6, 8, INTERNALOID, INT2OID, - opcintype, INT4OID, - INTERNALOID, INTERNALOID, - INTERNALOID, INTERNALOID); + if (opcintype == TSQUERYOID) + ok = check_amproc_signature(procform->amproc, BOOLOID, false, + 6, 8, INTERNALOID, INT2OID, + TSVECTOROID, INT4OID, + INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + else if (opcintype == TSVECTOROID || + opcintype == TIMESTAMPOID || + opcintype == TIMESTAMPTZOID || + opcintype == ANYARRAYOID) + ok = check_amproc_signature(procform->amproc, BOOLOID, false, + 6, 8, INTERNALOID, INT2OID, + opcintype_overload, INT4OID, + INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + else + ok = check_amproc_signature(procform->amproc, BOOLOID, false, + 6, 8, INTERNALOID, INT2OID, + INTERNALOID, INT4OID, + INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); break; case GIN_COMPARE_PARTIAL_PROC: ok = check_amproc_signature(procform->amproc, INT4OID, false, - 4, 
4, opckeytype, opckeytype, + 4, 4, + opckeytype_overload, opckeytype_overload, INT2OID, INTERNALOID); break; case RUM_CONFIG_PROC: @@ -150,21 +186,29 @@ rumvalidate(Oid opclassoid) INTERNALOID, INTERNALOID); break; case RUM_ORDERING_PROC: - ok = check_amproc_signature(procform->amproc, FLOAT8OID, false, - 9, 9, INTERNALOID, INT2OID, - opcintype, INT4OID, - INTERNALOID, INTERNALOID, - INTERNALOID, INTERNALOID, - INTERNALOID); + /* Two possible signatures */ + if (opcintype == TSVECTOROID || + opcintype == ANYARRAYOID) + ok = check_amproc_signature(procform->amproc, FLOAT8OID, false, + 9, 9, INTERNALOID, INT2OID, + opcintype, INT4OID, + INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + else + ok = check_amproc_signature(procform->amproc, FLOAT8OID, false, + 3, 3, + opcintype, opcintype, INT2OID); break; case RUM_OUTER_ORDERING_PROC: ok = check_amproc_signature(procform->amproc, FLOAT8OID, false, 3, 3, - opcintype, opcintype, INT2OID); + opcintype_overload, opcintype_overload, + INT2OID); break; case RUM_ADDINFO_JOIN: ok = check_amproc_signature(procform->amproc, BYTEAOID, false, - 2, 2, opckeytype, opckeytype); + 2, 2, INTERNALOID, INTERNALOID); break; default: ereport(INFO, @@ -207,22 +251,40 @@ rumvalidate(Oid opclassoid) result = false; } - /* rum doesn't support ORDER BY operators */ - if (oprform->amoppurpose != AMOP_SEARCH || - OidIsValid(oprform->amopsortfamily)) + /* Check ORDER BY operator signature */ + if (oprform->amoppurpose == AMOP_ORDER) { - ereport(INFO, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("rum opfamily %s contains invalid ORDER BY specification for operator %s", - opfamilyname, - format_operator(oprform->amopopr)))); - result = false; + /* tsvector's distance returns float4 */ + if (oprform->amoplefttype == TSVECTOROID && + !check_amop_signature(oprform->amopopr, FLOAT4OID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("rum opfamily %s contains invalid ORDER BY specification for operator %s", + opfamilyname, + format_operator(oprform->amopopr)))); + result = false; + } + /* other types distance returns float8 */ + else if (oprform->amoplefttype != TSVECTOROID && + !check_amop_signature(oprform->amopopr, FLOAT8OID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("rum opfamily %s contains invalid ORDER BY specification for operator %s", + opfamilyname, + format_operator(oprform->amopopr)))); + result = false; + } } - - /* Check operator signature --- same for all rum strategies */ - if (!check_amop_signature(oprform->amopopr, BOOLOID, - oprform->amoplefttype, - oprform->amoprighttype)) + /* Check other operator signature */ + else if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) { ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -261,11 +323,10 @@ rumvalidate(Oid opclassoid) if (opclassgroup && (opclassgroup->functionset & (((uint64) 1) << i)) != 0) continue; /* got it */ - if (i == GIN_COMPARE_PARTIAL_PROC) + if (i == GIN_COMPARE_PROC || + i == GIN_COMPARE_PARTIAL_PROC) continue; /* optional method */ - if (i == GIN_CONSISTENT_PROC) - continue; - if (i == RUM_PRE_CONSISTENT_PROC) + if (i >= RUM_CONFIG_PROC) continue; ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/tuplesort10.c b/src/tuplesort10.c new file mode 100644 index 0000000000..5a829a9240 --- /dev/null +++ 
b/src/tuplesort10.c @@ -0,0 +1,4469 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we only do that + * for the first run, and only if the run would otherwise end up being very + * short. We merge the runs using polyphase merge, Knuth's Algorithm + * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by + * logtape.c, which avoids space wastage by recycling disk space as soon + * as each block is read from its "tape". + * + * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for + * the replacement selection, because it uses a fixed number of records + * in memory at all times. Since we are dealing with tuples that may vary + * considerably in size, we want to be able to vary the number of records + * kept in memory to ensure full utilization of the allowed sort memory + * space. So, we keep the tuples in a variable-size heap, with the next + * record to go out at the top of the heap. Like Algorithm 5.4.1R, each + * record is stored with the run number that it must go into, and we use + * (run number, key) as the ordering key for the heap. When the run number + * at the top of the heap changes, we know that no more records of the prior + * run are left in the heap. Note that there are in practice only ever two + * distinct run numbers, because since PostgreSQL 9.6, we only use + * replacement selection to form the first run. + * + * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small + * customizations) is only used with the aim of producing just one run, + * thereby avoiding all merging. Only the first run can use replacement + * selection, which is why there are now only two possible valid run + * numbers, and why heapification is customized to not distinguish between + * tuples in the second run (those will be quicksorted). We generally + * prefer a simple hybrid sort-merge strategy, where runs are sorted in much + * the same way as the entire input of an internal sort is sorted (using + * qsort()). The replacement_sort_tuples GUC controls the limited remaining + * use of replacement selection for the first run. + * + * There are several reasons to favor a hybrid sort-merge strategy. + * Maintaining a priority tree/heap has poor CPU cache characteristics. + * Furthermore, the growth in main memory sizes has greatly diminished the + * value of having runs that are larger than available memory, even in the + * case where there is partially sorted input and runs can be made far + * larger by using a heap. In most cases, a single-pass merge step is all + * that is required even when runs are no larger than available memory. + * Avoiding multiple merge passes was traditionally considered to be the + * major advantage of using replacement selection. 
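+ *
+ * A rough, illustrative example (the numbers here are assumptions, not
+ * taken from this file): with workMem of 64MB, sorting 8GB of input
+ * produces about 128 quicksorted runs of roughly 64MB each; a merge order
+ * of 128 or more then combines them in a single pass, so after run
+ * formation each tuple is written to and read from the temp file only
+ * once.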
+ * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run + * (or two, when replacement selection is still used), then merge the runs + * using Algorithm D. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. 
In the current
 * code we determine the number of tapes M on the basis of workMem: we want
 * workMem/M to be large enough that we read a fair amount of data each time
 * we preread from a tape, so as to maintain the locality of access described
 * above.  Nonetheless, with large workMem we can have many tapes (but not
 * too many -- see the comments in tuplesort_merge_order).
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/sort/tuplesort.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "access/hash.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/tablespace.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/datum.h"
+#include "utils/logtape.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+/* Should be the last include */
+#include "disable_core_macro.h"
+
+/* sort-type codes for sort__start probes */
+#define HEAP_SORT		0
+#define INDEX_SORT		1
+#define DATUM_SORT		2
+#define CLUSTER_SORT	3
+
+/* GUC variables */
+#ifdef TRACE_SORT
+bool		trace_sort = false;
+#endif
+
+#ifdef DEBUG_BOUNDED_SORT
+bool		optimize_bounded_sort = true;
+#endif
+
+
+/*
+ * The objects we actually sort are SortTuple structs.  These contain
+ * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
+ * which is a separate palloc chunk --- we assume it is just one chunk and
+ * can be freed by a simple pfree() (except during merge, when we use a
+ * simple slab allocator).  SortTuples also contain the tuple's first key
+ * column in Datum/nullflag format, and an index integer.
+ *
+ * Storing the first key column lets us save heap_getattr or index_getattr
+ * calls during tuple comparisons.  We could extract and save all the key
+ * columns not just the first, but this would increase code complexity and
+ * overhead, and wouldn't actually save any comparison cycles in the common
+ * case where the first key determines the comparison result.  Note that
+ * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
+ *
+ * There is one special case: when the sort support infrastructure provides an
+ * "abbreviated key" representation, where the key is (typically) a pass by
+ * value proxy for a pass by reference type.  In this case, the abbreviated key
+ * is stored in datum1 in place of the actual first key column.
+ *
+ * When sorting single Datums, the data value is represented directly by
+ * datum1/isnull1 for pass by value types (or null values).  If the datatype is
+ * pass-by-reference and isnull1 is false, then "tuple" points to a separately
+ * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
+ * either the same pointer as "tuple", or is an abbreviated key value as
+ * described above.  Accordingly, "tuple" is always used in preference to
+ * datum1 as the authoritative value for pass-by-reference cases.
+ *
+ * While building initial runs, tupindex holds the tuple's run number.
+ * Historically, the run number could meaningfully distinguish many runs, but + * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement + * selection is always abandoned after the first run; no other run number + * should be represented here. During merge passes, we re-use it to hold the + * input tape number that each tuple in the heap was read from. tupindex goes + * unused if the sort occurs entirely in memory. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int tupindex; /* see notes above */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + + /* + * Run numbers, used during external sort operations. + * + * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun. + */ +#define RUN_FIRST 0 +#define HEAP_RUN_NEXT INT_MAX +#define RUN_SECOND 1 + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? 
*/
+	int64		availMem;		/* remaining memory available, in bytes */
+	int64		allowedMem;		/* total memory allowed, in bytes */
+	int			maxTapes;		/* number of tapes (Knuth's T) */
+	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
+	MemoryContext sortcontext;	/* memory context holding most sort data */
+	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
+	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
+
+	/*
+	 * These function pointers decouple the routines that must know what kind
+	 * of tuple we are sorting from the routines that don't need to know it.
+	 * They are set up by the tuplesort_begin_xxx routines.
+	 *
+	 * Function to compare two tuples; result is per qsort() convention, ie:
+	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
+	 * qsort_arg_comparator.
+	 */
+	SortTupleComparator comparetup;
+
+	/*
+	 * Function to copy a supplied input tuple into palloc'd space and set up
+	 * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
+	 * state->availMem must be decreased by the amount of space used for the
+	 * tuple copy (note the SortTuple struct itself is not counted).
+	 */
+	void		(*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
+
+	/*
+	 * Function to write a stored tuple onto tape.  The representation of the
+	 * tuple on tape need not be the same as it is in memory; requirements on
+	 * the tape representation are given below.  Unless the slab allocator is
+	 * used, after writing the tuple, pfree() the out-of-line data (not the
+	 * SortTuple struct!), and increase state->availMem by the amount of
+	 * memory space thereby released.
+	 */
+	void		(*writetup) (Tuplesortstate *state, int tapenum,
+							 SortTuple *stup);
+
+	/*
+	 * Function to read a stored tuple from tape back into memory. 'len' is
+	 * the already-read length of the stored tuple.  The tuple is allocated
+	 * from the slab memory arena, or is palloc'd, see readtup_alloc().
+	 */
+	void		(*readtup) (Tuplesortstate *state, SortTuple *stup,
+							int tapenum, unsigned int len);
+
+	/*
+	 * This array holds the tuples now in sort memory.  If we are in state
+	 * INITIAL, the tuples are in no particular order; if we are in state
+	 * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
+	 * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
+	 * H.  In state SORTEDONTAPE, the array is not used.
+	 */
+	SortTuple  *memtuples;		/* array of SortTuple structs */
+	int			memtupcount;	/* number of tuples currently present */
+	int			memtupsize;		/* allocated length of memtuples array */
+	bool		growmemtuples;	/* memtuples' growth still underway? */
+
+	/*
+	 * Memory for tuples is sometimes allocated using a simple slab allocator,
+	 * rather than with palloc().  Currently, we switch to slab allocation
+	 * when we start merging.  Merging only needs to keep a small, fixed
+	 * number of tuples in memory at any time, so we can avoid the
+	 * palloc/pfree overhead by recycling a fixed number of fixed-size slots
+	 * to hold the tuples.
+	 *
+	 * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
+	 * slots.  The allocation is sized to have one slot per tape, plus one
+	 * additional slot.  We need that many slots to hold all the tuples kept
+	 * in the heap during merge, plus the one we have last returned from the
+	 * sort, with tuplesort_gettuple.
+	 *
+	 * Initially, all the slots are kept in a linked list of free slots. When
+	 * a tuple is read from a tape, it is put to the next available slot, if
+	 * it fits.
If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this indicates if the replacement + * selection strategy is in use. When it isn't, then a simple hybrid + * sort-merge strategy is in use instead (runs are quicksorted). + */ + bool replaceActive; + + /* + * While building initial runs, this is the current output run number + * (starting at RUN_FIRST). Afterwards, it is the number of initial runs + * we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. 
+ */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. 
When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static bool useselection(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void dumpbatch(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex); +static void tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple 
*stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. 
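+	 * (A wholesale reset of tuplecontext between runs is also cheaper than
+	 * pfree'ing the individual tuples one at a time.)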
+ */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + state->allowedMem = workMem * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->slabAllocatorUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = RUN_FIRST; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
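+/*
+ * Illustrative aside, not part of the original file: why abbreviated keys
+ * always need a tie-breaker, which is exactly what rules out the "onlyKey"
+ * fast path above.  Distinct values can share an abbreviation (here, a
+ * 4-byte prefix packed into an integer), so equal abbreviations prove
+ * nothing and the full comparison must run.  Names are hypothetical.
+ */
+#include <stdint.h>
+#include <string.h>
+
+static uint32_t
+abbrev_key(const char *s)
+{
+    uint32_t    v = 0;
+    int         i;
+
+    /* pack up to 4 leading bytes big-endian, so integer order matches memcmp */
+    for (i = 0; i < 4 && s[i] != '\0'; i++)
+        v |= (uint32_t) (unsigned char) s[i] << (24 - 8 * i);
+    return v;
+}
+
+static int
+cmp_with_abbrev(uint32_t ab_a, const char *a, uint32_t ab_b, const char *b)
+{
+    if (ab_a != ab_b)
+        return (ab_a < ab_b) ? -1 : 1;  /* resolved by the cheap comparison */
+    return strcmp(a, b);                /* tie-breaker on the full values */
+}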
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey_nodata(indexRel); + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple if needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each sortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL); + Assert(state->memtupcount == 0); + Assert(!state->bounded); + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards!
+ */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "external sort ended, %ld disk blocks used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "internal sort ended, %ld KB used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return TRUE if we were able to enlarge the array, FALSE if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. 
In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
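+/*
+ * Illustrative aside, not part of the original file: the growth policy of
+ * grow_memtuples() above with the memory accounting stripped away.  Double
+ * the array while no more than half the budget is used; after that, make
+ * one final proportional increase and stop growing.  Names are
+ * hypothetical.
+ */
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+static int
+next_array_size(int cur_size, int64_t mem_used, int64_t mem_allowed,
+                bool *keep_growing)
+{
+    double      grow_ratio;
+
+    if (mem_used <= mem_allowed - mem_used)
+    {
+        /* at most half the budget used: keep doubling, clamped at INT_MAX */
+        if (cur_size < INT_MAX / 2)
+            return cur_size * 2;
+        *keep_growing = false;
+        return INT_MAX;
+    }
+
+    /* last increment: extrapolate from consumption so far, then stop */
+    *keep_growing = false;
+    grow_ratio = (double) mem_allowed / (double) mem_used;
+    if (cur_size * grow_ratio < INT_MAX)
+        return (int) (cur_size * grow_ratio);
+    return INT_MAX;
+}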
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
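+/*
+ * Illustrative aside, not part of the original file: the "abort
+ * abbreviation" repair pass described above.  Once abbreviation is
+ * abandoned, datum1 of every tuple collected so far must be rewritten to
+ * the full value, so that all in-memory tuples share one representation.
+ * The struct and names are hypothetical stand-ins.
+ */
+#include <stdint.h>
+
+typedef struct MiniSortTuple
+{
+    uintptr_t   datum1;         /* abbreviated key, or full value */
+    void       *tuple;          /* authoritative copy of the value */
+} MiniSortTuple;
+
+static void
+abort_abbreviation(MiniSortTuple *tuples, int ntuples)
+{
+    int         i;
+
+    /* from here on, datum1 must hold the full representation */
+    for (i = 0; i < ntuples; i++)
+        tuples[i].datum1 = (uintptr_t) tuples[i].tuple;
+}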
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
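+/*
+ * Illustrative aside, not part of the original file: the bounded top-N
+ * selection performed by the TSS_BOUNDED case that follows, written in
+ * natural (un-reversed) order.  The N smallest values are kept in a
+ * max-heap, so most inputs are discarded after a single comparison with
+ * the heap top; the file keeps its comparison direction reversed, which
+ * is why its test reads <= where this sketch reads >=.  Names are
+ * hypothetical.
+ */
+static void
+maxheap_sift_down(int *heap, int n, int i)
+{
+    for (;;)
+    {
+        int         l = 2 * i + 1;
+        int         r = 2 * i + 2;
+        int         big = i;
+        int         tmp;
+
+        if (l < n && heap[l] > heap[big])
+            big = l;
+        if (r < n && heap[r] > heap[big])
+            big = r;
+        if (big == i)
+            break;
+        tmp = heap[i];
+        heap[i] = heap[big];
+        heap[big] = tmp;
+        i = big;
+    }
+}
+
+/* offer one value to a full max-heap holding the N smallest seen so far */
+static void
+topn_offer(int *heap, int n, int value)
+{
+    if (value >= heap[0])
+        return;                 /* not smaller than the current worst */
+    heap[0] = value;            /* replace the top ... */
+    maxheap_sift_down(heap, n, 0);      /* ... and restore the heap */
+}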
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state); + + /* + * Dump tuples until we are back under the limit. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuple->tupindex = 0; /* not used */ + tuplesort_heap_replace_top(state, tuple, false); + } + break; + + case TSS_BUILDRUNS: + + /* + * Insert the tuple into the heap, with run number currentRun if + * it can go into the current run, else HEAP_RUN_NEXT. The tuple + * can go into the current run if it is >= the first + * not-yet-output tuple. (Actually, it could go into the current + * run if it is >= the most recently output tuple ... but that + * would require keeping around the tuple we last output, and it's + * simplest to let writetup free each tuple as soon as it's + * written.) + * + * Note that this only applies when: + * + * - currentRun is RUN_FIRST + * + * - Replacement selection is in use (typically it is never used). + * + * When these two conditions are not both true, all tuples are + * appended indifferently, much like the TSS_INITIAL case. + * + * There should always be room to store the incoming tuple. + */ + Assert(!state->replaceActive || state->memtupcount > 0); + if (state->replaceActive && + COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) + { + Assert(state->currentRun == RUN_FIRST); + + /* + * Insert tuple into first, fully heapified run. + * + * Unlike classic replacement selection, which this module was + * previously based on, only RUN_FIRST tuples are fully + * heapified. Any second/next run tuples are appended + * indifferently. While HEAP_RUN_NEXT tuples may be sifted + * out of the way of first run tuples, COMPARETUP() will never + * be called for the run's tuples during sifting (only our + * initial COMPARETUP() call is required for the tuple, to + * determine that the tuple does not belong in RUN_FIRST). + */ + tuple->tupindex = state->currentRun; + tuplesort_heap_insert(state, tuple, true); + } + else + { + /* + * Tuple was determined to not belong to heapified RUN_FIRST, + * or replacement selection not in play. Append the tuple to + * memtuples indifferently. + * + * dumptuples() does not trust that the next run's tuples are + * heapified. Anything past the first run will always be + * quicksorted even when replacement selection is initially + * used. 
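+/*
+ * Illustrative aside, not part of the original file: the run-number
+ * decision described above for replacement selection.  A tuple that sorts
+ * at or after the smallest not-yet-output tuple of the current run can
+ * still join that run; anything smaller must wait for the next run.
+ * Names are hypothetical.
+ */
+static int
+choose_run(int incoming, int current_run_heap_top, int current_run)
+{
+    if (incoming >= current_run_heap_top)
+        return current_run;     /* safe to emit within the current run */
+    return current_run + 1;     /* would break sorted order: defer */
+}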
(When it's never used, every tuple still takes this + * path.) + */ + tuple->tupindex = HEAP_RUN_NEXT; + state->memtuples[state->memtupcount++] = *tuple; + } + + /* + * If we are over the memory limit, dump tuples till we're under. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort starting: %s", + pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory. Just qsort 'em and we're done. + */ + tuplesort_sort_memtuples(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess, one run per tape). Note that + * mergeruns sets the correct state->status. 
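+/*
+ * Illustrative aside, not part of the original file: the checking cadence
+ * of consider_abort_common() above.  Re-evaluating abbreviation
+ * effectiveness only at geometrically increasing tuple counts (abbrevNext
+ * starts at 10 and doubles) bounds the number of checks at O(log n) for n
+ * tuples.  Names are hypothetical.
+ */
+#include <stdbool.h>
+
+static bool
+time_to_reconsider(int ntuples, int *next_check)
+{
+    if (ntuples < *next_check)
+        return false;
+    *next_check *= 2;           /* 10, 20, 40, 80, ... */
+    return true;
+}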
+ */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort done (except %d-way final merge): %s", + state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort done: %s", + pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns FALSE if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * If all tuples have already been fetched, return the last + * tuple; otherwise return the tuple just before the last + * one returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * If all tuples have already been fetched, return the last tuple; + * otherwise return the tuple just before the last one returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file.
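+/*
+ * Illustrative aside, not part of the original file: the on-tape record
+ * layout that makes the backward scan below workable.  Each tuple is
+ * written as a front length word, the payload, and (when random access is
+ * requested) a trailing copy of the length word, so "[len][payload][len]"
+ * can be walked backwards with plain seek arithmetic.  A FILE* stands in
+ * for a logical tape; names are hypothetical and the stream is assumed to
+ * start positioned just past a record's trailing length word.
+ */
+#include <stdio.h>
+
+static int
+read_prev_record(FILE *f, char *buf, unsigned int maxlen)
+{
+    unsigned int len;
+
+    /*
+     * Back up over the previous record's trailing length word; a failed
+     * seek means we are at the start of the file.
+     */
+    if (fseek(f, -(long) sizeof(len), SEEK_CUR) != 0)
+        return -1;
+    if (fread(&len, sizeof(len), 1, f) != 1 || len > maxlen)
+        return -1;
+
+    /* back up over [len][payload][len], skip the front word, read payload */
+    if (fseek(f, -(long) (len + 2 * sizeof(len)), SEEK_CUR) != 0)
+        return -1;
+    if (fread(&len, sizeof(len), 1, f) != 1)
+        return -1;
+    if (fread(buf, 1, len, f) != len)
+        return -1;
+
+    /* reposition to the record's front so the caller can keep backing up */
+    if (fseek(f, -(long) (len + sizeof(len)), SEEK_CUR) != 0)
+        return -1;
+    return (int) len;
+}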
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state, false); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. 
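+/*
+ * Illustrative aside, not part of the original file: the shape of the
+ * final on-the-fly merge above.  A min-heap holds one element per input
+ * run; each extraction is replaced by the next element from the same run,
+ * and the heap shrinks when a run is exhausted.  Names are hypothetical;
+ * the next_from_run callback stands in for mergereadnext().
+ */
+#include <stdbool.h>
+
+typedef struct MergeEntry
+{
+    int         value;          /* current head element of the run */
+    int         src;            /* which input run it came from */
+} MergeEntry;
+
+static bool
+merge_next(MergeEntry *heap, int *heap_size,
+           bool (*next_from_run) (int src, int *out), int *out)
+{
+    int         next;
+    int         i;
+
+    if (*heap_size == 0)
+        return false;
+
+    *out = heap[0].value;       /* smallest element across all runs */
+
+    if (next_from_run(heap[0].src, &next))
+        heap[0].value = next;   /* replace top with that run's next */
+    else
+        heap[0] = heap[--(*heap_size)];         /* run exhausted */
+
+    /* sift the new top down to restore the heap property */
+    for (i = 0;;)
+    {
+        int         l = 2 * i + 1;
+        int         r = 2 * i + 2;
+        int         small = i;
+        MergeEntry  tmp;
+
+        if (l < *heap_size && heap[l].value < heap[small].value)
+            small = l;
+        if (r < *heap_size && heap[r].value < heap[small].value)
+            small = r;
+        if (small == i)
+            break;
+        tmp = heap[i];
+        heap[i] = heap[small];
+        heap[small] = tmp;
+        i = small;
+    }
+    return true;
+}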
+ */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &newtup, false); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return TRUE; else, clear the slot + * and return FALSE. + * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. + */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns FALSE if no more datums. 
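+/*
+ * Illustrative usage sketch, not part of the original file: the datum-sort
+ * API (tuplesort_begin_datum / putdatum / performsort / getdatum), end to
+ * end.  INT4_LT_OP is a hypothetical stand-in for the OID of the int4 "<"
+ * operator; error handling is omitted.
+ */
+static void
+datum_sort_demo(void)
+{
+    Tuplesortstate *sort;
+    Datum       val;
+    bool        isnull;
+
+    sort = tuplesort_begin_datum(INT4OID, INT4_LT_OP, InvalidOid,
+                                 false,         /* nulls sort last */
+                                 work_mem,      /* KB before spilling */
+                                 false);        /* no random access */
+
+    tuplesort_putdatum(sort, Int32GetDatum(3), false);
+    tuplesort_putdatum(sort, Int32GetDatum(1), false);
+    tuplesort_putdatum(sort, Int32GetDatum(2), false);
+
+    tuplesort_performsort(sort);
+
+    while (tuplesort_getdatum(sort, true, &val, &isnull, NULL))
+        elog(LOG, "next value: %d", DatumGetInt32(val));
+
+    tuplesort_end(sort);
+}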
+ * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. + */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns TRUE if successful, FALSE if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. 
In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * useselection - determine algorithm to use to sort first run. + * + * It can sometimes be useful to use the replacement selection algorithm if it + * results in one large run, and there is little available workMem. See + * remarks on RUN_SECOND optimization within dumptuples(). + */ +static bool +useselection(Tuplesortstate *state) +{ + /* + * memtupsize might be noticeably higher than memtupcount here in atypical + * cases. It seems slightly preferable to not allow recent outliers to + * impact this determination. Note that caller's trace_sort output + * reports memtupcount instead. + */ + if (state->memtupsize <= replacement_sort_tuples) + return true; + + return false; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we don't have room to sort in memory. + */ +static void +inittapes(Tuplesortstate *state) +{ + int maxTapes, + j; + int64 tapeSpace; + + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + + state->maxTapes = maxTapes; + state->tapeRange = maxTapes - 1; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to external sort with %d tapes: %s", + maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Decrease availMem to reflect the space needed for tape buffers, when + * writing the initial runs; but don't decrease it to the point that we + * have no room for tuples. (That case is only likely to occur if sorting + * pass-by-value Datums; in all other scenarios the memtuples[] array is + * unlikely to occupy more than half of allowedMem. In the pass-by-value + * case it's not important to account for tuple space, so we don't care if + * LACKMEM becomes inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. + */ + PrepareTempTablespaces(); + + /* + * Create the tape set and allocate the per-tape data arrays. 
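+/*
+ * Illustrative aside, not part of the original file: the arithmetic of
+ * tuplesort_merge_order() above.  The constants below are placeholders,
+ * not the values this file compiles with.
+ */
+#include <stdint.h>
+
+#define DEMO_TAPE_OVERHEAD  (32 * 1024)         /* per-tape buffer, bytes */
+#define DEMO_MERGE_BUFFER   (256 * 1024)        /* per-input-tape workspace */
+#define DEMO_MINORDER       6
+#define DEMO_MAXORDER       500
+
+static int
+demo_merge_order(int64_t allowed_mem)
+{
+    /* the output tape needs only a buffer; each input tape needs both */
+    int         morder = (int) ((allowed_mem - DEMO_TAPE_OVERHEAD) /
+                                (DEMO_MERGE_BUFFER + DEMO_TAPE_OVERHEAD));
+
+    if (morder < DEMO_MINORDER)
+        morder = DEMO_MINORDER;
+    if (morder > DEMO_MAXORDER)
+        morder = DEMO_MAXORDER;
+    return morder;
+}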
+ */ + state->tapeset = LogicalTapeSetCreate(maxTapes); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* + * Give replacement selection a try based on user setting. There will be + * a switch to a simple hybrid sort-merge strategy after the first run + * (iff we could not output one long run). + */ + state->replaceActive = useselection(state); + + if (state->replaceActive) + { + /* + * Convert the unsorted contents of memtuples[] into a heap. Each + * tuple is marked as belonging to run number zero. + * + * NOTE: we pass false for checkIndex since there's no point in + * comparing indexes in this step, even though we do intend the + * indexes to be part of the sort key... + */ + int ntuples = state->memtupcount; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "replacement selection will sort %d first run tuples", + state->memtupcount); +#endif + state->memtupcount = 0; /* make the heap empty */ + + for (j = 0; j < ntuples; j++) + { + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[j]; + + stup.tupindex = RUN_FIRST; + tuplesort_heap_insert(state, &stup, false); + } + Assert(state->memtupcount == ntuples); + } + + state->currentRun = RUN_FIRST; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. 
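+/*
+ * Illustrative aside, not part of the original file: steps D3/D4 of the
+ * run distribution implemented by selectnewtape() above.  Runs are dealt
+ * out so per-tape totals track generalized Fibonacci numbers, with "dummy"
+ * runs padding the difference until the next perfect level.  fib[] and
+ * dummy[] have ntapes + 1 entries (the last belongs to the output tape);
+ * names are hypothetical.
+ */
+static int
+pick_dest_tape(int *fib, int *dummy, int ntapes, int dest, int *level)
+{
+    int         a;
+    int         j;
+
+    /* D3: advance while the next tape is owed strictly more dummy runs */
+    if (dummy[dest] < dummy[dest + 1])
+        return dest + 1;
+    if (dummy[dest] != 0)
+        return 0;
+
+    /* D4: all dummies consumed; move to the next Fibonacci level */
+    (*level)++;
+    a = fib[0];
+    for (j = 0; j < ntapes; j++)
+    {
+        dummy[j] = a + fib[j + 1] - fib[j];
+        fib[j] = a + fib[j + 1];
+    }
+    return 0;
+}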
All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextDelete(state->tuplecontext); + state->tuplecontext = NULL; + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! + */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * If we produced only one initial run (quite likely if the total data + * volume is between 1X and 2X workMem when replacement selection is used, + * but something we particularly count on when input is presorted), we can + * just use that tape as the finished output, rather than doing a useless + * merge. (This obvious optimization is not in Knuth's algorithm.) + */ + if (state->currentRun == RUN_SECOND) + { + state->result_tape = state->tp_tapenum[state->destTape]; + /* must freeze and rewind the finished output tape */ + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + return; + } + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes.
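+/*
+ * Illustrative aside, not part of the original file: the slab allocator
+ * set up by init_slab_allocator() above, reduced to a push/pop free list
+ * of fixed-size slots carved out of one arena (assumes nslots > 0).  The
+ * slot size and names are hypothetical.
+ */
+#include <stdlib.h>
+
+#define DEMO_SLOT_SIZE 1024
+
+typedef union DemoSlot
+{
+    union DemoSlot *nextfree;   /* valid only while the slot is free */
+    char        space[DEMO_SLOT_SIZE];
+} DemoSlot;
+
+static DemoSlot *demo_free_head;
+
+static void
+demo_slab_init(int nslots)
+{
+    DemoSlot   *arena = malloc((size_t) nslots * sizeof(DemoSlot));
+    int         i;
+
+    /* thread every slot onto the free list */
+    for (i = 0; i < nslots - 1; i++)
+        arena[i].nextfree = &arena[i + 1];
+    arena[nslots - 1].nextfree = NULL;
+    demo_free_head = arena;
+}
+
+static void *
+demo_slab_alloc(void)
+{
+    DemoSlot   *slot = demo_free_head;  /* NULL when all slots are in use */
+
+    if (slot)
+        demo_free_head = slot->nextfree;
+    return slot;
+}
+
+static void
+demo_slab_free(void *p)
+{
+    DemoSlot   *slot = (DemoSlot *) p;
+
+    slot->nextfree = demo_free_head;    /* push back onto the free list */
+    demo_free_head = slot;
+}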
+ * + * We do this only after checking for the case that we produced only one + * initial run, because there is no need to use a large read buffer when + * we're reading from a single tape. With one tape, the I/O pattern will + * be the same regardless of the buffer size. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + (state->availMem) / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. + */ + if (!state->randomAccess) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 
1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. + */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &stup, false); + + } + else + tuplesort_heap_delete_top(state, false); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished %d-way merge step: %s", state->activeTapes, + pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. 
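
The merge driven by beginmerge() and mergeonerun() reduces to a small heap-based k-way merge. A sketch with int arrays standing in for tapes and -1 standing in for the end-of-run marker, both assumptions:

    #include <stdio.h>

    typedef struct
    {
        int     value;
        int     src;             /* source run, cf. SortTuple.tupindex */
    } HeapItem;

    static HeapItem heap[8];
    static int heapsize;

    /* sift-up insert, cf. tuplesort_heap_insert */
    static void
    heap_insert(HeapItem it)
    {
        int j = heapsize++;

        while (j > 0 && heap[(j - 1) / 2].value > it.value)
        {
            heap[j] = heap[(j - 1) / 2];
            j = (j - 1) / 2;
        }
        heap[j] = it;
    }

    /* replace the root and sift down, cf. tuplesort_heap_replace_top */
    static void
    heap_replace_top(HeapItem it)
    {
        unsigned int i = 0,
                     n = heapsize;

        for (;;)
        {
            unsigned int j = 2 * i + 1;

            if (j >= n)
                break;
            if (j + 1 < n && heap[j + 1].value < heap[j].value)
                j++;
            if (it.value <= heap[j].value)
                break;
            heap[i] = heap[j];
            i = j;
        }
        heap[i] = it;
    }

    int
    main(void)
    {
        static const int r0[] = {1, 4, 9, -1};
        static const int r1[] = {2, 3, 8, -1};
        static const int r2[] = {5, 6, 7, -1};
        const int  *runs[3] = {r0, r1, r2};
        int         pos[3] = {0, 0, 0};

        /* "beginmerge": load the first element of each run */
        for (int s = 0; s < 3; s++)
            heap_insert((HeapItem) {runs[s][pos[s]++], s});

        /* "mergeonerun": pop the smallest, refill from the same source */
        while (heapsize > 0)
        {
            HeapItem top = heap[0];
            int      next = runs[top.src][pos[top.src]++];

            printf("%d ", top.value);
            if (next != -1)
                heap_replace_top((HeapItem) {next, top.src});
            else if (--heapsize > 0)
                heap_replace_top(heap[heapsize]);   /* delete-top */
        }
        printf("\n");
        return 0;
    }

This prints 1 through 9 in order while never holding more than one element per run in memory, which is the property the comments above rely on.
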
+ */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.tupindex = srcTape; + tuplesort_heap_insert(state, &tup, false); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write to tape + * + * This is used during initial-run building, but not during merging. + * + * When alltuples = false and replacement selection is still active, dump + * only enough tuples to get under the availMem limit (and leave at least + * one tuple in memtuples, since puttuple will then assume it is a heap that + * has a tuple to compare to). We always insist there be at least one free + * slot in the memtuples[] array. + * + * When alltuples = true, dump everything currently in memory. (This + * case is only used at end of input data, although in practice only the + * first run could fail to dump all tuples when we LACKMEM(), and only + * when replacement selection is active.) + * + * If, when replacement selection is active, we see that the tuple run + * number at the top of the heap has changed, start a new run. This must be + * the first run, because replacement selection is always abandoned for all + * further runs. + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + while (alltuples || + (LACKMEM(state) && state->memtupcount > 1) || + state->memtupcount >= state->memtupsize) + { + if (state->replaceActive) + { + /* + * Still holding out for a case favorable to replacement + * selection. Still incrementally spilling using heap. + * + * Dump the heap's frontmost entry, and remove it from the heap. + */ + Assert(state->memtupcount > 0); + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[0]); + tuplesort_heap_delete_top(state, true); + } + else + { + /* + * Once committed to quicksorting runs, never incrementally spill + */ + dumpbatch(state, alltuples); + break; + } + + /* + * If top run number has changed, we've finished the current run (this + * can only be the first run), and will no longer spill incrementally. 
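
A toy version of the replacement selection being described: four memory slots, and any incoming value smaller than the last value written out is tagged for the next run. A linear scan stands in for the heap, and the input data is made up:

    #include <stdio.h>

    #define MEM    4                    /* slots, standing in for workMem */
    #define NINPUT 10

    int
    main(void)
    {
        static const int input[NINPUT] = {5, 1, 8, 2, 9, 3, 7, 4, 6, 0};
        int     val[MEM];
        int     tag[MEM];               /* run number, cf. SortTuple.tupindex */
        int     nin = 0;
        int     count = 0;
        int     currun = 0;
        int     last = 0;

        while (count < MEM && nin < NINPUT)
        {
            val[count] = input[nin++];
            tag[count++] = 0;
        }

        while (count > 0)
        {
            int best = -1;

            /* smallest element still belonging to the current run */
            for (int i = 0; i < count; i++)
                if (tag[i] == currun && (best < 0 || val[i] < val[best]))
                    best = i;

            if (best < 0)
            {
                printf("| ");           /* end of run, start the next one */
                currun++;
                continue;
            }

            printf("%d ", val[best]);
            last = val[best];

            if (nin < NINPUT)
            {
                val[best] = input[nin++];
                /* smaller than the last value written: defer to next run */
                tag[best] = (val[best] < last) ? currun + 1 : currun;
            }
            else
            {
                count--;
                val[best] = val[count];
                tag[best] = tag[count];
            }
        }
        printf("\n");
        return 0;
    }

The output is "1 2 3 5 7 8 9 | 0 4 6": the first run comes out longer than memory, which is the favorable case the replaceActive path holds out for.
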
+ */ + if (state->memtupcount == 0 || + state->memtuples[0].tupindex == HEAP_RUN_NEXT) + { + markrunend(state, state->tp_tapenum[state->destTape]); + Assert(state->currentRun == RUN_FIRST); + state->currentRun++; + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished incrementally writing %s run %d to tape %d: %s", + (state->memtupcount == 0) ? "only" : "first", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Done if heap is empty, which is possible when there is only one + * long run. + */ + Assert(state->currentRun == RUN_SECOND); + if (state->memtupcount == 0) + { + /* + * Replacement selection best case; no final merge required, + * because there was only one initial run (second run has no + * tuples). See RUN_SECOND case in mergeruns(). + */ + break; + } + + /* + * Abandon replacement selection for second run (as well as any + * subsequent runs). + */ + state->replaceActive = false; + + /* + * First tuple of next run should not be heapified, and so will + * bear placeholder run number. In practice this must actually be + * the second run, which just became the currentRun, so we're + * clear to quicksort and dump the tuples in batch next time + * memtuples becomes full. + */ + Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT); + selectnewtape(state); + } + } +} + +/* + * dumpbatch - sort and dump all memtuples, forming one run on tape + * + * Second or subsequent runs are never heapified by this module (although + * heapification still respects run number differences between the first and + * second runs), and a heap (replacement selection priority queue) is often + * avoided in the first place. + */ +static void +dumpbatch(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. 
+ */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. + */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished writing run %d to tape %d: %s", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case 
TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + * spaceUsed is measured in kilobytes. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + const char **sortMethod, + const char **spaceType, + long *spaceUsed) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + *spaceType = "Disk"; + *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + *spaceType = "Memory"; + *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + *sortMethod = "top-N heapsort"; + else + *sortMethod = "quicksort"; + break; + case TSS_SORTEDONTAPE: + *sortMethod = "external sort"; + break; + case TSS_FINALMERGE: + *sortMethod = "external merge"; + break; + default: + *sortMethod = "still in progress"; + break; + } +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + * + * Compare two SortTuples. If checkIndex is true, use the tuple index + * as the front of the sort key; otherwise, no. + * + * Note that for checkIndex callers, the heap invariant is never + * maintained beyond the first run, and so there are no COMPARETUP() + * calls needed to distinguish tuples in HEAP_RUN_NEXT. + */ + +#define HEAPCOMPARE(tup1,tup2) \ + (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \ + (tup1)->tupindex == HEAP_RUN_NEXT) ? \ + ((tup1)->tupindex) - ((tup2)->tupindex) : \ + COMPARETUP(state, tup1, tup2)) + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + * + * We assume that all entries in a bounded heap will always have tupindex + * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse + * the direction of comparison for tupindexes. 
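
The bounded-heap trick in miniature: to keep the N smallest values, maintain a max-heap of N entries and replace its root whenever a smaller value arrives. The bound and the input values below are made up:

    #include <stdio.h>

    #define BOUND 4

    static int heap[BOUND];
    static int heapcount;

    /* sift a new value down from the root of the max-heap */
    static void
    sift_down(int v)
    {
        int i = 0;

        for (;;)
        {
            int j = 2 * i + 1;

            if (j >= heapcount)
                break;
            if (j + 1 < heapcount && heap[j + 1] > heap[j])
                j++;                     /* pick the larger child */
            if (v >= heap[j])
                break;
            heap[i] = heap[j];
            i = j;
        }
        heap[i] = v;
    }

    int
    main(void)
    {
        static const int input[] = {42, 7, 19, 3, 88, 1, 56, 23, 11, 64};

        for (int k = 0; k < 10; k++)
        {
            int v = input[k];

            if (heapcount < BOUND)
            {
                /* sift-up insert while the heap is still filling */
                int j = heapcount++;

                while (j > 0 && heap[(j - 1) / 2] < v)
                {
                    heap[j] = heap[(j - 1) / 2];
                    j = (j - 1) / 2;
                }
                heap[j] = v;
            }
            else if (v < heap[0])
                sift_down(v);            /* evict the largest kept value */
            /* else: larger than everything kept, discard */
        }

        for (int j = 0; j < heapcount; j++)
            printf("%d ", heap[j]);      /* the BOUND smallest, heap order */
        printf("\n");
        return 0;
    }

Keeping the largest retained value at the root is exactly why make_bounded_heap() temporarily reverses the sort direction: it makes the one value that may need discarding the cheapest one to reach.
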
+ */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + stup.tupindex = 0; /* not used */ + tuplesort_heap_insert(state, &stup, false); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i], false); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. + */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state, false); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts. Quicksort is also generally + * preferred to replacement selection for generating runs during external sort + * operations, although replacement selection is sometimes used for the first + * run. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! 
+ */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + Assert(!checkIndex || tuple->tupindex == RUN_FIRST); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + Assert(!checkIndex || state->currentRun == RUN_FIRST); + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple, checkIndex); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(!checkIndex || state->currentRun == RUN_FIRST); + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. + */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. 
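
The slab arena described above is a free list threaded through one large allocation. A self-contained sketch with made-up sizes, mirroring init_slab_allocator() and the pointer-range test used by RELEASE_SLAB_SLOT():

    #include <stdio.h>
    #include <stdlib.h>

    #define SLOT_SIZE 64
    #define NUM_SLOTS 4

    typedef union Slot
    {
        union Slot *nextfree;    /* valid while the slot is on the free list */
        char        buffer[SLOT_SIZE];
    } Slot;

    static char *arena_begin;
    static char *arena_end;
    static Slot *freehead;

    static void
    slab_init(void)
    {
        char *p;

        arena_begin = malloc(NUM_SLOTS * SLOT_SIZE);
        arena_end = arena_begin + NUM_SLOTS * SLOT_SIZE;
        freehead = (Slot *) arena_begin;

        /* chain every slot onto the free list */
        p = arena_begin;
        for (int i = 0; i < NUM_SLOTS - 1; i++, p += SLOT_SIZE)
            ((Slot *) p)->nextfree = (Slot *) (p + SLOT_SIZE);
        ((Slot *) p)->nextfree = NULL;
    }

    static void *
    slab_alloc(size_t len)
    {
        Slot *buf;

        if (len > SLOT_SIZE || freehead == NULL)
            return malloc(len);  /* oversized or exhausted: plain malloc */
        buf = freehead;
        freehead = buf->nextfree;
        return buf;
    }

    static void
    slab_free(void *ptr)
    {
        char *p = ptr;

        if (p >= arena_begin && p < arena_end)   /* came from the slab? */
        {
            ((Slot *) ptr)->nextfree = freehead;
            freehead = (Slot *) ptr;
        }
        else
            free(ptr);
    }

    int
    main(void)
    {
        void *a, *b;

        slab_init();
        a = slab_alloc(48);      /* fits in a slot */
        b = slab_alloc(200);     /* too large: falls back to malloc */
        printf("a from slab: %d, b from slab: %d\n",
               (char *) a >= arena_begin && (char *) a < arena_end,
               (char *) b >= arena_begin && (char *) b < arena_end);
        slab_free(a);
        slab_free(b);
        return 0;
    }
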
+ */
+static void *
+readtup_alloc(Tuplesortstate *state, Size tuplen)
+{
+    SlabSlot   *buf;
+
+    /*
+     * We pre-allocate enough slots in the slab arena that we should never run
+     * out.
+     */
+    Assert(state->slabFreeHead);
+
+    if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
+        return MemoryContextAlloc(state->sortcontext, tuplen);
+    else
+    {
+        buf = state->slabFreeHead;
+        /* Reuse this slot */
+        state->slabFreeHead = buf->nextfree;
+
+        return buf;
+    }
+}
+
+
+/*
+ * Routines specialized for HeapTuple (actually MinimalTuple) case
+ */
+
+static int
+comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+    SortSupport sortKey = state->sortKeys;
+    HeapTupleData ltup;
+    HeapTupleData rtup;
+    TupleDesc   tupDesc;
+    int         nkey;
+    int32       compare;
+    AttrNumber  attno;
+    Datum       datum1,
+                datum2;
+    bool        isnull1,
+                isnull2;
+
+
+    /* Compare the leading sort key */
+    compare = ApplySortComparator(a->datum1, a->isnull1,
+                                  b->datum1, b->isnull1,
+                                  sortKey);
+    if (compare != 0)
+        return compare;
+
+    /* Compare additional sort keys */
+    ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+    ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
+    rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+    rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
+    tupDesc = state->tupDesc;
+
+    if (sortKey->abbrev_converter)
+    {
+        attno = sortKey->ssup_attno;
+
+        datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+        datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+        compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+                                                datum2, isnull2,
+                                                sortKey);
+        if (compare != 0)
+            return compare;
+    }
+
+    sortKey++;
+    for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
+    {
+        attno = sortKey->ssup_attno;
+
+        datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+        datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+        compare = ApplySortComparator(datum1, isnull1,
+                                      datum2, isnull2,
+                                      sortKey);
+        if (compare != 0)
+            return compare;
+    }
+
+    return 0;
+}
+
+static void
+copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
+{
+    /*
+     * We expect the passed "tup" to be a TupleTableSlot, and form a
+     * MinimalTuple using the exported interface for that.
+     */
+    TupleTableSlot *slot = (TupleTableSlot *) tup;
+    Datum       original;
+    MinimalTuple tuple;
+    HeapTupleData htup;
+    MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
+
+    /* copy the tuple into sort storage */
+    tuple = ExecCopySlotMinimalTuple(slot);
+    stup->tuple = (void *) tuple;
+    USEMEM(state, GetMemoryChunkSpace(tuple));
+    /* set up first-column key value */
+    htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
+    htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
+    original = heap_getattr(&htup,
+                            state->sortKeys[0].ssup_attno,
+                            state->tupDesc,
+                            &stup->isnull1);
+
+    MemoryContextSwitchTo(oldcontext);
+
+    if (!state->sortKeys->abbrev_converter || stup->isnull1)
+    {
+        /*
+         * Store ordinary Datum representation, or NULL value. If there is a
+         * converter it won't expect NULL values, and cost model is not
+         * required to account for NULL, so in that case we avoid calling
+         * converter and just set datum1 to zeroed representation (to be
+         * consistent, and to support cheap inequality tests for NULL
+         * abbreviated keys).
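
The abbreviated-key scheme this comment refers to can be shown in isolation: pack a prefix of each key into an integer so that most comparisons never touch the full value, falling back to the authoritative comparison only on ties. The names and data below are illustrative stand-ins, not the actual sortsupport machinery:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    typedef struct
    {
        const char *str;         /* authoritative value, cf. SortTuple.tuple */
        uint64_t    abbrev;      /* proxy, cf. SortTuple.datum1 */
    } Item;

    /* pack the first 8 bytes big-endian so integer order matches strcmp */
    static uint64_t
    abbreviate(const char *s)
    {
        uint64_t v = 0;

        for (int i = 0; i < 8 && s[i]; i++)
            v |= (uint64_t) (unsigned char) s[i] << (8 * (7 - i));
        return v;
    }

    static int
    cmp_item(const void *pa, const void *pb)
    {
        const Item *a = pa;
        const Item *b = pb;

        if (a->abbrev != b->abbrev)              /* cheap integer compare */
            return (a->abbrev < b->abbrev) ? -1 : 1;
        return strcmp(a->str, b->str);           /* tie: full comparison */
    }

    int
    main(void)
    {
        Item items[] = {
            {"strawberry", 0}, {"straw", 0}, {"apple", 0}, {"apricot", 0},
        };

        for (int i = 0; i < 4; i++)
            items[i].abbrev = abbreviate(items[i].str);

        qsort(items, 4, sizeof(Item), cmp_item);
        for (int i = 0; i < 4; i++)
            printf("%s\n", items[i].str);
        return 0;
    }

The "abort" path handled above exists because a proxy like this only pays off when it actually distinguishes values; if every abbreviation collides, datum1 is reset to the ordinary representation for all tuples still in memory.
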
+ */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_KeyAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
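
The tie-break applied at the end of this comparator is easy to state on its own: equal keys fall back to physical position, block number first, then offset. A sketch using an assumed three-field struct in place of IndexTuple:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct
    {
        int      key;
        unsigned blk;            /* cf. ItemPointerGetBlockNumber */
        unsigned pos;            /* cf. ItemPointerGetOffsetNumber */
    } Tup;

    static int
    cmp(const void *pa, const void *pb)
    {
        const Tup *a = pa;
        const Tup *b = pb;

        if (a->key != b->key)
            return (a->key < b->key) ? -1 : 1;
        if (a->blk != b->blk)
            return (a->blk < b->blk) ? -1 : 1;
        if (a->pos != b->pos)
            return (a->pos < b->pos) ? -1 : 1;
        return 0;                /* real TIDs are never fully equal */
    }

    int
    main(void)
    {
        Tup tups[] = {{5, 2, 7}, {5, 1, 3}, {3, 9, 1}, {5, 1, 2}};

        qsort(tups, 4, sizeof(Tup), cmp);
        for (int i = 0; i < 4; i++)
            printf("(%d, %u, %u)\n", tups[i].key, tups[i].blk, tups[i].pos);
        return 0;
    }

Duplicates of key 5 come out in (block, offset) order, which is what keeps index scans over equal keys in physical order.
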
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort11.c b/src/tuplesort11.c new file mode 100644 index 0000000000..23de559073 --- /dev/null +++ b/src/tuplesort11.c @@ -0,0 +1,4595 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. 
+ * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. 
+ *
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *    src/backend/utils/sort/tuplesort.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "access/hash.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/tablespace.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/datum.h"
+#include "utils/logtape.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+/* Should be the last include */
+#include "disable_core_macro.h"
+
+/* sort-type codes for sort__start probes */
+#define HEAP_SORT       0
+#define INDEX_SORT      1
+#define DATUM_SORT      2
+#define CLUSTER_SORT    3
+
+/* Sort parallel code from state for sort__start probes */
+#define PARALLEL_SORT(state)    ((state)->shared == NULL ? 0 : \
+                                 (state)->worker >= 0 ? 1 : 2)
+
+/* GUC variables */
+#ifdef TRACE_SORT
+bool        trace_sort = false;
+#endif
+
+#ifdef DEBUG_BOUNDED_SORT
+bool        optimize_bounded_sort = true;
+#endif
+
+
+/*
+ * The objects we actually sort are SortTuple structs. These contain
+ * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
+ * which is a separate palloc chunk --- we assume it is just one chunk and
+ * can be freed by a simple pfree() (except during merge, when we use a
+ * simple slab allocator). SortTuples also contain the tuple's first key
+ * column in Datum/nullflag format, and an index integer.
+ *
+ * Storing the first key column lets us save heap_getattr or index_getattr
+ * calls during tuple comparisons. We could extract and save all the key
+ * columns not just the first, but this would increase code complexity and
+ * overhead, and wouldn't actually save any comparison cycles in the common
+ * case where the first key determines the comparison result. Note that
+ * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
+ *
+ * There is one special case: when the sort support infrastructure provides an
+ * "abbreviated key" representation, where the key is (typically) a pass by
+ * value proxy for a pass by reference type. In this case, the abbreviated key
+ * is stored in datum1 in place of the actual first key column.
+ *
+ * When sorting single Datums, the data value is represented directly by
+ * datum1/isnull1 for pass by value types (or null values). If the datatype is
+ * pass-by-reference and isnull1 is false, then "tuple" points to a separately
+ * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
+ * either the same pointer as "tuple", or is an abbreviated key value as
+ * described above. Accordingly, "tuple" is always used in preference to
+ * datum1 as the authoritative value for pass-by-reference cases.
+ *
+ * tupindex holds the input tape number that each tuple in the heap was read
+ * from during merge passes.
+ */
+typedef struct
+{
+    void       *tuple;          /* the tuple itself */
+    Datum       datum1;         /* value of first key column */
+    bool        isnull1;        /* is first key column NULL? */
+    int         tupindex;       /* see notes above */
+} SortTuple;
+
+/*
+ * During merge, we use a pre-allocated set of fixed-size slots to hold
+ * tuples, to avoid palloc/pfree overhead.
+ *
+ * Merge doesn't require a lot of memory, so we can afford to waste some,
+ * by using gratuitously-sized slots.  If a tuple is larger than 1 kB, the
+ * palloc() overhead is not significant anymore.
+ *
+ * 'nextfree' is valid when this chunk is in the free list.  When in use, the
+ * slot holds a tuple.
+ */
+#define SLAB_SLOT_SIZE 1024
+
+typedef union SlabSlot
+{
+	union SlabSlot *nextfree;
+	char		buffer[SLAB_SLOT_SIZE];
+} SlabSlot;
+
+/*
+ * Possible states of a Tuplesort object.  These denote the states that
+ * persist between calls of Tuplesort routines.
+ */
+typedef enum
+{
+	TSS_INITIAL,				/* Loading tuples; still within memory limit */
+	TSS_BOUNDED,				/* Loading tuples into bounded-size heap */
+	TSS_BUILDRUNS,				/* Loading tuples; writing to tape */
+	TSS_SORTEDINMEM,			/* Sort completed entirely in memory */
+	TSS_SORTEDONTAPE,			/* Sort completed, final run is on tape */
+	TSS_FINALMERGE				/* Performing final merge on-the-fly */
+} TupSortStatus;
+
+/*
+ * Parameters for calculation of number of tapes to use --- see inittapes()
+ * and tuplesort_merge_order().
+ *
+ * In this calculation we assume that each tape will cost us about one block's
+ * worth of buffer space.  This ignores the overhead of all the other data
+ * structures needed for each tape, but it's probably close enough.
+ *
+ * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
+ * tape during a preread cycle (see discussion at top of file).
+ */
+#define MINORDER		6		/* minimum merge order */
+#define MAXORDER		500		/* maximum merge order */
+#define TAPE_BUFFER_OVERHEAD		BLCKSZ
+#define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
+
+typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
+									Tuplesortstate *state);
+
+/*
+ * Private state of a Tuplesort operation.
+ */
+struct Tuplesortstate
+{
+	TupSortStatus status;		/* enumerated value as shown above */
+	int			nKeys;			/* number of columns in sort key */
+	bool		randomAccess;	/* did caller request random access? */
+	bool		bounded;		/* did caller specify a maximum number of
+								 * tuples to return? */
+	bool		boundUsed;		/* true if we made use of a bounded heap */
+	int			bound;			/* if bounded, the maximum number of tuples */
+	bool		tuples;			/* Can SortTuple.tuple ever be set? */
+	int64		availMem;		/* remaining memory available, in bytes */
+	int64		allowedMem;		/* total memory allowed, in bytes */
+	int			maxTapes;		/* number of tapes (Knuth's T) */
+	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
+	MemoryContext sortcontext;	/* memory context holding most sort data */
+	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
+	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
+
+	/*
+	 * These function pointers decouple the routines that must know what kind
+	 * of tuple we are sorting from the routines that don't need to know it.
+	 * They are set up by the tuplesort_begin_xxx routines.
+	 *
+	 * Function to compare two tuples; result is per qsort() convention, ie:
+	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
+	 * qsort_arg_comparator.
+	 */
+	SortTupleComparator comparetup;
+
+	/*
+	 * Function to copy a supplied input tuple into palloc'd space and set up
+	 * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
+	 * state->availMem must be decreased by the amount of space used for the
+	 * tuple copy (note the SortTuple struct itself is not counted).
+	 */
+	void		(*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
+
+	/*
+	 * Function to write a stored tuple onto tape.
The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. 
+ */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
+ */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. 
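+ *
+ * Pictorially, a run on tape therefore looks like this (each "len" is one
+ * unsigned int counting itself plus the tuple body; a zero length word
+ * terminates the run, and the trailing copies of each length word exist
+ * only in the randomAccess case described just below):
+ *
+ *		[len1][tuple1 body][len1][len2][tuple2 body][len2] ... [0]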
+ * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). 
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); 
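+
+/*
+ * As a minimal illustration of the calling protocol documented below (a
+ * sketch only: error handling is omitted and the input values are invented
+ * for the example):
+ *
+ *		Tuplesortstate *ts;
+ *		Datum		val;
+ *		bool		isnull;
+ *
+ *		ts = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
+ *								   false, work_mem, NULL, false);
+ *		tuplesort_putdatum(ts, Int32GetDatum(42), false);
+ *		tuplesort_putdatum(ts, Int32GetDatum(7), false);
+ *		tuplesort_performsort(ts);
+ *		while (tuplesort_getdatum(ts, true, &val, &isnull, NULL))
+ *			;
+ *		tuplesort_end(ts);
+ *
+ * The datums come back in ascending order (7, then 42); with randomAccess
+ * = true the caller could additionally rescan or mark/restore positions.
+ */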
+ +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->slabAllocatorUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f');
+#endif
+
+	state->nKeys = 1;			/* always a one-column sort */
+
+	TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
+								false,	/* no unique check */
+								1,
+								workMem,
+								randomAccess,
+								PARALLEL_SORT(state));
+
+	state->comparetup = comparetup_datum;
+	state->copytup = copytup_datum;
+	state->writetup = writetup_datum;
+	state->readtup = readtup_datum;
+	state->abbrevNext = 10;
+
+	state->datumType = datumType;
+
+	/* lookup necessary attributes of the datum type */
+	get_typlenbyval(datumType, &typlen, &typbyval);
+	state->datumTypeLen = typlen;
+	state->tuples = !typbyval;
+
+	/* Prepare SortSupport data */
+	state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
+
+	state->sortKeys->ssup_cxt = CurrentMemoryContext;
+	state->sortKeys->ssup_collation = sortCollation;
+	state->sortKeys->ssup_nulls_first = nullsFirstFlag;
+
+	/*
+	 * Abbreviation is possible here only for by-reference types.  In theory,
+	 * a pass-by-value datatype could have an abbreviated form that is cheaper
+	 * to compare.  In a tuple sort, we could support that, because we can
+	 * always extract the original datum from the tuple if needed.  Here, we
+	 * can't, because a datum sort only stores a single copy of the datum; the
+	 * "tuple" field of each SortTuple is NULL.
+	 */
+	state->sortKeys->abbreviate = !typbyval;
+
+	PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
+
+	/*
+	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
+	 * tie-breaker comparisons may be required.  Typically, the optimization
+	 * is only of value to pass-by-value types anyway, whereas abbreviated
+	 * keys are typically only of value to pass-by-reference types.
+	 */
+	if (!state->sortKeys->abbrev_converter)
+		state->onlyKey = state->sortKeys;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return state;
+}
+
+/*
+ * tuplesort_set_bound
+ *
+ *	Advise tuplesort that at most the first N result tuples are required.
+ *
+ * Must be called before inserting any tuples.  (Actually, we could allow it
+ * as long as the sort hasn't spilled to disk, but there seems no need for
+ * delayed calls at the moment.)
+ *
+ * This is a hint only.  The tuplesort may still return more tuples than
+ * requested.  Parallel leader tuplesorts will always ignore the hint.
+ */
+void
+tuplesort_set_bound(Tuplesortstate *state, int64 bound)
+{
+	/* Assert we're called before loading any tuples */
+	Assert(state->status == TSS_INITIAL);
+	Assert(state->memtupcount == 0);
+	Assert(!state->bounded);
+	Assert(!WORKER(state));
+
+#ifdef DEBUG_BOUNDED_SORT
+	/* Honor GUC setting that disables the feature (for easy testing) */
+	if (!optimize_bounded_sort)
+		return;
+#endif
+
+	/* Parallel leader ignores hint */
+	if (LEADER(state))
+		return;
+
+	/* We want to be able to compute bound * 2, so limit the setting */
+	if (bound > (int64) (INT_MAX / 2))
+		return;
+
+	state->bounded = true;
+	state->bound = (int) bound;
+
+	/*
+	 * Bounded sorts are not an effective target for abbreviated key
+	 * optimization.  Disable by setting state to be consistent with no
+	 * abbreviation support.
+	 */
+	state->sortKeys->abbrev_converter = NULL;
+	if (state->sortKeys->abbrev_full_comparator)
+		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+	/* Not strictly necessary, but be tidy */
+	state->sortKeys->abbrev_abort = NULL;
+	state->sortKeys->abbrev_full_comparator = NULL;
+}
+
+/*
+ * tuplesort_end
+ *
+ *	Release resources and clean up.
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage.
Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. 
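+	 *
+	 * (Illustrative numbers only: if allowedMem is 64MB and the tuples
+	 * loaded so far consume 48MB in total, the grow_ratio computed below
+	 * comes out at 64/48 = 1.33, so a 1-million-slot memtuples array would
+	 * be enlarged to about 1.33 million slots rather than doubled.)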
+ * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. 
+ * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
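+			 *
+			 * (For example, after tuplesort_set_bound(state, 100), we switch
+			 * to the bounded heap as soon as the 201st tuple arrives, or as
+			 * soon as more than 100 tuples are loaded and LACKMEM() reports
+			 * that workMem has filled up.)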
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
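+				 *
+				 * (For instance, if the caller set a bound of 10, fetching
+				 * an 11th tuple raises this error instead of a misleading
+				 * end-of-data result.)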
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. 
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
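+ *
+ * (Illustrative use: a streaming caller passes copy = false and
+ * consumes each slot before fetching again,
+ *
+ *		while (tuplesort_gettupleslot(state, true, false, slot, NULL))
+ *			... consume slot ...
+ *
+ * whereas a caller that must hold the tuple across later fetches passes
+ * copy = true and pays the heap_copy_minimal_tuple() cost.)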
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
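+ *
+ * (Illustrative use, showing the ownership rule above for pass-by-ref
+ * types:
+ *
+ *		Datum	val;
+ *		bool	isnull;
+ *
+ *		while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
+ *		{
+ *			... use val ...
+ *			if (!isnull && byref)
+ *				pfree(DatumGetPointer(val));
+ *		}
+ *
+ * where "byref" stands for the caller's knowledge of the datum type.)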
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
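+ * (Worked example, assuming the usual definitions TAPE_BUFFER_OVERHEAD =
+ * BLCKSZ and MERGE_BUFFER_SIZE = 32 * BLCKSZ with BLCKSZ = 8192: an
+ * allowedMem of 4MB yields (4194304 - 8192) / (262144 + 8192) = 15
+ * input tapes.)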
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
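+ * (The tp_fib[] targets set up in inittapes() and maintained here follow
+ * Knuth's polyphase distribution; illustratively, with two input tapes
+ * the per-level run targets grow like the Fibonacci numbers 1, 1, 2, 3,
+ * 5, ..., with higher-order analogues for more tapes.)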
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextDelete(state->tuplecontext); + state->tuplecontext = NULL; + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! 
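+ *
+ * (Illustratively: with maxTapes = 8 but only three initial runs, the
+ * used tapes are inputs 0..2 plus the output tape, which is tape 7,
+ * not tape 3.)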
+ */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + state->worker, state->availMem / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
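+ *
+ * ("On-the-fly" means entering TSS_FINALMERGE, where
+ * tuplesort_gettuple_common() hands tuples straight off the merge heap
+ * as the caller requests them, instead of first writing one fully
+ * merged run back out to tape.)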
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &stup); + + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.tupindex = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
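+ *
+ * (These are the figures that EXPLAIN ANALYZE reports, e.g.
+ * "Sort Method: external merge  Disk: 10208kB", assembled from
+ * tuplesort_method_name() and tuplesort_space_type_name() below.)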
+ */ + if (state->tapeset) + { + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
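+ *
+ * (Tiny example: with bound = 3 and surviving values {3, 7, 9}, 9 being
+ * at the root since the direction is still reversed, the first
+ * delete-top stores 9 in slot 2, the next stores 7 in slot 1, and the
+ * loop exits with 3 left in slot 0, leaving [3, 7, 9] sorted in the
+ * original direction.)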
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
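+ *
+ * (Illustrative pass of the loop below over the min-heap [1, 5, 4],
+ * replacing the old top 1 with 6: children 5 and 4 are compared, the
+ * smaller child 4 moves up into the hole, and 6 lands in the vacated
+ * slot, yielding [4, 5, 6].)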
+ */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + 
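+/*
+ * (Illustrative summary of the abbreviated-key pattern shared by the
+ * comparetup_* routines in this file, assuming datum1 holds the
+ * abbreviated form whenever abbrev_converter is set:
+ *
+ *		compare = ApplySortComparator(a->datum1, a->isnull1,
+ *									  b->datum1, b->isnull1, sortKey);
+ *		if (compare != 0)
+ *			return compare;
+ *		if (sortKey->abbrev_converter)
+ *			compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+ *													datum2, isnull2,
+ *													sortKey);
+ *
+ * That is, an abbreviated comparison that returns nonzero is decisive,
+ * but an abbreviated tie must be re-checked with the authoritative
+ * values before the remaining sort keys are examined.)
+ */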
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. 
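+ * (As an illustration, with assumed numbers: if three workers are
+ * launched, the increment below hands out identifiers 0, 1 and 2 in
+ * whatever order the workers happen to attach, while the leader itself
+ * keeps worker number -1.)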
+ */
+static int
+worker_get_identifier(Tuplesortstate *state)
+{
+	Sharedsort *shared = state->shared;
+	int			worker;
+
+	Assert(WORKER(state));
+
+	SpinLockAcquire(&shared->mutex);
+	worker = shared->currentWorker++;
+	SpinLockRelease(&shared->mutex);
+
+	return worker;
+}
+
+/*
+ * worker_freeze_result_tape - freeze worker's result tape for leader
+ *
+ * This is called by workers just after the result tape has been determined,
+ * instead of calling LogicalTapeFreeze() directly.  They do so because
+ * workers require a few additional steps over similar serial
+ * TSS_SORTEDONTAPE external sort cases, which also happen here.  The extra
+ * steps are around freeing now unneeded resources, and representing to
+ * leader that worker's input run is available for its merge.
+ *
+ * There should only be one final output run for each worker, which consists
+ * of all tuples that were originally input into worker.
+ */
+static void
+worker_freeze_result_tape(Tuplesortstate *state)
+{
+	Sharedsort *shared = state->shared;
+	TapeShare	output;
+
+	Assert(WORKER(state));
+	Assert(state->result_tape != -1);
+	Assert(state->memtupcount == 0);
+
+	/*
+	 * Free most remaining memory, in case caller is sensitive to our holding
+	 * on to it.  memtuples may not be a tiny merge heap at this point.
+	 */
+	pfree(state->memtuples);
+	/* Be tidy */
+	state->memtuples = NULL;
+	state->memtupsize = 0;
+
+	/*
+	 * Parallel worker requires result tape metadata, which is to be stored in
+	 * shared memory for leader
+	 */
+	LogicalTapeFreeze(state->tapeset, state->result_tape, &output);
+
+	/* Store properties of output tape, and update finished worker count */
+	SpinLockAcquire(&shared->mutex);
+	shared->tapes[state->worker] = output;
+	shared->workersFinished++;
+	SpinLockRelease(&shared->mutex);
+}
+
+/*
+ * worker_nomergeruns - dump memtuples in worker, without merging
+ *
+ * This is called as an alternative to mergeruns() with a worker when no
+ * merging is required.
+ */
+static void
+worker_nomergeruns(Tuplesortstate *state)
+{
+	Assert(WORKER(state));
+	Assert(state->result_tape == -1);
+
+	state->result_tape = state->tp_tapenum[state->destTape];
+	worker_freeze_result_tape(state);
+}
+
+/*
+ * leader_takeover_tapes - create tapeset for leader from worker tapes
+ *
+ * So far, leader Tuplesortstate has performed no actual sorting.  By now, all
+ * sorting has occurred in workers, all of which must have already returned
+ * from tuplesort_performsort().
+ *
+ * When this returns, leader process is left in a state that is virtually
+ * indistinguishable from it having generated runs as a serial external sort
+ * might have.
+ */
+static void
+leader_takeover_tapes(Tuplesortstate *state)
+{
+	Sharedsort *shared = state->shared;
+	int			nParticipants = state->nParticipants;
+	int			workersFinished;
+	int			j;
+
+	Assert(LEADER(state));
+	Assert(nParticipants >= 1);
+
+	SpinLockAcquire(&shared->mutex);
+	workersFinished = shared->workersFinished;
+	SpinLockRelease(&shared->mutex);
+
+	if (nParticipants != workersFinished)
+		elog(ERROR, "cannot take over tapes before all workers finish");
+
+	/*
+	 * Create the tapeset from worker tapes, including a leader-owned tape at
+	 * the end.  Parallel workers are far more expensive than logical tapes,
+	 * so the number of tapes allocated here should never be excessive.
+	 *
+	 * We still have a leader tape, though it's not possible to write to it
+	 * due to restrictions in the shared fileset infrastructure used by
+	 * logtape.c.
It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, shared->tapes, + &shared->fileset, state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort12.c b/src/tuplesort12.c new file mode 100644 index 0000000000..796c1b8392 --- /dev/null +++ b/src/tuplesort12.c @@ -0,0 +1,4596 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. 
+ * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. 
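+ *
+ * (Illustrative example, with arbitrarily chosen numbers: a parallel sort
+ * with nParticipants = 3 leaves behind three worker tapes holding one
+ * sorted run each; leader_takeover_tapes() then builds a four-tape
+ * tapeset from them, three input tapes plus one never-written leader
+ * tape, and the leader finishes with a single three-way merge.)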
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/sort/tuplesort.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/hash.h"
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/tablespace.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/datum.h"
+#include "utils/logtape.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+/* Should be the last include */
+#include "disable_core_macro.h"
+
+/* sort-type codes for sort__start probes */
+#define HEAP_SORT		0
+#define INDEX_SORT		1
+#define DATUM_SORT		2
+#define CLUSTER_SORT	3
+
+/* Sort parallel code from state for sort__start probes */
+#define PARALLEL_SORT(state)	((state)->shared == NULL ? 0 : \
+								 (state)->worker >= 0 ? 1 : 2)
+
+/* GUC variables */
+#ifdef TRACE_SORT
+bool		trace_sort = false;
+#endif
+
+#ifdef DEBUG_BOUNDED_SORT
+bool		optimize_bounded_sort = true;
+#endif
+
+
+/*
+ * The objects we actually sort are SortTuple structs.  These contain
+ * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
+ * which is a separate palloc chunk --- we assume it is just one chunk and
+ * can be freed by a simple pfree() (except during merge, when we use a
+ * simple slab allocator).  SortTuples also contain the tuple's first key
+ * column in Datum/nullflag format, and an index integer.
+ *
+ * Storing the first key column lets us save heap_getattr or index_getattr
+ * calls during tuple comparisons.  We could extract and save all the key
+ * columns not just the first, but this would increase code complexity and
+ * overhead, and wouldn't actually save any comparison cycles in the common
+ * case where the first key determines the comparison result.  Note that
+ * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
+ *
+ * There is one special case: when the sort support infrastructure provides an
+ * "abbreviated key" representation, where the key is (typically) a pass by
+ * value proxy for a pass by reference type.  In this case, the abbreviated key
+ * is stored in datum1 in place of the actual first key column.
+ *
+ * When sorting single Datums, the data value is represented directly by
+ * datum1/isnull1 for pass by value types (or null values).  If the datatype is
+ * pass-by-reference and isnull1 is false, then "tuple" points to a separately
+ * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
+ * either the same pointer as "tuple", or is an abbreviated key value as
+ * described above.  Accordingly, "tuple" is always used in preference to
+ * datum1 as the authoritative value for pass-by-reference cases.
+ *
+ * tupindex holds the input tape number that each tuple in the heap was read
+ * from during merge passes.
+ */
+typedef struct
+{
+	void	   *tuple;			/* the tuple itself */
+	Datum		datum1;			/* value of first key column */
+	bool		isnull1;		/* is first key column NULL? */
+	int			tupindex;		/* see notes above */
+} SortTuple;
+
+/*
+ * During merge, we use a pre-allocated set of fixed-size slots to hold
+ * tuples, to avoid palloc/pfree overhead.
+ *
+ * Merge doesn't require a lot of memory, so we can afford to waste some,
+ * by using gratuitously-sized slots.  If a tuple is larger than 1 kB, the
+ * palloc() overhead is not significant anymore.
+ *
+ * 'nextfree' is valid when this chunk is in the free list.  When in use, the
+ * slot holds a tuple.
+ */
+#define SLAB_SLOT_SIZE 1024
+
+typedef union SlabSlot
+{
+	union SlabSlot *nextfree;
+	char		buffer[SLAB_SLOT_SIZE];
+} SlabSlot;
+
+/*
+ * Possible states of a Tuplesort object.  These denote the states that
+ * persist between calls of Tuplesort routines.
+ */
+typedef enum
+{
+	TSS_INITIAL,				/* Loading tuples; still within memory limit */
+	TSS_BOUNDED,				/* Loading tuples into bounded-size heap */
+	TSS_BUILDRUNS,				/* Loading tuples; writing to tape */
+	TSS_SORTEDINMEM,			/* Sort completed entirely in memory */
+	TSS_SORTEDONTAPE,			/* Sort completed, final run is on tape */
+	TSS_FINALMERGE				/* Performing final merge on-the-fly */
+} TupSortStatus;
+
+/*
+ * Parameters for calculation of number of tapes to use --- see inittapes()
+ * and tuplesort_merge_order().
+ *
+ * In this calculation we assume that each tape will cost us about one
+ * block's worth of buffer space.  This ignores the overhead of all the
+ * other data structures needed for each tape, but it's probably close
+ * enough.
+ *
+ * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
+ * tape during a preread cycle (see discussion at top of file).
+ */
+#define MINORDER		6		/* minimum merge order */
+#define MAXORDER		500		/* maximum merge order */
+#define TAPE_BUFFER_OVERHEAD		BLCKSZ
+#define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
+
+typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
+									Tuplesortstate *state);
+
+/*
+ * Private state of a Tuplesort operation.
+ */
+struct Tuplesortstate
+{
+	TupSortStatus status;		/* enumerated value as shown above */
+	int			nKeys;			/* number of columns in sort key */
+	bool		randomAccess;	/* did caller request random access? */
+	bool		bounded;		/* did caller specify a maximum number of
+								 * tuples to return? */
+	bool		boundUsed;		/* true if we made use of a bounded heap */
+	int			bound;			/* if bounded, the maximum number of tuples */
+	bool		tuples;			/* Can SortTuple.tuple ever be set? */
+	int64		availMem;		/* remaining memory available, in bytes */
+	int64		allowedMem;		/* total memory allowed, in bytes */
+	int			maxTapes;		/* number of tapes (Knuth's T) */
+	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
+	MemoryContext sortcontext;	/* memory context holding most sort data */
+	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
+	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
+
+	/*
+	 * These function pointers decouple the routines that must know what kind
+	 * of tuple we are sorting from the routines that don't need to know it.
+	 * They are set up by the tuplesort_begin_xxx routines.
+	 *
+	 * Function to compare two tuples; result is per qsort() convention, ie:
+	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
+	 * qsort_arg_comparator.
+	 */
+	SortTupleComparator comparetup;
+
+	/*
+	 * Function to copy a supplied input tuple into palloc'd space and set up
+	 * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
+	 * state->availMem must be decreased by the amount of space used for the
+	 * tuple copy (note the SortTuple struct itself is not counted).
+	 */
+	void		(*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
+
+	/*
+	 * Function to write a stored tuple onto tape.
The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. 
+ */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
+ */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. 
+ * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). 
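+ *
+ * As a sketch of the framing described above (field widths not to scale),
+ * a tuple written with randomAccess true occupies:
+ *
+ *	  +--------------+---------------------+--------------+
+ *	  | unsigned int |    tuple data ...   | unsigned int |
+ *	  | length word  |                     | length copy  |
+ *	  +--------------+---------------------+--------------+
+ *
+ * where the leading length word counts itself plus the tuple data (but not
+ * the trailing copy), and the trailing copy is omitted entirely when
+ * randomAccess is false.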
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); 
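+
+/*
+ * To illustrate the double-evaluation warning on LogicalTapeReadExact
+ * above (this example is not taken from the callers below): a call such as
+ *
+ *		LogicalTapeReadExact(state->tapeset, tapenum, ptr, *lenp++);
+ *
+ * would advance lenp twice, because "len" expands into both the
+ * LogicalTapeRead() call and the size check.  The readtup routines above
+ * pass only plain variables or sizeof() expressions.
+ */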
+ +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
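+	 * (For instance, assuming a 64-bit build where sizeof(SortTuple) is 24
+	 * bytes and ALLOCSET_SEPARATE_THRESHOLD is 8192 bytes, the Max() below
+	 * works out to Max(1024, 342) = 1024 slots, roughly 24 kB of array.)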
+ */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->slabAllocatorUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
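+	 * (For instance, a one-column sort whose datatype has no abbreviation
+	 * support satisfies the test below, and so takes the qsort_ssup() fast
+	 * path described near the top of this file.)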
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f');
+#endif
+
+	state->nKeys = 1;			/* always a one-column sort */
+
+	TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
+								false,	/* no unique check */
+								1,
+								workMem,
+								randomAccess,
+								PARALLEL_SORT(state));
+
+	state->comparetup = comparetup_datum;
+	state->copytup = copytup_datum;
+	state->writetup = writetup_datum;
+	state->readtup = readtup_datum;
+	state->abbrevNext = 10;
+
+	state->datumType = datumType;
+
+	/* lookup necessary attributes of the datum type */
+	get_typlenbyval(datumType, &typlen, &typbyval);
+	state->datumTypeLen = typlen;
+	state->tuples = !typbyval;
+
+	/* Prepare SortSupport data */
+	state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
+
+	state->sortKeys->ssup_cxt = CurrentMemoryContext;
+	state->sortKeys->ssup_collation = sortCollation;
+	state->sortKeys->ssup_nulls_first = nullsFirstFlag;
+
+	/*
+	 * Abbreviation is possible here only for by-reference types.  In theory,
+	 * a pass-by-value datatype could have an abbreviated form that is cheaper
+	 * to compare.  In a tuple sort, we could support that, because we can
+	 * always extract the original datum from the tuple as needed.  Here, we
+	 * can't, because a datum sort only stores a single copy of the datum; the
+	 * "tuple" field of each SortTuple is NULL.
+	 */
+	state->sortKeys->abbreviate = !typbyval;
+
+	PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
+
+	/*
+	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
+	 * tie-breaker comparisons may be required.  Typically, the optimization
+	 * is only of value to pass-by-value types anyway, whereas abbreviated
+	 * keys are typically only of value to pass-by-reference types.
+	 */
+	if (!state->sortKeys->abbrev_converter)
+		state->onlyKey = state->sortKeys;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return state;
+}
+
+/*
+ * tuplesort_set_bound
+ *
+ *	Advise tuplesort that at most the first N result tuples are required.
+ *
+ * Must be called before inserting any tuples.  (Actually, we could allow it
+ * as long as the sort hasn't spilled to disk, but there seems no need for
+ * delayed calls at the moment.)
+ *
+ * This is a hint only.  The tuplesort may still return more tuples than
+ * requested.  Parallel leader tuplesorts will always ignore the hint.
+ */
+void
+tuplesort_set_bound(Tuplesortstate *state, int64 bound)
+{
+	/* Assert we're called before loading any tuples */
+	Assert(state->status == TSS_INITIAL);
+	Assert(state->memtupcount == 0);
+	Assert(!state->bounded);
+	Assert(!WORKER(state));
+
+#ifdef DEBUG_BOUNDED_SORT
+	/* Honor GUC setting that disables the feature (for easy testing) */
+	if (!optimize_bounded_sort)
+		return;
+#endif
+
+	/* Parallel leader ignores hint */
+	if (LEADER(state))
+		return;
+
+	/* We want to be able to compute bound * 2, so limit the setting */
+	if (bound > (int64) (INT_MAX / 2))
+		return;
+
+	state->bounded = true;
+	state->bound = (int) bound;
+
+	/*
+	 * Bounded sorts are not an effective target for abbreviated key
+	 * optimization.  Disable by setting state to be consistent with no
+	 * abbreviation support.
+	 */
+	state->sortKeys->abbrev_converter = NULL;
+	if (state->sortKeys->abbrev_full_comparator)
+		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+	/* Not strictly necessary, but be tidy */
+	state->sortKeys->abbrev_abort = NULL;
+	state->sortKeys->abbrev_full_comparator = NULL;
+}
+
+/*
+ * tuplesort_end
+ *
+ *	Release resources and clean up.
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage.
Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. 
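+	 *
+	 * (Worked example with assumed numbers: if allowedMem is 64 MB, 48 MB of
+	 * it is already in use, and memtupsize is currently 1,000,000 slots, the
+	 * grow_ratio computed below is 64/48, about 1.33, so the array grows to
+	 * roughly 1,333,333 slots instead of doubling.)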
+ * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. 
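+ *
+ * A minimal sketch of the expected call pattern (editor's illustration;
+ * fetch_next_slot() stands in for whatever produces input tuples and is
+ * not part of this file):
+ *
+ *     TupleTableSlot *slot;
+ *
+ *     while ((slot = fetch_next_slot()) != NULL)
+ *         tuplesort_puttupleslot(state, slot);
+ *     tuplesort_performsort(state);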
+ * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
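+ *
+ * For example (editor's sketch; txt is assumed to be a text value the
+ * caller built), a pass-by-reference Datum can be freed right after the
+ * call, precisely because it has been copied:
+ *
+ *     tuplesort_putdatum(state, PointerGetDatum(txt), false);
+ *     pfree(txt);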
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
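+ *
+ * (Editor's illustration: with a bound of 100, as set for, say, a
+ * LIMIT 100 ordering, the switch happens when the 201st tuple arrives,
+ * or earlier if workMem fills up once more than 100 tuples have been
+ * accumulated.)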
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
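+ *
+ * Where this call sits in the overall API, as an editor's sketch with
+ * the loop bodies elided:
+ *
+ *     state = tuplesort_begin_datum(...);     (or another begin routine)
+ *     ... tuplesort_putdatum() once per input value ...
+ *     tuplesort_performsort(state);
+ *     ... tuplesort_getdatum() until it returns false ...
+ *     tuplesort_end(state);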
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
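+ *
+ * (Editor's illustration: after tuplesort_set_bound(state, 10), asking
+ * for an 11th tuple draws this elog(ERROR) rather than a silent EOF,
+ * because tuples beyond the bound were discarded during input.)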
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. 
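+ *
+ * (Editor's sketch of the on-tape layout this navigation assumes; each
+ * tuple written under randomAccess carries both a leading and a
+ * trailing length word, and a run ends with a zero length word:
+ *
+ *     ... | len | tuple body | len | len | tuple body | len | 0 |
+ *
+ * so stepping back two length words from just past the zero word lands
+ * on the last tuple's trailing length word.)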
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
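+ *
+ * Editor's sketch of the two modes (hypothetical caller code):
+ *
+ *     tuplesort_gettupleslot(state, true, true, slot, NULL);
+ *         the slot owns a copy and survives later tuplesort calls,
+ *         at the price of an extra palloc and copy per tuple;
+ *     tuplesort_gettupleslot(state, true, false, slot, NULL);
+ *         the slot points into tuplesort memory and must be consumed
+ *         before the tuplesort is touched again.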
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
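+ *
+ * A minimal fetch loop (editor's illustration; it assumes a
+ * pass-by-reference datum type, hence the pfree of each value):
+ *
+ *     Datum val;
+ *     bool  isnull;
+ *
+ *     while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
+ *     {
+ *         ... use val ...
+ *         if (!isnull)
+ *             pfree(DatumGetPointer(val));
+ *     }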
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextDelete(state->tuplecontext); + state->tuplecontext = NULL; + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! 
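+ *
+ * (Editor's illustration: with maxTapes = 7 and only three initial
+ * runs, the used input tapes are 0, 1 and 2 and the output tape is
+ * tape 6; tapes 3 to 5 are unused, and their buffer memory is refunded
+ * just below.)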
+ */
+ if (state->Level == 1)
+ {
+ numInputTapes = state->currentRun;
+ numTapes = numInputTapes + 1;
+ FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
+ }
+ else
+ {
+ numInputTapes = state->tapeRange;
+ numTapes = state->maxTapes;
+ }
+
+ /*
+ * Initialize the slab allocator. We need one slab slot per input tape,
+ * for the tuples in the heap, plus one to hold the tuple last returned
+ * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
+ * however, we don't need to allocate anything.)
+ *
+ * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
+ * to track memory usage of individual tuples.
+ */
+ if (state->tuples)
+ init_slab_allocator(state, numInputTapes + 1);
+ else
+ init_slab_allocator(state, 0);
+
+ /*
+ * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
+ * from each input tape.
+ */
+ state->memtupsize = numInputTapes;
+ state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+ USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+
+ /*
+ * Use all the remaining memory we have available for read buffers among
+ * the input tapes.
+ *
+ * We don't try to "rebalance" the memory among tapes, when we start a new
+ * merge phase, even if some tapes are inactive in the new phase. That
+ * would be hard, because logtape.c doesn't know where one run ends and
+ * another begins. When a new merge phase begins, and a tape doesn't
+ * participate in it, its buffer nevertheless already contains tuples from
+ * the next run on the same tape, so we cannot release the buffer. That's
+ * OK in practice; merge performance isn't that sensitive to the number of
+ * buffers used, and most merge phases use all or almost all tapes,
+ * anyway.
+ */
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
+ state->worker, state->availMem / 1024, numInputTapes);
+#endif
+
+ state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
+ USEMEM(state, state->read_buffer_size * numInputTapes);
+
+ /* End of step D2: rewind all output tapes to prepare for merging */
+ for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
+ LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
+
+ for (;;)
+ {
+ /*
+ * At this point we know that tape[T] is empty. If there's just one
+ * (real or dummy) run left on each input tape, then only one merge
+ * pass remains. If we don't have to produce a materialized sorted
+ * tape, we can stop at this point and do the final merge on-the-fly.
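+ *
+ * (Editor's note: this on-the-fly path ends in TSS_FINALMERGE, with
+ * tuples heap-merged straight out of tuplesort_gettuple_common;
+ * randomAccess callers keep merging down to a single frozen tape,
+ * TSS_SORTEDONTAPE, so that tuplesort_rescan and tuplesort_markpos
+ * remain possible.)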
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
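+ *
+ * (Editor's worked example: merging the runs (1,3,9), (2,4) and (5,6)
+ * this way pops 1 and refills from the first tape, pops 2 and refills
+ * from the second, and so on, emitting 1 2 3 4 5 6 9 while holding at
+ * most one tuple per tape in the heap.)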
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &stup); + + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.tupindex = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
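+ *
+ * (Editor's note: in core PostgreSQL the same two fields feed the sort
+ * details shown by EXPLAIN (ANALYZE), e.g. "Sort Method: external merge
+ * Disk: 4096kB"; the strings come from tuplesort_method_name() and
+ * tuplesort_space_type_name() below.)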
+ */ + if (state->tapeset) + { + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
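+ *
+ * (Editor's worked example with bound = 3 and inputs 7, 2, 9, 4: the
+ * bounded heap retains {7, 2, 4}, with 7 at the root because the sort
+ * direction is reversed; each delete-top then parks the current
+ * largest entry in the slot freed at the end of the array, leaving
+ * 2, 4, 7 in ascending order.)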
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
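+ *
+ * (Editor's illustration: if i were a signed int reaching INT_MAX, the
+ * child index 2 * i + 1 would overflow; in unsigned arithmetic it
+ * stays well-defined, and since n <= INT_MAX the "j >= n" test below
+ * still rejects any out-of-range child.)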
+ */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} +
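+/* + * Note on the MINIMAL_TUPLE_OFFSET arithmetic used by these routines: a + * MinimalTuple lacks the leading system-column fields of a full + * HeapTupleHeader, so we fake up a HeapTupleData whose t_data points + * MINIMAL_TUPLE_OFFSET bytes before the stored tuple, making the shared + * trailing fields line up where heap_getattr expects them. + */ +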
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
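+ * + * (For example, given a hypothetical expression index on lower(name), + * lower() must be evaluated for both tuples via FormIndexDatum before + * any of the remaining key columns can be compared.)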
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. 
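+ * + * For example, a sort with three launched workers hands out identifiers + * 0, 1 and 2, in whatever order the workers happen to request them, + * while the leader itself keeps worker number -1.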
+ */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This is called as an alternative to mergeruns() with a worker when no + * merging is required. + */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c.
It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, shared->tapes, + &shared->fileset, state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort13.c b/src/tuplesort13.c new file mode 100644 index 0000000000..87354a38b4 --- /dev/null +++ b/src/tuplesort13.c @@ -0,0 +1,4708 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. 
+ * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. 
+ * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and a source/input tape number that + * tracks which tape each heap element/slot belongs to during merging. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases.
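+ * + * For example, when sorting a pass-by-reference type such as text with + * abbreviated keys, datum1 holds the pass-by-value abbreviated proxy, + * while "tuple" retains the full value needed for any tie-breaking + * comparison with the authoritative (full) comparator.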
+ */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int srctape; /* source tape number */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? 
*/ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as a<b, a=b, a>b. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots.
The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. 
+ * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. 
+ */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on copytup routines to use the + * correct memory context for these tuples (and to not use the reset + * context for anything whose lifetime needs to span multiple external + * sort runs). readtup routines use the slab allocator (they cannot use + * the reset context because it gets deleted at the point that merging + * begins). 
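+ * + * As a worked example of the on-tape layout described above (assuming a + * 4-byte unsigned int): a tuple whose stored body is 20 bytes is written + * as the leading length word 24 (the word counts itself), then the + * 20-byte body, and, when randomAccess is true, a trailing copy of the + * length word, for 28 bytes of tape space in total.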
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static 
void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->randomAccess = randomAccess; + state->tuples = true; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->sortcontext = sortcontext; + state->maincontext = maincontext; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
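+	 *
+	 * (That is, the initial allocation must be large enough that palloc
+	 * treats the old and new arrays as separate chunks when the array is
+	 * later repalloc'd, keeping the GetMemoryChunkSpace() accounting
+	 * accurate; grow_memtuples() explains this in more detail.)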
+	 */
+	state->memtupsize = INITIAL_MEMTUPSIZE;
+	state->memtuples = NULL;
+
+	/*
+	 * After all of the other non-parallel-related state, we set up all of
+	 * the state needed for each batch.
+	 */
+	tuplesort_begin_batch(state);
+
+	/*
+	 * Initialize parallel-related state based on coordination information
+	 * from caller
+	 */
+	if (!coordinate)
+	{
+		/* Serial sort */
+		state->shared = NULL;
+		state->worker = -1;
+		state->nParticipants = -1;
+	}
+	else if (coordinate->isWorker)
+	{
+		/* Parallel worker produces exactly one final run from all input */
+		state->shared = coordinate->sharedsort;
+		state->worker = worker_get_identifier(state);
+		state->nParticipants = -1;
+	}
+	else
+	{
+		/* Parallel leader state only used for final merge */
+		state->shared = coordinate->sharedsort;
+		state->worker = -1;
+		state->nParticipants = coordinate->nParticipants;
+		Assert(state->nParticipants >= 1);
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return state;
+}
+
+/*
+ * tuplesort_begin_batch
+ *
+ * Set up, or reset, all the state needed for processing a new set of tuples
+ * with this sort state. Called both from tuplesort_begin_common (the first
+ * time sorting with this sort state) and tuplesort_reset (for subsequent
+ * usages).
+ */
+static void
+tuplesort_begin_batch(Tuplesortstate *state)
+{
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
+
+	/*
+	 * Caller tuple (e.g. IndexTuple) memory context.
+	 *
+	 * A dedicated child context used exclusively for caller-passed tuples
+	 * eases memory management. Resetting at key points reduces
+	 * fragmentation. Note that the memtuples array of SortTuples is
+	 * allocated in the parent context, not this context, because there is
+	 * no need to free memtuples early.
+	 */
+	state->tuplecontext = AllocSetContextCreate(state->sortcontext,
+												"Caller tuples",
+												ALLOCSET_DEFAULT_SIZES);
+
+	state->status = TSS_INITIAL;
+	state->bounded = false;
+	state->boundUsed = false;
+
+	state->availMem = state->allowedMem;
+
+	state->tapeset = NULL;
+
+	state->memtupcount = 0;
+
+	/*
+	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
+	 * see comments in grow_memtuples().
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple as needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each SortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL && state->memtupcount == 0); + /* Can't set the bound twice, either */ + Assert(!state->bounded); + /* Also, this shouldn't be called in a parallel worker */ + Assert(!WORKER(state)); + + /* Parallel leader allows but ignores hint */ + if (LEADER(state)) + return; + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. 
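+	 *
+	 * For reference, a top-N caller drives this with the usual protocol
+	 * (sketch only; see the tuplesort_begin_xxx notes above):
+	 *
+	 *		state = tuplesort_begin_heap(...);
+	 *		tuplesort_set_bound(state, n);
+	 *		... put all input tuples ...
+	 *		tuplesort_performsort(state);
+	 *		... fetch at most n tuples ...
+	 *		tuplesort_end(state);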
+ */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_used_bound + * + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory. + */ + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
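+	 *
+	 * For example (illustrative scenario only): if one batch spilled to
+	 * disk and a later batch fit in memory, the disk-based figure is kept
+	 * as the maximum even if the in-memory batch used more raw bytes, per
+	 * the disk-beats-memory rule applied below.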
+ */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. 
We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
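+ *
+ * Sketch of the overall calling protocol for a heap-tuple sort (per the
+ * tuplesort_begin_xxx notes above; a serial, forward-only sort is
+ * assumed):
+ *
+ *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums, sortOperators,
+ *									 sortCollations, nullsFirstFlags,
+ *									 work_mem, NULL, false);
+ *		while (... more input ...)
+ *			tuplesort_puttupleslot(state, slot);
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_gettupleslot(state, true, false, slot, NULL))
+ *			... consume tuple in slot ...;
+ *		tuplesort_end(state);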
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
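+ *
+ * Sketch of a one-column Datum sort using this entry point (the type and
+ * operator OIDs shown are illustrative; any ordering operator for the
+ * datum type works):
+ *
+ *		state = tuplesort_begin_datum(INT8OID, Int8LessOperator, InvalidOid,
+ *									  false, work_mem, NULL, false);
+ *		tuplesort_putdatum(state, Int64GetDatum(x), false);
+ *		...
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_getdatum(state, true, &val, &isNull, NULL))
+ *			... use val ...;
+ *		tuplesort_end(state);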
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
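+			 *
+			 * For example, with bound = 100 the switch to TSS_BOUNDED
+			 * happens when the 201st input tuple arrives (memtupcount >
+			 * bound * 2), or earlier if workMem fills up while more than
+			 * 100 tuples are buffered.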
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
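+ *
+ * Depending on how the input phase ended, this either quicksorts
+ * everything in memory (TSS_INITIAL -> TSS_SORTEDINMEM), converts a
+ * bounded heap into a sorted array (TSS_BOUNDED -> TSS_SORTEDINMEM), or
+ * flushes and merges on-tape runs (TSS_BUILDRUNS -> TSS_SORTEDONTAPE or
+ * TSS_FINALMERGE), as the switch below shows.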
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. 
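+			 *
+			 * Layout being navigated here (randomAccess case; per the
+			 * on-tape notes near the top of this file, each length word
+			 * counts its own size):
+			 *
+			 *		[len][tuple body][len] [len][tuple body][len] [0]
+			 *
+			 * Backing up by tuplen from just past a tuple's trailing
+			 * length word lands exactly after its leading length word,
+			 * which is where READTUP expects to start.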
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].srctape; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.srctape = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
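+ *
+ * E.g. (sketch) a caller that must keep the tuple across later tuplesort
+ * calls would pass copy = true:
+ *
+ *		tuplesort_gettupleslot(state, true, true, slot, NULL);
+ *
+ * whereas a strictly one-tuple-at-a-time consumer can pass copy = false
+ * and accept that the next fetch may recycle the tuple's memory.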
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
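+ *
+ * E.g. (sketch) for a pass-by-reference type, the caller owns each
+ * returned value and may free it when done with it:
+ *
+ *		while (tuplesort_getdatum(state, true, &val, &isNull, NULL))
+ *		{
+ *			... use val ...
+ *			if (!isNull)
+ *				pfree(DatumGetPointer(val));
+ *		}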
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, false, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
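+ * (For intuition: with three input tapes the ideal per-level run counts + * tracked in tp_fib[] form a generalized Fibonacci series -- (1,1,1), + * (2,2,1), (4,3,2), (7,6,4), ... -- and tp_dummy[] records how many of the + * current level's target runs each tape is still missing.)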
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextResetOnly(state->tuplecontext); + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! + */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. 
We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + state->worker, state->availMem / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly.
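+ * (E.g. if run generation left exactly one run on each input tape, the + * check below notices this and the final merge is performed lazily, driven + * by the merge heap, as the caller fetches tuples.)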
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].srctape; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.srctape = srcTape; + tuplesort_heap_replace_top(state, &stup); + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.srctape = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
+ */ + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
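+ * For example, with bound = 3 and reversed-order heap contents {9, 7, 5} + * (9 at the root), the first delete-top parks 9 in slot 2, the next parks + * 7 in slot 1, and the remaining entry 5 is already in slot 0, leaving the + * properly sorted array [5, 7, 9].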
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
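+ * (Concretely: with memtupcount near INT_MAX, 2 * i + 1 can reach + * UINT_MAX - 2, which would overflow a signed int but fits in unsigned + * int.)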
+ */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} +
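+/* + * A minimal sketch of the two-stage comparison pattern used above + * (illustrative only; it condenses the sortsupport calls made by + * comparetup_heap): + * + * compare = ApplySortComparator(a->datum1, a->isnull1, + * b->datum1, b->isnull1, sortKey); + * if (compare == 0 && sortKey->abbrev_converter) + * compare = ApplySortAbbrevFullComparator(datum1, isnull1, + * datum2, isnull2, sortKey); + * + * An abbreviated datum1 can prove inequality on its own, but equal + * abbreviations prove nothing, so the full comparator must break the tie + * before the remaining sort keys are consulted. + */ +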
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
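+ * (E.g. for an expression index on lower(name), both tuples' lower(name) + * values must be computed before they can be compared; FormIndexDatum + * performs that evaluation, using the per-tuple expression context that is + * reset below.)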
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_index() should not be called"); +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. 
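+ * + * Expected call sequence, as a sketch (toc/seg and the shm_toc_allocate() + * step stand in for whatever parallel-context setup the caller uses): + * + * leader: size = tuplesort_estimate_shared(nworkers); + * shared = (Sharedsort *) shm_toc_allocate(toc, size); + * tuplesort_initialize_shared(shared, nworkers, seg); + * worker: tuplesort_attach_shared(shared, seg); + *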
nWorkers should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. + */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This is called as an alternative to mergeruns() with a worker when no + * merging is required.
+ */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c. It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, false, + shared->tapes, &shared->fileset, + state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort14.c b/src/tuplesort14.c new file mode 100644 index 0000000000..85c8b10415 --- /dev/null +++ b/src/tuplesort14.c @@ -0,0 +1,4784 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. 
Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. 
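+ *
+ * To illustrate the caller's view of the above, a sketch only (the
+ * concrete begin/put/get variants are declared in tuplesort.h and defined
+ * below; "ts" and "slot" stand for a sort state and any caller-supplied
+ * TupleTableSlot):
+ *
+ *     Tuplesortstate *ts = tuplesort_begin_heap(tupDesc, nkeys, attNums,
+ *                                               sortOperators, collations,
+ *                                               nullsFirst, work_mem,
+ *                                               NULL, false);
+ *     while (<more input>)
+ *         tuplesort_puttupleslot(ts, slot);
+ *     tuplesort_performsort(ts);
+ *     while (tuplesort_gettupleslot(ts, true, false, slot, NULL))
+ *         <consume next sorted tuple>;
+ *     tuplesort_end(ts);
+ *
+ * Whether the sort completed in memory or spilled to tape is invisible to
+ * the caller.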
+ * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). 
SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and a source/input tape number that + * tracks which tape each heap element/slot belongs to during merging. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int srctape; /* source tape number */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). 
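+ *
+ * (Worked example, assuming the default 8kB BLCKSZ: each input tape then
+ * costs about MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD = 256kB + 8kB of
+ * workMem, so 64MB of workMem supports a merge order of roughly
+ * 64MB / 264kB ~= 248, comfortably below MAXORDER, while very small
+ * workMem settings are clamped up to MINORDER.)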
+ */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as ab. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. 
In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) 
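+ *
+ * (Illustrative: for a randomAccess sort, tuplesort_markpos() and
+ * tuplesort_restorepos() work by saving and reinstating exactly this
+ * state: "current" and "eof_reached" for in-memory sorts, or the tape
+ * read position via the markpos_xxx fields below.)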
+ */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. 
If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. 
This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on copytup routines to use the + * correct memory context for these tuples (and to not use the reset + * context for anything whose lifetime needs to span multiple external + * sort runs). readtup routines use the slab allocator (they cannot use + * the reset context because it gets deleted at the point that merging + * begins). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void 
writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ + +#define ST_SORT qsort_tuple +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE_RUNTIME_POINTER +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DECLARE +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_ssup +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, ssup) \ + ApplySortComparator((a)->datum1, (a)->isnull1, \ + (b)->datum1, (b)->isnull1, (ssup)) +#define ST_COMPARE_ARG_TYPE SortSupportData +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. 
The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->randomAccess = randomAccess; + state->tuples = true; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->sortcontext = sortcontext; + state->maincontext = maincontext; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; + + /* + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. + */ + tuplesort_begin_batch(state); + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state need for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void +tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
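+ *
+ * (Illustrative arithmetic: with the usual 8kB ALLOCSET_SEPARATE_THRESHOLD
+ * and a 24-byte SortTuple on 64-bit builds, INITIAL_MEMTUPSIZE comes out
+ * to Max(1024, 8192 / 24 + 1) = 1024 slots, i.e. a 24kB initial array.)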
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = indexRel->rd_indcollation[i]; + sortKey->ssup_nulls_first = false; + sortKey->ssup_attno = i + 1; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + /* Look for a sort support function */ + PrepareSortSupportFromGistIndexRel(indexRel, sortKey); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple as needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each SortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. 
(Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL && state->memtupcount == 0); + /* Can't set the bound twice, either */ + Assert(!state->bounded); + /* Also, this shouldn't be called in a parallel worker */ + Assert(!WORKER(state)); + + /* Parallel leader allows but ignores hint */ + if (LEADER(state)) + return; + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_used_bound + * + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory. 
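+ *
+ * (Note that the context is reset rather than deleted: tuplesort_reset()
+ * reuses it for the next batch, and tuplesort_end() later deletes
+ * maincontext, which takes the child sortcontext with it.)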
+ */ + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. 
Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. 
(We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). 
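+ * (Background note, not part of the original comment: an abbreviated key
+ * packs a prefix of the full value -- e.g. the first few bytes of a text
+ * string -- into a single Datum, so most comparisons become cheap integer
+ * compares, with ties resolved by the authoritative comparator.)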
+ */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. + */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. 
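+ * (Flow sketch, added for orientation: TSS_INITIAL accumulates tuples in
+ * memtuples[]; a bounded sort may convert that array into a top-N heap
+ * (TSS_BOUNDED), while memory overflow switches to tape via inittapes(),
+ * after which TSS_BUILDRUNS spills runs with dumptuples().)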
+ */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. + */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. 
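+ * (For example -- describing the stock text opclass, not this file: its
+ * abort callback estimates how many distinct abbreviated keys have been
+ * produced and gives up when nearly all prefixes collide; see
+ * varstr_abbrev_abort in varlena.c.)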
+ */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. 
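+ * (Implication for callers, stated explicitly: a tuple that must survive
+ * the next fetch has to be copied out first; compare the 'copy' flag of
+ * tuplesort_gettupleslot below.)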
+ */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). 
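+ * (Tape layout reminder, added for orientation: each tuple is written as
+ * a leading length word, the tuple body, and -- only when randomAccess --
+ * a trailing length word; backward scans navigate by backspacing over
+ * those trailing words, which is why their absence marks the first tuple.)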
+ */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].srctape; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.srctape = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
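+ * (Hypothetical call site, for illustration only:
+ *
+ * while (tuplesort_gettupleslot(state, true, false, slot, NULL))
+ * ... consume slot before the next call can invalidate it ...
+ * )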
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
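+ * (Hypothetical call site, for illustration only:
+ *
+ * Datum val;
+ * bool isnull;
+ *
+ * while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
+ * ... val is caller-owned here; pfree pass-by-ref values when done ...
+ * )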
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
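+ * (Plugging made-up numbers into the formula below: with allowedMem of
+ * 4 MB, MERGE_BUFFER_SIZE of 256 kB and TAPE_BUFFER_OVERHEAD of 24 kB --
+ * illustrative values only -- mOrder comes out around 14, before the
+ * MINORDER/MAXORDER clamps are applied.)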
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, false, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextResetOnly(state->tuplecontext); + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! + */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. 
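+ * (Mechanics, for orientation: init_slab_allocator above carves a single
+ * palloc'd arena into SLAB_SLOT_SIZE pieces threaded onto a freelist, and
+ * RELEASE_SLAB_SLOT pushes a piece back once its tuple has been consumed.)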
We need one slab slot per input tape,
+ * for the tuples in the heap, plus one to hold the tuple last returned
+ * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
+ * however, we don't need to allocate anything.)
+ *
+ * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
+ * to track memory usage of individual tuples.
+ */
+ if (state->tuples)
+ init_slab_allocator(state, numInputTapes + 1);
+ else
+ init_slab_allocator(state, 0);
+
+ /*
+ * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
+ * from each input tape.
+ */
+ state->memtupsize = numInputTapes;
+ state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
+ numInputTapes * sizeof(SortTuple));
+ USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+
+ /*
+ * Use all the remaining memory we have available for read buffers among
+ * the input tapes.
+ *
+ * We don't try to "rebalance" the memory among tapes when we start a new
+ * merge phase, even if some tapes are inactive in the new phase. That
+ * would be hard, because logtape.c doesn't know where one run ends and
+ * another begins. When a new merge phase begins, and a tape doesn't
+ * participate in it, its buffer nevertheless already contains tuples from
+ * the next run on the same tape, so we cannot release the buffer. That's
+ * OK in practice; merge performance isn't that sensitive to the number of
+ * buffers used, and most merge phases use all or almost all tapes,
+ * anyway.
+ */
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
+ state->worker, state->availMem / 1024, numInputTapes);
+#endif
+
+ state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
+ USEMEM(state, state->read_buffer_size * numInputTapes);
+
+ /* End of step D2: rewind all output tapes to prepare for merging */
+ for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
+ LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
+
+ for (;;)
+ {
+ /*
+ * At this point we know that tape[T] is empty. If there's just one
+ * (real or dummy) run left on each input tape, then only one merge
+ * pass remains. If we don't have to produce a materialized sorted
+ * tape, we can stop at this point and do the final merge on-the-fly.
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
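+ * (Background note: this is a standard k-way merge -- the binary heap
+ * keeps the smallest of the current head tuples at its root, so each
+ * output step costs O(log k) comparisons for k active input tapes.)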
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].srctape; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.srctape = srcTape; + tuplesort_heap_replace_top(state, &stup); + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.srctape = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
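+ * (Hypothetical consumer of these stats, in the style of EXPLAIN ANALYZE:
+ *
+ * TuplesortInstrumentation si;
+ * tuplesort_get_stats(state, &si);
+ * elog(LOG, "sort method: %s, space: %ld kB",
+ * tuplesort_method_name(si.sortMethod), (long) si.spaceUsed);
+ * )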
+ */ + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
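+ * (This is the extraction phase of classic heapsort: with the comparison
+ * direction still reversed, the root is the current largest entry, and it
+ * is moved into the slot vacated at the shrinking end of the array.)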
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
+ */
+ n = state->memtupcount;
+ i = 0; /* i is where the "hole" is */
+ for (;;)
+ {
+ unsigned int j = 2 * i + 1;
+
+ if (j >= n)
+ break;
+ if (j + 1 < n &&
+ COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
+ j++;
+ if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
+ break;
+ memtuples[i] = memtuples[j];
+ i = j;
+ }
+ memtuples[i] = *tuple;
+}
+
+/*
+ * Function to reverse the sort direction from its current state
+ *
+ * It is not safe to call this when performing hash tuplesorts
+ */
+static void
+reversedirection(Tuplesortstate *state)
+{
+ SortSupport sortKey = state->sortKeys;
+ int nkey;
+
+ for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
+ {
+ sortKey->ssup_reverse = !sortKey->ssup_reverse;
+ sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
+ }
+}
+
+
+/*
+ * Tape interface routines
+ */
+
+static unsigned int
+getlen(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+ unsigned int len;
+
+ if (LogicalTapeRead(state->tapeset, tapenum,
+ &len, sizeof(len)) != sizeof(len))
+ elog(ERROR, "unexpected end of tape");
+ if (len == 0 && !eofOK)
+ elog(ERROR, "unexpected end of data");
+ return len;
+}
+
+static void
+markrunend(Tuplesortstate *state, int tapenum)
+{
+ unsigned int len = 0;
+
+ LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
+}
+
+/*
+ * Get memory for tuple from within READTUP() routine.
+ *
+ * We use the next free slot from the slab allocator, or palloc() if the
+ * tuple is too large for that.
+ */
+static void *
+readtup_alloc(Tuplesortstate *state, Size tuplen)
+{
+ SlabSlot *buf;
+
+ /*
+ * We pre-allocate enough slots in the slab arena that we should never run
+ * out.
+ */
+ Assert(state->slabFreeHead);
+
+ if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
+ return MemoryContextAlloc(state->sortcontext, tuplen);
+ else
+ {
+ buf = state->slabFreeHead;
+ /* Reuse this slot */
+ state->slabFreeHead = buf->nextfree;
+
+ return buf;
+ }
+}
+
+
+/*
+ * Routines specialized for HeapTuple (actually MinimalTuple) case
+ */
+
+static int
+comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+ SortSupport sortKey = state->sortKeys;
+ HeapTupleData ltup;
+ HeapTupleData rtup;
+ TupleDesc tupDesc;
+ int nkey;
+ int32 compare;
+ AttrNumber attno;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
+
+ /* Compare the leading sort key */
+ compare = ApplySortComparator(a->datum1, a->isnull1,
+ b->datum1, b->isnull1,
+ sortKey);
+ if (compare != 0)
+ return compare;
+
+ /* Compare additional sort keys */
+ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+ ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
+ rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+ rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
+ tupDesc = state->tupDesc;
+
+ if (sortKey->abbrev_converter)
+ {
+ attno = sortKey->ssup_attno;
+
+ datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+ compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ if (compare != 0)
+ return compare;
+ }
+
+ sortKey++;
+ for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
+ {
+ attno = sortKey->ssup_attno;
+
+ datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+ compare = ApplySortComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ if (compare != 0)
+ return compare;
+ }
+
+ return 0;
+}
+
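+/*
+ * A note on the routines that follow (added commentary, not part of the
+ * original code): each supported tuple kind supplies the same quartet --
+ * comparetup_* orders two SortTuples, copytup_* captures a caller's tuple
+ * and primes datum1/isnull1, and writetup_*/readtup_* serialize to and
+ * from logical tape, emitting the trailing length word relied on by
+ * backward scans only when randomAccess is set.
+ */
+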
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
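+ *
+ * (This branch is reached for expression indexes, e.g.
+ *		CREATE INDEX ON tab (lower(name));
+ * whose key columns have ii_IndexAttrNumbers[] entries of 0, so the
+ * only way to obtain comparable values is to evaluate the expressions
+ * with FormIndexDatum, as done below.)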
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
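+ *
+ * (Abbreviated keys are lossy proxies; for instance, text's sortsupport
+ * packs a prefix of each string into datum1.  When the converter decides
+ * abbreviation is not paying off, typically because the abbreviated
+ * values are mostly duplicates, consider_abort_common() returns true and
+ * this branch rewrites datum1 in every tuple copied so far back to the
+ * full value.)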
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
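+ *
+ * (The "special treatment for equal keys" is the TID tiebreak at the
+ * bottom of this function: entries with equal key values still compare
+ * deterministically by the (block, offset) of their heap tuples, which
+ * is what lets btree treat heap TID as an implicit last key attribute.)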
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_index() should not be called"); +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. 
nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. + */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This called as an alternative to mergeruns() with a worker when no + * merging is required. 
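+ *
+ * (A worker always emits exactly one run, so when its input fit into a
+ * single quicksorted run there is nothing to merge: the run already
+ * sitting on the output tape simply becomes the result tape that
+ * worker_freeze_result_tape() publishes to the leader.)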
+ */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c. It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, false, + shared->tapes, &shared->fileset, + state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort15.c b/src/tuplesort15.c new file mode 100644 index 0000000000..37184cdcac --- /dev/null +++ b/src/tuplesort15.c @@ -0,0 +1,4939 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. 
Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about external + * sorting algorithms. The algorithm we use is a balanced k-way merge. + * Before PostgreSQL 15, we used the polyphase merge algorithm (Knuth's + * Algorithm 5.4.2D), but with modern hardware, a straightforward balanced + * merge is better. Knuth is assuming that tape drives are expensive + * beasts, and in particular that there will always be many more runs than + * tape drives. The polyphase merge algorithm was good at keeping all the + * tape drives busy, but in our implementation a "tape drive" doesn't cost + * much more than a few Kb of memory buffers, so we can afford to have + * lots of them. In particular, if we can have as many tape drives as + * sorted runs, we can eliminate any repeated I/O at all. + * + * Historically, we divided the input into sorted runs using replacement + * selection, in the form of a priority tree implemented as a heap + * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort + * for run generation. + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape. If we reach the max number of tapes, we write + * subsequent runs on the existing tapes in a round-robin fashion. We will + * need multiple merge passes to finish the merge in that case. After the + * end of the input is reached, we dump out remaining tuples in memory into + * a final run, then merge the runs. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. 
The pre-reading is handled by logtape.c, we just tell it how
+ * much memory to use for the buffers.
+ *
+ * In the current code we determine the number of input tapes M on the basis
+ * of workMem: we want workMem/M to be large enough that we read a fair
+ * amount of data each time we read from a tape, so as to maintain the
+ * locality of access described above. Nonetheless, with large workMem we
+ * can have many tapes. The logical "tapes" are implemented by logtape.c,
+ * which avoids space wastage by recycling disk space as soon as each block
+ * is read from its "tape".
+ *
+ * When the caller requests random access to the sort result, we form
+ * the final sorted run on a logical tape which is then "frozen", so
+ * that we can access it randomly. When the caller does not need random
+ * access, we return from tuplesort_performsort() as soon as we are down
+ * to one run per logical tape. The final merge is then performed
+ * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
+ * saves one cycle of writing all the data out to disk and reading it in.
+ *
+ * This module supports parallel sorting. Parallel sorts involve coordination
+ * among one or more worker processes, and a leader process, each with its own
+ * tuplesort state. The leader process (or, more accurately, the
+ * Tuplesortstate associated with a leader process) creates a full tapeset
+ * consisting of worker tapes with one run to merge; a run for every
+ * worker process. This is then merged. Worker processes are guaranteed to
+ * produce exactly one output run from their partial input.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/sort/tuplesort.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/hash.h"
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/tablespace.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/datum.h"
+#include "utils/logtape.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+/* Should be the last include */
+#include "disable_core_macro.h"
+
+/* sort-type codes for sort__start probes */
+#define HEAP_SORT 0
+#define INDEX_SORT 1
+#define DATUM_SORT 2
+#define CLUSTER_SORT 3
+
+/* Sort parallel code from state for sort__start probes */
+#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \
+ (state)->worker >= 0 ? 1 : 2)
+
+/*
+ * Initial size of memtuples array. We're trying to select this size so that
+ * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of
+ * allocation might possibly be lowered. However, we don't consider array sizes
+ * less than 1024.
+ *
+ */
+#define INITIAL_MEMTUPSIZE Max(1024, \
+ ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
+
+/* GUC variables */
+#ifdef TRACE_SORT
+bool trace_sort = false;
+#endif
+
+#ifdef DEBUG_BOUNDED_SORT
+bool optimize_bounded_sort = true;
+#endif
+
+
+/*
+ * The objects we actually sort are SortTuple structs.
These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and a source/input tape number that + * tracks which tape each heap element/slot belongs to during merging. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int srctape; /* source tape number */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much buffer space we'd like to allocate for each + * input tape, for pre-reading (see discussion at top of file). 
This is *in
+ * addition to* the 1 block already included in TAPE_BUFFER_OVERHEAD.
+ */
+#define MINORDER 6 /* minimum merge order */
+#define MAXORDER 500 /* maximum merge order */
+#define TAPE_BUFFER_OVERHEAD BLCKSZ
+#define MERGE_BUFFER_SIZE (BLCKSZ * 32)
+
+typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
+
+/*
+ * Private state of a Tuplesort operation.
+ */
+struct Tuplesortstate
+{
+ TupSortStatus status; /* enumerated value as shown above */
+ int nKeys; /* number of columns in sort key */
+ int sortopt; /* Bitmask of flags used to setup sort */
+ bool bounded; /* did caller specify a maximum number of
+ * tuples to return? */
+ bool boundUsed; /* true if we made use of a bounded heap */
+ int bound; /* if bounded, the maximum number of tuples */
+ bool tuples; /* Can SortTuple.tuple ever be set? */
+ int64 availMem; /* remaining memory available, in bytes */
+ int64 allowedMem; /* total memory allowed, in bytes */
+ int maxTapes; /* max number of input tapes to merge in each
+ * pass */
+ int64 maxSpace; /* maximum amount of space occupied among sort
+ * of groups, either in-memory or on-disk */
+ bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk
+ * space, false when it's value for in-memory
+ * space */
+ TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
+ MemoryContext maincontext; /* memory context for tuple sort metadata that
+ * persists across multiple batches */
+ MemoryContext sortcontext; /* memory context holding most sort data */
+ MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
+ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
+
+ /*
+ * These function pointers decouple the routines that must know what kind
+ * of tuple we are sorting from the routines that don't need to know it.
+ * They are set up by the tuplesort_begin_xxx routines.
+ *
+ * Function to compare two tuples; result is per qsort() convention, ie:
+ * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
+ * qsort_arg_comparator.
+ */
+ SortTupleComparator comparetup;
+
+ /*
+ * Function to copy a supplied input tuple into palloc'd space and set up
+ * its SortTuple representation (ie, set tuple/datum1/isnull1). Also,
+ * state->availMem must be decreased by the amount of space used for the
+ * tuple copy (note the SortTuple struct itself is not counted).
+ */
+ void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
+
+ /*
+ * Function to write a stored tuple onto tape. The representation of the
+ * tuple on tape need not be the same as it is in memory; requirements on
+ * the tape representation are given below. Unless the slab allocator is
+ * used, after writing the tuple, pfree() the out-of-line data (not the
+ * SortTuple struct!), and increase state->availMem by the amount of
+ * memory space thereby released.
+ */
+ void (*writetup) (Tuplesortstate *state, LogicalTape *tape,
+ SortTuple *stup);
+
+ /*
+ * Function to read a stored tuple from tape back into memory. 'len' is
+ * the already-read length of the stored tuple. The tuple is allocated
+ * from the slab memory arena, or is palloc'd, see readtup_alloc().
+ */
+ void (*readtup) (Tuplesortstate *state, SortTuple *stup,
+ LogicalTape *tape, unsigned int len);
+
+ /*
+ * Whether SortTuple's datum1 and isnull1 members are maintained by the
+ * above routines. If not, some sort specializations are disabled.
+ */
+ bool haveDatum1;
+
+ /*
+ * This array holds the tuples now in sort memory.
If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Memory used for input and output tape buffers. */ + size_t tape_buffer_mem; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Logical tapes, for merging. + * + * The initial runs are written in the output tapes. In each merge pass, + * the output tapes of the previous pass become the input tapes, and new + * output tapes are created as needed. When nInputTapes equals + * nInputRuns, there is only one merge pass left. + */ + LogicalTape **inputTapes; + int nInputTapes; + int nInputRuns; + + LogicalTape **outputTapes; + int nOutputTapes; + int nOutputRuns; + + LogicalTape *destTape; /* current output tape */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) 
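+ *
+ * (The markpos_* fields back tuplesort_markpos() and
+ * tuplesort_restorepos(), which merge joins use to rescan a group of
+ * equal tuples; they are only meaningful when the sort was started with
+ * TUPLESORT_RANDOMACCESS.)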
+ */ + LogicalTape *result_tape; /* actual tape of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run that the leader needs to merge. Typically includes a + * worker state held by the leader process itself. Set in the leader + * Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise, it's NULL. The + * presence of a value in this field is also checked by various sort + * specialization functions as an optimization when comparing the leading + * key in a tiebreak situation to determine if there are any subsequent + * keys to sort on. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. 
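+ *
+ * A sketch of the expected setup sequence, following the parallel-sort
+ * description at the top of this file (shm_toc_allocate() stands in for
+ * however the caller actually carves out the shared memory):
+ *
+ *	leader:  size = tuplesort_estimate_shared(nworkers);
+ *	         shared = (Sharedsort *) shm_toc_allocate(toc, size);
+ *	         tuplesort_initialize_shared(shared, nworkers, seg);
+ *	worker:  tuplesort_attach_shared(shared, seg);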
+ */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->sortopt contains TUPLESORT_RANDOMACCESS, then the stored + * representation of the tuple must be followed by another "unsigned int" that + * is a copy of the length --- so the total tape space used is actually + * sizeof(unsigned int) more than the stored length value. This allows + * read-backwards. When the random access flag was not specified, the + * write/read routines may omit the extra length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. 
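+ *
+ * (Worked example: a tuple whose on-tape body is 20 bytes is written as
+ * the unsigned int 24 (the 20 bytes plus the 4-byte length word itself),
+ * followed by the 20 data bytes; under TUPLESORT_RANDOMACCESS a trailing
+ * copy of 24 follows, for 28 bytes of tape in total.  A zero length word
+ * marks the end of a run.)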
+ * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on copytup routines to use the + * correct memory context for these tuples (and to not use the reset + * context for anything whose lifetime needs to span multiple external + * sort runs). readtup routines use the slab allocator (they cannot use + * the reset context because it gets deleted at the point that merging + * begins). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tape, ptr, len) \ + do { \ + if (LogicalTapeRead(tape, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + int sortopt); +static void tuplesort_begin_batch(Tuplesortstate *state); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(LogicalTape *tape, bool eofOK); +static void markrunend(LogicalTape *tape); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, 
SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); + +/* + * Specialized comparators that we can inline into specialized sorts. The goal + * is to try to sort two tuples without having to follow the pointers to the + * comparator or the tuple. + * + * XXX: For now, these fall back to comparator functions that will compare the + * leading datum a second time. + * + * XXX: For now, there is no specialization for cases where datum1 is + * authoritative and we don't even need to fall back to a callback at all (that + * would be true for types like int4/int8/timestamp/date, but not true for + * abbreviations of text or multi-key sorts. There could be! Is it worth it? + */ + +/* Used if first key's comparator is ssup_datum_unsigned_compare */ +static pg_attribute_always_inline int +qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + &state->sortKeys[0]); + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->onlyKey != NULL) + return 0; + + return state->comparetup(a, b, state); +} + +#if SIZEOF_DATUM >= 8 +/* Used if first key's comparator is ssup_datum_signed_compare */ +static pg_attribute_always_inline int +qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySignedSortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + &state->sortKeys[0]); + + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. 
+ */ + if (state->onlyKey != NULL) + return 0; + + return state->comparetup(a, b, state); +} +#endif + +/* Used if first key's comparator is ssup_datum_int32_compare */ +static pg_attribute_always_inline int +qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplyInt32SortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + &state->sortKeys[0]); + + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->onlyKey != NULL) + return 0; + + return state->comparetup(a, b, state); +} + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. qsort_tuple_{unsigned,signed,int32} are specialized for + * common comparison functions on pass-by-value leading datums. + */ + +#define ST_SORT qsort_tuple_unsigned +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +#if SIZEOF_DATUM >= 8 +#define ST_SORT qsort_tuple_signed +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" +#endif + +#define ST_SORT qsort_tuple_int32 +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_tuple +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE_RUNTIME_POINTER +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DECLARE +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_ssup +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, ssup) \ + ApplySortComparator((a)->datum1, (a)->isnull1, \ + (b)->datum1, (b)->isnull1, (ssup)) +#define ST_COMPARE_ARG_TYPE SortSupportData +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a sortopt which is a bitmask of + * sort options. 
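+ *
+ * As an illustrative sketch only (not code added by this patch), a
+ * heap-tuple caller follows this shape; fetch_next_slot() and process()
+ * are hypothetical stand-ins, and TUPLESORT_NONE is assumed to be the
+ * "no options" flag value:
+ *
+ *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums, sortOperators,
+ *									 sortCollations, nullsFirstFlags,
+ *									 work_mem, NULL, TUPLESORT_NONE);
+ *		while ((slot = fetch_next_slot()) != NULL)
+ *			tuplesort_puttupleslot(state, slot);
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_gettupleslot(state, true, false, slot, NULL))
+ *			process(slot);
+ *		tuplesort_end(state);
+ *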
See TUPLESORT_* definitions in tuplesort.h + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on random access support */ + if (coordinate && (sortopt & TUPLESORT_RANDOMACCESS)) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->sortopt = sortopt; + state->tuples = true; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->sortcontext = sortcontext; + state->maincontext = maincontext; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; + + /* + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. + */ + tuplesort_begin_batch(state); + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state need for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void +tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. 
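+	 * (All tuples copied since the last reset can then be released in a
+	 * single cheap context operation.)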
Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. For bounded sorts, tuples may be pfreed in any + * order, so we use a regular aset.c context so that it can make use of + * free'd memory. When the sort is not bounded, we make use of a + * generation.c context as this keeps allocations more compact with less + * wastage. Allocations are also slightly more CPU efficient. + */ + if (state->sortopt & TUPLESORT_ALLOWBOUNDED) + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + else + state->tuplecontext = GenerationContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * Tape variables (inputTapes, outputTapes, etc.) will be initialized by + * inittapes(), if needed. + */ + + state->result_tape = NULL; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, sortopt & TUPLESORT_RANDOMACCESS ? 
't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + state->haveDatum1 = true; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + /* + * If we don't have a simple leading attribute, we don't currently + * initialize datum1, so disable optimizations that require it. + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + state->haveDatum1 = false; + else + state->haveDatum1 = true; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. 
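+		 *
+		 * (Sketch of the expected use at compare time, with hypothetical
+		 * local names, mirroring other FormIndexDatum callers:
+		 *
+		 *		ExecStoreHeapTuple(tup, econtext->ecxt_scantuple, false);
+		 *		FormIndexDatum(indexInfo, econtext->ecxt_scantuple,
+		 *					   estate, values, isnull);
+		 *
+		 * fills values[] and isnull[] with the computed index columns.)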
+ */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + bool uniqueNullsNotDistinct, + int workMem, + SortCoordinate coordinate, + int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + state->haveDatum1 = true; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + state->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? 
+ BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, + sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->haveDatum1 = true; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + int workMem, + SortCoordinate coordinate, + int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: workMem = %d, randomAccess = %c", + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->haveDatum1 = true; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = indexRel->rd_indcollation[i]; + sortKey->ssup_nulls_first = false; + sortKey->ssup_attno = i + 1; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + AssertState(sortKey->ssup_attno != 0); + + /* Look for a sort support function */ + PrepareSortSupportFromGistIndexRel(indexRel, sortKey); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + state->haveDatum1 = true; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple as needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each SortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL && state->memtupcount == 0); + /* Assert we allow bounded sorts */ + Assert(state->sortopt & TUPLESORT_ALLOWBOUNDED); + /* Can't set the bound twice, either */ + Assert(!state->bounded); + /* Also, this shouldn't be called in a parallel worker */ + Assert(!WORKER(state)); + + /* Parallel leader allows but ignores hint */ + if (LEADER(state)) + return; + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. 
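+	 *
+	 * (Caller sketch, illustrative only: a top-N sort passes
+	 * TUPLESORT_ALLOWBOUNDED at begin time and then, before inserting any
+	 * tuples, calls
+	 *
+	 *		tuplesort_set_bound(state, 100);
+	 *
+	 * letting the sort discard everything past the first 100 tuples.)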
+ */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_used_bound + * + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + * + * We don't bother to destroy the individual tapes here. They will go away + * with the sortcontext. (In TSS_FINALMERGE state, we have closed + * finished tapes already.) + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory. + */ + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. 
This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. 
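+		 * (For scale: with 8MB of availMem and a 24-byte SortTuple, as on
+		 * typical 64-bit builds, that ceiling is about 349,000 elements.)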
In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
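+ * The copy is made in memory the sort controls (see COPYTUP) and is charged
+ * against the workMem budget.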
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
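+ *
+ * (Illustrative datum-sort sketch, not code added by this patch; the int8
+ * "<" operator OID is assumed available as Int8LessOperator:
+ *
+ *		state = tuplesort_begin_datum(INT8OID, Int8LessOperator, InvalidOid,
+ *									  false, work_mem, NULL, TUPLESORT_NONE);
+ *		tuplesort_putdatum(state, Int64GetDatum(x), false);
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
+ *			process_datum(val);
+ *		tuplesort_end(state);
+ *
+ * where process_datum() stands in for the caller's consumption step.)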
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
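+			 *
+			 * (Example: with bound = 100 we switch once the 201st tuple
+			 * arrives, or as soon as workMem fills while more than 100
+			 * tuples are in memory.)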
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
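+ *
+ * For reference, the status transitions below are: TSS_INITIAL becomes
+ * TSS_SORTEDINMEM for a serial in-memory sort (parallel workers dump their
+ * tuples to tape and end in TSS_SORTEDONTAPE); TSS_BOUNDED becomes
+ * TSS_SORTEDINMEM once the heap is unwound; and TSS_BUILDRUNS ends in
+ * whatever mergeruns() selects, normally TSS_SORTEDONTAPE or TSS_FINALMERGE.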
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->nInputTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->sortopt & TUPLESORT_RANDOMACCESS); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
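+				 * (A bounded sort may have discarded tuples beyond the
+				 * bound, so quietly returning EOF would present a truncated
+				 * result as if it were complete.)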
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->sortopt & TUPLESORT_RANDOMACCESS); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. 
(This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTapeIndex = state->memtuples[0].srctape; + LogicalTape *srcTape = state->inputTapes[srcTapeIndex]; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + state->nInputRuns--; + + /* + * Close the tape. It'd go away at the end of the sort + * anyway, but better to release the memory early. + */ + LogicalTapeClose(srcTape); + return true; + } + newtup.srctape = srcTapeIndex; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. + */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. 
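+ * (In core PostgreSQL this entry point serves table-rewrite callers such as
+ * CLUSTER.)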
+ * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. + */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. 
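+	 * (In core PostgreSQL the typical caller is an ordered-set aggregate
+	 * such as percentile_disc, which skips straight to the K'th tuple
+	 * instead of fetching and discarding each one.)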
+ */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /*---------- + * In the merge phase, we need buffer space for each input and output tape. + * Each pass in the balanced merge algorithm reads from M input tapes, and + * writes to N output tapes. Each tape consumes TAPE_BUFFER_OVERHEAD bytes + * of memory. In addition to that, we want MERGE_BUFFER_SIZE workspace per + * input tape. + * + * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) + + * N * TAPE_BUFFER_OVERHEAD + * + * Except for the last and next-to-last merge passes, where there can be + * fewer tapes left to process, M = N. We choose M so that we have the + * desired amount of memory available for the input buffers + * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory + * available for the tape buffers (allowedMem). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + *---------- + */ + mOrder = allowedMem / + (2 * TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a multi-pass merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * Helper function to calculate how much memory to allocate for the read buffer + * of each input tape in a merge pass. + * + * 'avail_mem' is the amount of memory available for the buffers of all the + * tapes, both input and output. 
+ * 'nInputTapes' and 'nInputRuns' are the number of input tapes and runs. + * 'maxOutputTapes' is the max. number of output tapes we should produce. + */ +static int64 +merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns, + int maxOutputTapes) +{ + int nOutputRuns; + int nOutputTapes; + + /* + * How many output tapes will we produce in this pass? + * + * This is nInputRuns / nInputTapes, rounded up. + */ + nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes; + + nOutputTapes = Min(nOutputRuns, maxOutputTapes); + + /* + * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory. All + * remaining memory is divided evenly between the input tapes. + * + * This also follows from the formula in tuplesort_merge_order, but here + * we derive the input buffer size from the amount of memory available, + * and M and N. + */ + return Max((avail_mem - TAPE_BUFFER_OVERHEAD * nOutputTapes) / nInputTapes, 0); +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of input tapes to use when merging */ + state->maxTapes = tuplesort_merge_order(state->allowedMem); + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + state->maxTapes = MINORDER; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, state->maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set */ + inittapestate(state, state->maxTapes); + state->tapeset = + LogicalTapeSetCreate(false, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize logical tape arrays. + */ + state->inputTapes = NULL; + state->nInputTapes = 0; + state->nInputRuns = 0; + + state->outputTapes = palloc0(state->maxTapes * sizeof(LogicalTape *)); + state->nOutputTapes = 0; + state->nOutputRuns = 0; + + state->status = TSS_BUILDRUNS; + + selectnewtape(state); +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); +} + +/* + * selectnewtape -- select next tape to output to. + * + * This is called after finishing a run when we know another run + * must be started. This is used both when building the initial + * runs, and during merge passes. 
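+ *
+ * (Editor's illustration: with maxTapes = 4, the first four calls create
+ * tapes 0..3; the fifth call reuses tape 0, since
+ * outputTapes[4 % 4] == outputTapes[0], and later runs continue round
+ * robin through tapes 1, 2, 3, 0, ...)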
+ */
+static void
+selectnewtape(Tuplesortstate *state)
+{
+ /*
+ * At the beginning of each merge pass, nOutputTapes and nOutputRuns are
+ * both zero. On each call, we create a new output tape to hold the next
+ * run, until maxTapes is reached. After that, we assign new runs to the
+ * existing tapes in a round robin fashion.
+ */
+ if (state->nOutputTapes < state->maxTapes)
+ {
+ /* Create a new tape to hold the next run */
+ Assert(state->outputTapes[state->nOutputRuns] == NULL);
+ Assert(state->nOutputRuns == state->nOutputTapes);
+ state->destTape = LogicalTapeCreate(state->tapeset);
+ state->outputTapes[state->nOutputTapes] = state->destTape;
+ state->nOutputTapes++;
+ state->nOutputRuns++;
+ }
+ else
+ {
+ /*
+ * We have reached the max number of tapes. Append to an existing
+ * tape.
+ */
+ state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes];
+ state->nOutputRuns++;
+ }
+}
+
+/*
+ * Initialize the slab allocation arena, for the given number of slots.
+ */
+static void
+init_slab_allocator(Tuplesortstate *state, int numSlots)
+{
+ if (numSlots > 0)
+ {
+ char *p;
+ int i;
+
+ state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
+ state->slabMemoryEnd = state->slabMemoryBegin +
+ numSlots * SLAB_SLOT_SIZE;
+ state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
+ USEMEM(state, numSlots * SLAB_SLOT_SIZE);
+
+ p = state->slabMemoryBegin;
+ for (i = 0; i < numSlots - 1; i++)
+ {
+ ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
+ p += SLAB_SLOT_SIZE;
+ }
+ ((SlabSlot *) p)->nextfree = NULL;
+ }
+ else
+ {
+ state->slabMemoryBegin = state->slabMemoryEnd = NULL;
+ state->slabFreeHead = NULL;
+ }
+ state->slabAllocatorUsed = true;
+}
+
+/*
+ * mergeruns -- merge all the completed initial runs.
+ *
+ * This implements the Balanced k-Way Merge Algorithm. All input data has
+ * already been written to initial runs on tape (see dumptuples).
+ */
+static void
+mergeruns(Tuplesortstate *state)
+{
+ int tapenum;
+
+ Assert(state->status == TSS_BUILDRUNS);
+ Assert(state->memtupcount == 0);
+
+ if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
+ {
+ /*
+ * If there are multiple runs to be merged, when we go to read back
+ * tuples from disk, abbreviated keys will not have been stored, and
+ * we don't care to regenerate them. Disable abbreviation from this
+ * point on.
+ */
+ state->sortKeys->abbrev_converter = NULL;
+ state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+ /* Not strictly necessary, but be tidy */
+ state->sortKeys->abbrev_abort = NULL;
+ state->sortKeys->abbrev_full_comparator = NULL;
+ }
+
+ /*
+ * Reset tuple memory. We've freed all the tuples that we previously
+ * allocated. We will use the slab allocator from now on.
+ */
+ MemoryContextResetOnly(state->tuplecontext);
+
+ /*
+ * We no longer need a large memtuples array. (We will allocate a smaller
+ * one for the heap later.)
+ */
+ FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
+ pfree(state->memtuples);
+ state->memtuples = NULL;
+
+ /*
+ * Initialize the slab allocator. We need one slab slot per input tape,
+ * for the tuples in the heap, plus one to hold the tuple last returned
+ * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
+ * however, we don't need to allocate anything.)
+ *
+ * In a multi-pass merge, we could shrink this allocation for the last
+ * merge pass, if it has fewer tapes than previous passes, but we don't
+ * bother.
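+ *
+ * (Editor's illustration: a merge reading from 6 input tapes therefore
+ * gets 7 slots, 6 for the tuples held in the merge heap plus 1 for the
+ * tuple most recently handed back by tuplesort_gettuple.)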
+ * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, state->nOutputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + * + * We could shrink this, too, between passes in a multi-pass merge, but we + * don't bother. (The initial input tapes are still in outputTapes. The + * number of input tapes will not increase between passes.) + */ + state->memtupsize = state->nOutputTapes; + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + state->nOutputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for tape buffers among + * all the input tapes. At the beginning of each merge pass, we will + * divide this memory between the input and output tapes in the pass. + */ + state->tape_buffer_mem = state->availMem; + USEMEM(state, state->tape_buffer_mem); +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using %zu KB of memory for tape buffers", + state->worker, state->tape_buffer_mem / 1024); +#endif + + for (;;) + { + /* + * On the first iteration, or if we have read all the runs from the + * input tapes in a multi-pass merge, it's time to start a new pass. + * Rewind all the output tapes, and make them inputs for the next + * pass. + */ + if (state->nInputRuns == 0) + { + int64 input_buffer_size; + + /* Close the old, emptied, input tapes */ + if (state->nInputTapes > 0) + { + for (tapenum = 0; tapenum < state->nInputTapes; tapenum++) + LogicalTapeClose(state->inputTapes[tapenum]); + pfree(state->inputTapes); + } + + /* Previous pass's outputs become next pass's inputs. */ + state->inputTapes = state->outputTapes; + state->nInputTapes = state->nOutputTapes; + state->nInputRuns = state->nOutputRuns; + + /* + * Reset output tape variables. The actual LogicalTapes will be + * created as needed, here we only allocate the array to hold + * them. + */ + state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *)); + state->nOutputTapes = 0; + state->nOutputRuns = 0; + + /* + * Redistribute the memory allocated for tape buffers, among the + * new input and output tapes. + */ + input_buffer_size = merge_read_buffer_size(state->tape_buffer_mem, + state->nInputTapes, + state->nInputRuns, + state->maxTapes); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s", + state->nInputRuns, state->nInputTapes, input_buffer_size / 1024, + pg_rusage_show(&state->ru_start)); +#endif + + /* Prepare the new input tapes for merge pass. */ + for (tapenum = 0; tapenum < state->nInputTapes; tapenum++) + LogicalTapeRewindForRead(state->inputTapes[tapenum], input_buffer_size); + + /* + * If there's just one run left on each input tape, then only one + * merge pass remains. If we don't have to produce a materialized + * sorted tape, we can stop at this point and do the final merge + * on-the-fly. 
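+ *
+ * (Editor's illustration: starting from 10 initial runs on 4 tapes, one
+ * pass reduces them to ceil(10/4) = 3 runs; with 3 runs on 3 tapes,
+ * nInputRuns <= nInputTapes then holds and, absent the random-access
+ * requirement, the final merge can proceed on-the-fly.)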
+ */ + if ((state->sortopt & TUPLESORT_RANDOMACCESS) == 0 + && state->nInputRuns <= state->nInputTapes + && !WORKER(state)) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Select an output tape */ + selectnewtape(state); + + /* Merge one run from each input tape. */ + mergeonerun(state); + + /* + * If the input tapes are empty, and we output only one output run, + * we're done. The current output tape contains the final result. + */ + if (state->nInputRuns == 0 && state->nOutputRuns <= 1) + break; + } + + /* + * Done. The result is on a single run on a single tape. + */ + state->result_tape = state->outputTapes[0]; + if (!WORKER(state)) + LogicalTapeFreeze(state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Close all the now-empty input tapes, to release their read buffers. */ + for (tapenum = 0; tapenum < state->nInputTapes; tapenum++) + LogicalTapeClose(state->inputTapes[tapenum]); +} + +/* + * Merge one run from each input tape. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int srcTapeIndex; + LogicalTape *srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. + */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTapeIndex = state->memtuples[0].srctape; + srcTape = state->inputTapes[srcTapeIndex]; + WRITETUP(state, state->destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.srctape = srcTapeIndex; + tuplesort_heap_replace_top(state, &stup); + } + else + { + tuplesort_heap_delete_top(state); + state->nInputRuns--; + } + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape. + */ + markrunend(state->destTape); +} + +/* + * beginmerge - initialize for a merge pass + * + * Fill the merge heap with the first tuple from each input tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int srcTapeIndex; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + activeTapes = Min(state->nInputTapes, state->nInputRuns); + + for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++) + { + SortTuple tup; + + if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup)) + { + tup.srctape = srcTapeIndex; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. 
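+ *
+ * (Editor's note: "EOF" here means the zero length word written by
+ * markrunend(), i.e. the end of the current run, not necessarily the
+ * physical end of the tape, which would instead raise an error in
+ * getlen().)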
+ */ +static bool +mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + /* read next tuple, if any */ + if ((tuplen = getlen(srcTape, true)) == 0) + return false; + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. In general, short final runs are quite possible, but avoid + * creating a completely empty run. In a worker, though, we must produce + * at least one tape, even if it's empty. + */ + if (state->memtupcount == 0 && state->currentRun > 0) + return; + + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + if (state->currentRun > 0) + selectnewtape(state); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->destTape, &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
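+ *
+ * (Editor's illustration, assuming the usual AllocSet behavior: if one
+ * run held ~64-byte tuples and the next holds ~1 kB tuples, chunks freed
+ * onto the small size-class freelists could never satisfy the larger
+ * requests; the reset returns that memory wholesale instead.)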
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state->destTape); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1, + pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->sortopt & TUPLESORT_RANDOMACCESS); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->result_tape, 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->sortopt & TUPLESORT_RANDOMACCESS); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->sortopt & TUPLESORT_RANDOMACCESS); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
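+ *
+ * (Editor's sketch of a hypothetical caller, e.g. instrumentation code;
+ * spaceUsed is reported in kilobytes, and the two name helpers are
+ * defined just below:
+ *
+ *     TuplesortInstrumentation stats;
+ *
+ *     tuplesort_get_stats(state, &stats);
+ *     elog(LOG, "sort used %s: %ld kB (%s)",
+ *          tuplesort_space_type_name(stats.spaceType),
+ *          (long) stats.spaceUsed,
+ *          tuplesort_method_name(stats.sortMethod));
+ * )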
+ */ + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
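+ *
+ * (Editor's illustration: with the sort direction reversed, the root is
+ * always the largest remaining tuple, so moving each deleted root into
+ * the slot freed at the end produces an ascending array, exactly as in
+ * classic heapsort.)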
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* + * Do we have the leading column's value or abbreviation in datum1, + * and is there a specialization for its comparator? + */ + if (state->haveDatum1 && state->sortKeys) + { + if (state->sortKeys[0].comparator == ssup_datum_unsigned_cmp) + { + qsort_tuple_unsigned(state->memtuples, + state->memtupcount, + state); + return; + } +#if SIZEOF_DATUM >= 8 + else if (state->sortKeys[0].comparator == ssup_datum_signed_cmp) + { + qsort_tuple_signed(state->memtuples, + state->memtupcount, + state); + return; + } +#endif + else if (state->sortKeys[0].comparator == ssup_datum_int32_cmp) + { + qsort_tuple_int32(state->memtuples, + state->memtupcount, + state); + return; + } + } + + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + { + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + } + else + { + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. 
+ * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. + */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(LogicalTape *tape, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(tape, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(LogicalTape *tape) +{ + unsigned int len = 0; + + LogicalTapeWrite(tape, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. 
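+ *
+ * (Editor's note: tuples larger than SLAB_SLOT_SIZE still take the
+ * palloc() path below; RELEASE_SLAB_SLOT later recognizes such
+ * out-of-arena pointers and pfree()s them instead of pushing them onto
+ * the freelist.)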
+ */
+ Assert(state->slabFreeHead);
+
+ if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
+ return MemoryContextAlloc(state->sortcontext, tuplen);
+ else
+ {
+ buf = state->slabFreeHead;
+ /* Reuse this slot */
+ state->slabFreeHead = buf->nextfree;
+
+ return buf;
+ }
+}
+
+
+/*
+ * Routines specialized for HeapTuple (actually MinimalTuple) case
+ */
+
+static int
+comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+ SortSupport sortKey = state->sortKeys;
+ HeapTupleData ltup;
+ HeapTupleData rtup;
+ TupleDesc tupDesc;
+ int nkey;
+ int32 compare;
+ AttrNumber attno;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
+
+ /* Compare the leading sort key */
+ compare = ApplySortComparator(a->datum1, a->isnull1,
+ b->datum1, b->isnull1,
+ sortKey);
+ if (compare != 0)
+ return compare;
+
+ /* Compare additional sort keys */
+ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+ ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
+ rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+ rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
+ tupDesc = state->tupDesc;
+
+ if (sortKey->abbrev_converter)
+ {
+ attno = sortKey->ssup_attno;
+
+ datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+ compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ if (compare != 0)
+ return compare;
+ }
+
+ sortKey++;
+ for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
+ {
+ attno = sortKey->ssup_attno;
+
+ datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+ compare = ApplySortComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ if (compare != 0)
+ return compare;
+ }
+
+ return 0;
+}
+
+static void
+copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
+{
+ /*
+ * We expect the passed "tup" to be a TupleTableSlot, and form a
+ * MinimalTuple using the exported interface for that.
+ */
+ TupleTableSlot *slot = (TupleTableSlot *) tup;
+ Datum original;
+ MinimalTuple tuple;
+ HeapTupleData htup;
+ MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
+
+ /* copy the tuple into sort storage */
+ tuple = ExecCopySlotMinimalTuple(slot);
+ stup->tuple = (void *) tuple;
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* set up first-column key value */
+ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
+ htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
+ original = heap_getattr(&htup,
+ state->sortKeys[0].ssup_attno,
+ state->tupDesc,
+ &stup->isnull1);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ if (!state->sortKeys->abbrev_converter || stup->isnull1)
+ {
+ /*
+ * Store ordinary Datum representation, or NULL value. If there is a
+ * converter it won't expect NULL values, and cost model is not
+ * required to account for NULL, so in that case we avoid calling
+ * converter and just set datum1 to zeroed representation (to be
+ * consistent, and to support cheap inequality tests for NULL
+ * abbreviated keys).
+ */
+ stup->datum1 = original;
+ }
+ else if (!consider_abort_common(state))
+ {
+ /* Store abbreviated key representation */
+ stup->datum1 = state->sortKeys->abbrev_converter(original,
+ state->sortKeys);
+ }
+ else
+ {
+ /* Abort abbreviation */
+ int i;
+
+ stup->datum1 = original;
+
+ /*
+ * Set state to be consistent with never trying abbreviation.
+ * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(tape, (void *) tupbody, tupbodylen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(tape, tupbody, tupbodylen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? 
*/ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (state->haveDatum1) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
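+ *
+ * (Editor's note: FormIndexDatum evaluates every indexed expression for
+ * a tuple in a single call, which is why both sides are formed in full
+ * before the per-key comparison loop below.)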
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (!state->haveDatum1) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); + LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(tape, tuple->t_data, tuple->t_len); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? 
*/ + LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->haveDatum1) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. + */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field and NULLS + * NOT DISTINCT was not set). 
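+ *
+ * (Editor's note: under the default NULLS DISTINCT behavior, tuples that
+ * compare as equal only because both contain a NULL key are therefore
+ * not treated as duplicates.)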
+ * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !(!state->uniqueNullsNotDistinct && equal_hasnull)) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_index() should not be called"); +} + +static void +writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(tape, (void *) tuple, IndexTupleSize(tuple)); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(tape, tuple, tuplen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(tape, waddr, tuplen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? 
*/ + LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(tape, &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(tape, raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. 
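+ *
+ * (Editor's illustration: with three workers the identifiers handed out
+ * below are simply 0, 1 and 2, in whatever order the workers happen to
+ * attach, while the leader and any serial sort report -1.)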
+ */
+static int
+worker_get_identifier(Tuplesortstate *state)
+{
+ Sharedsort *shared = state->shared;
+ int worker;
+
+ Assert(WORKER(state));
+
+ SpinLockAcquire(&shared->mutex);
+ worker = shared->currentWorker++;
+ SpinLockRelease(&shared->mutex);
+
+ return worker;
+}
+
+/*
+ * worker_freeze_result_tape - freeze worker's result tape for leader
+ *
+ * This is called by workers just after the result tape has been determined,
+ * instead of calling LogicalTapeFreeze() directly. They do so because
+ * workers require a few additional steps over similar serial
+ * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra
+ * steps are around freeing now unneeded resources, and representing to
+ * leader that worker's input run is available for its merge.
+ *
+ * There should only be one final output run for each worker, which consists
+ * of all tuples that were originally input into worker.
+ */
+static void
+worker_freeze_result_tape(Tuplesortstate *state)
+{
+ Sharedsort *shared = state->shared;
+ TapeShare output;
+
+ Assert(WORKER(state));
+ Assert(state->result_tape != NULL);
+ Assert(state->memtupcount == 0);
+
+ /*
+ * Free most remaining memory, in case caller is sensitive to our holding
+ * on to it. memtuples may not be a tiny merge heap at this point.
+ */
+ pfree(state->memtuples);
+ /* Be tidy */
+ state->memtuples = NULL;
+ state->memtupsize = 0;
+
+ /*
+ * Parallel worker requires result tape metadata, which is to be stored in
+ * shared memory for leader
+ */
+ LogicalTapeFreeze(state->result_tape, &output);
+
+ /* Store properties of output tape, and update finished worker count */
+ SpinLockAcquire(&shared->mutex);
+ shared->tapes[state->worker] = output;
+ shared->workersFinished++;
+ SpinLockRelease(&shared->mutex);
+}
+
+/*
+ * worker_nomergeruns - dump memtuples in worker, without merging
+ *
+ * This is called as an alternative to mergeruns() with a worker when no
+ * merging is required.
+ */
+static void
+worker_nomergeruns(Tuplesortstate *state)
+{
+ Assert(WORKER(state));
+ Assert(state->result_tape == NULL);
+ Assert(state->nOutputRuns == 1);
+
+ state->result_tape = state->destTape;
+ worker_freeze_result_tape(state);
+}
+
+/*
+ * leader_takeover_tapes - create tapeset for leader from worker tapes
+ *
+ * So far, leader Tuplesortstate has performed no actual sorting. By now, all
+ * sorting has occurred in workers, all of which must have already returned
+ * from tuplesort_performsort().
+ *
+ * When this returns, leader process is left in a state that is virtually
+ * indistinguishable from it having generated runs as a serial external sort
+ * might have.
+ */
+static void
+leader_takeover_tapes(Tuplesortstate *state)
+{
+ Sharedsort *shared = state->shared;
+ int nParticipants = state->nParticipants;
+ int workersFinished;
+ int j;
+
+ Assert(LEADER(state));
+ Assert(nParticipants >= 1);
+
+ SpinLockAcquire(&shared->mutex);
+ workersFinished = shared->workersFinished;
+ SpinLockRelease(&shared->mutex);
+
+ if (nParticipants != workersFinished)
+ elog(ERROR, "cannot take over tapes before all workers finish");
+
+ /*
+ * Create the tapeset from worker tapes, including a leader-owned tape at
+ * the end. Parallel workers are far more expensive than logical tapes,
+ * so the number of tapes allocated here should never be excessive.
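+ *
+ * (Editor's illustration: with 8 finished workers the leader imports 8
+ * single-run tapes; unless a materialized, randomly accessible result is
+ * required, mergeruns() then handles them in one final on-the-fly pass.)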
+ */ + inittapestate(state, nParticipants); + state->tapeset = LogicalTapeSetCreate(false, &shared->fileset, -1); + + /* + * Set currentRun to reflect the number of runs we will merge (it's not + * used for anything, this is just pro forma) + */ + state->currentRun = nParticipants; + + /* + * Initialize the state to look the same as after building the initial + * runs. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + state->inputTapes = NULL; + state->nInputTapes = 0; + state->nInputRuns = 0; + + state->outputTapes = palloc0(nParticipants * sizeof(LogicalTape *)); + state->nOutputTapes = nParticipants; + state->nOutputRuns = nParticipants; + + for (j = 0; j < nParticipants; j++) + { + state->outputTapes[j] = LogicalTapeImport(state->tapeset, j, &shared->tapes[j]); + } + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} + +int +ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup) +{ + if (x < y) + return -1; + else if (x > y) + return 1; + else + return 0; +} + +#if SIZEOF_DATUM >= 8 +int +ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup) +{ + int64 xx = DatumGetInt64(x); + int64 yy = DatumGetInt64(y); + + if (xx < yy) + return -1; + else if (xx > yy) + return 1; + else + return 0; +} +#endif + +int +ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup) +{ + int32 xx = DatumGetInt32(x); + int32 yy = DatumGetInt32(y); + + if (xx < yy) + return -1; + else if (xx > yy) + return 1; + else + return 0; +} diff --git a/src/tuplesort96.c b/src/tuplesort96.c new file mode 100644 index 0000000000..743e025b86 --- /dev/null +++ b/src/tuplesort96.c @@ -0,0 +1,4838 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we only do that + * for the first run, and only if the run would otherwise end up being very + * short. We merge the runs using polyphase merge, Knuth's Algorithm + * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by + * logtape.c, which avoids space wastage by recycling disk space as soon + * as each block is read from its "tape". + * + * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for + * the replacement selection, because it uses a fixed number of records + * in memory at all times. 
Since we are dealing with tuples that may vary + * considerably in size, we want to be able to vary the number of records + * kept in memory to ensure full utilization of the allowed sort memory + * space. So, we keep the tuples in a variable-size heap, with the next + * record to go out at the top of the heap. Like Algorithm 5.4.1R, each + * record is stored with the run number that it must go into, and we use + * (run number, key) as the ordering key for the heap. When the run number + * at the top of the heap changes, we know that no more records of the prior + * run are left in the heap. Note that there are in practice only ever two + * distinct run numbers, because since PostgreSQL 9.6, we only use + * replacement selection to form the first run. + * + * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small + * customizations) is only used with the aim of producing just one run, + * thereby avoiding all merging. Only the first run can use replacement + * selection, which is why there are now only two possible valid run + * numbers, and why heapification is customized to not distinguish between + * tuples in the second run (those will be quicksorted). We generally + * prefer a simple hybrid sort-merge strategy, where runs are sorted in much + * the same way as the entire input of an internal sort is sorted (using + * qsort()). The replacement_sort_tuples GUC controls the limited remaining + * use of replacement selection for the first run. + * + * There are several reasons to favor a hybrid sort-merge strategy. + * Maintaining a priority tree/heap has poor CPU cache characteristics. + * Furthermore, the growth in main memory sizes has greatly diminished the + * value of having runs that are larger than available memory, even in the + * case where there is partially sorted input and runs can be made far + * larger by using a heap. In most cases, a single-pass merge step is all + * that is required even when runs are no larger than available memory. + * Avoiding multiple merge passes was traditionally considered to be the + * major advantage of using replacement selection. + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run + * (or two, when replacement selection is still used), then merge the runs + * using Algorithm D. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and insert the + * next tuple from its source tape (if any). When the heap empties, the merge + * is complete. The basic merge algorithm thus needs very little memory --- + * only M tuples for an M-way merge, and M is constrained to a small number. + * However, we can still make good use of our full workMem allocation by + * pre-reading additional tuples from each source tape. 
Without prereading, + * our access pattern to the temporary file would be very erratic; on average + * we'd read one block from each of M source tapes during the same time that + * we're writing M blocks to the output tape, so there is no sequentiality of + * access at all, defeating the read-ahead methods used by most Unix kernels. + * Worse, the output tape gets written into a very random sequence of blocks + * of the temp file, ensuring that things will be even worse when it comes + * time to read that tape. A straightforward merge pass thus ends up doing a + * lot of waiting for disk seeks. We can improve matters by prereading from + * each source tape sequentially, loading about workMem/M bytes from each tape + * in turn. Then we run the merge algorithm, writing but not reading until + * one of the preloaded tuple series runs out. Then we switch back to preread + * mode, fill memory again, and repeat. This approach helps to localize both + * read and write accesses. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes. 
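
The tape-count policy this paragraph describes lives in tuplesort_merge_order(): roughly, workMem is divided by the per-tape buffer costs and clamped to a minimum merge order. A standalone approximation, assuming the constants defined further down in this file and the usual 8192-byte BLCKSZ:

#include <stdio.h>

#define BLCKSZ 8192
#define MINORDER 6                      /* minimum merge order */
#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)
#define MERGE_BUFFER_SIZE (BLCKSZ * 32)

static int
merge_order_sketch(long allowedMemBytes)
{
    long m = (allowedMemBytes - TAPE_BUFFER_OVERHEAD) /
             (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);

    return m < MINORDER ? MINORDER : (int) m;
}

int
main(void)
{
    long kb[] = {64, 1024, 4096, 65536};

    for (int i = 0; i < 4; i++)
        printf("workMem=%ldkB -> merge order ~%d\n",
               kb[i], merge_order_sketch(kb[i] * 1024L));
    return 0;
}
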
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *    src/backend/utils/sort/tuplesort.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/tablespace.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/datum.h"
+#include "utils/logtape.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+/* Should be the last include */
+#include "disable_core_macro.h"
+
+/* sort-type codes for sort__start probes */
+#define HEAP_SORT       0
+#define INDEX_SORT      1
+#define DATUM_SORT      2
+#define CLUSTER_SORT    3
+
+/* GUC variables */
+#ifdef TRACE_SORT
+bool        trace_sort = false;
+#endif
+
+#ifdef DEBUG_BOUNDED_SORT
+bool        optimize_bounded_sort = true;
+#endif
+
+
+/*
+ * The objects we actually sort are SortTuple structs.  These contain
+ * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
+ * which is a separate palloc chunk --- we assume it is just one chunk and
+ * can be freed by a simple pfree() (except during final on-the-fly merge,
+ * when memory is used in batch).  SortTuples also contain the tuple's
+ * first key column in Datum/nullflag format, and an index integer.
+ *
+ * Storing the first key column lets us save heap_getattr or index_getattr
+ * calls during tuple comparisons.  We could extract and save all the key
+ * columns not just the first, but this would increase code complexity and
+ * overhead, and wouldn't actually save any comparison cycles in the common
+ * case where the first key determines the comparison result.  Note that
+ * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
+ *
+ * There is one special case: when the sort support infrastructure provides an
+ * "abbreviated key" representation, where the key is (typically) a pass by
+ * value proxy for a pass by reference type.  In this case, the abbreviated key
+ * is stored in datum1 in place of the actual first key column.
+ *
+ * When sorting single Datums, the data value is represented directly by
+ * datum1/isnull1 for pass by value types (or null values).  If the datatype is
+ * pass-by-reference and isnull1 is false, then "tuple" points to a separately
+ * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
+ * either the same pointer as "tuple", or is an abbreviated key value as
+ * described above.  Accordingly, "tuple" is always used in preference to
+ * datum1 as the authoritative value for pass-by-reference cases.
+ *
+ * While building initial runs, tupindex holds the tuple's run number.
+ * Historically, the run number could meaningfully distinguish many runs, but
+ * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
+ * selection is always abandoned after the first run; no other run number
+ * should be represented here.  During merge passes, we re-use it to hold the
+ * input tape number that each tuple in the heap was read from, or to hold the
+ * index of the next tuple pre-read from the same tape in the case of pre-read
+ * entries.  tupindex goes unused if the sort occurs entirely in memory.
+ */
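
The (run number, key) ordering described above behaves like a two-level comparator: run number first, then key, so any tuple assigned to the next run sorts after every tuple of the current one. A minimal standalone illustration over stub tuples; this is not the module's actual heap comparison:

#include <stdio.h>
#include <stdlib.h>

struct stub_tuple { int run; int key; };

static int
run_then_key_cmp(const void *a, const void *b)
{
    const struct stub_tuple *x = a;
    const struct stub_tuple *y = b;

    if (x->run != y->run)               /* next-run tuples sort last */
        return (x->run > y->run) - (x->run < y->run);
    return (x->key > y->key) - (x->key < y->key);
}

int
main(void)
{
    struct stub_tuple t[] = {{1, 3}, {0, 9}, {1, 1}, {0, 2}};

    qsort(t, 4, sizeof(t[0]), run_then_key_cmp);
    for (int i = 0; i < 4; i++)
        printf("(run=%d, key=%d)\n", t[i].run, t[i].key);
    return 0;
}
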
+typedef struct
+{
+    void       *tuple;          /* the tuple itself */
+    Datum       datum1;         /* value of first key column */
+    bool        isnull1;        /* is first key column NULL? */
+    int         tupindex;       /* see notes above */
+} SortTuple;
+
+
+/*
+ * Possible states of a Tuplesort object.  These denote the states that
+ * persist between calls of Tuplesort routines.
+ */
+typedef enum
+{
+    TSS_INITIAL,                /* Loading tuples; still within memory limit */
+    TSS_BOUNDED,                /* Loading tuples into bounded-size heap */
+    TSS_BUILDRUNS,              /* Loading tuples; writing to tape */
+    TSS_SORTEDINMEM,            /* Sort completed entirely in memory */
+    TSS_SORTEDONTAPE,           /* Sort completed, final run is on tape */
+    TSS_FINALMERGE              /* Performing final merge on-the-fly */
+} TupSortStatus;
+
+/*
+ * Parameters for calculation of number of tapes to use --- see inittapes()
+ * and tuplesort_merge_order().
+ *
+ * In this calculation we assume that each tape will cost us about 3 blocks
+ * worth of buffer space (which is an underestimate for very large data
+ * volumes, but it's probably close enough --- see logtape.c).
+ *
+ * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
+ * tape during a preread cycle (see discussion at top of file).
+ */
+#define MINORDER        6       /* minimum merge order */
+#define TAPE_BUFFER_OVERHEAD        (BLCKSZ * 3)
+#define MERGE_BUFFER_SIZE           (BLCKSZ * 32)
+
+ /*
+  * Run numbers, used during external sort operations.
+  *
+  * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun.
+  */
+#define RUN_FIRST       0
+#define HEAP_RUN_NEXT   INT_MAX
+#define RUN_SECOND      1
+
+typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
+                                    Tuplesortstate *state);
+
+/*
+ * Private state of a Tuplesort operation.
+ */
+struct Tuplesortstate
+{
+    TupSortStatus status;       /* enumerated value as shown above */
+    int         nKeys;          /* number of columns in sort key */
+    bool        randomAccess;   /* did caller request random access? */
+    bool        bounded;        /* did caller specify a maximum number of
+                                 * tuples to return? */
+    bool        boundUsed;      /* true if we made use of a bounded heap */
+    int         bound;          /* if bounded, the maximum number of tuples */
+    bool        tuples;         /* Can SortTuple.tuple ever be set? */
+    int64       availMem;       /* remaining memory available, in bytes */
+    int64       allowedMem;     /* total memory allowed, in bytes */
+    int         maxTapes;       /* number of tapes (Knuth's T) */
+    int         tapeRange;      /* maxTapes-1 (Knuth's P) */
+    MemoryContext sortcontext;  /* memory context holding most sort data */
+    MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
+    LogicalTapeSet *tapeset;    /* logtape.c object for tapes in a temp file */
+
+    /*
+     * These function pointers decouple the routines that must know what kind
+     * of tuple we are sorting from the routines that don't need to know it.
+     * They are set up by the tuplesort_begin_xxx routines.
+     *
+     * Function to compare two tuples; result is per qsort() convention, ie:
+     * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
+     * qsort_arg_comparator.
+     */
+    SortTupleComparator comparetup;
+
+    /*
+     * Function to copy a supplied input tuple into palloc'd space and set up
+     * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
+     * state->availMem must be decreased by the amount of space used for the
+     * tuple copy (note the SortTuple struct itself is not counted).
+     */
+    void        (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
+
+    /*
+     * Function to write a stored tuple onto tape.
The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. After writing the tuple, + * pfree() the out-of-line data (not the SortTuple struct!), and increase + * state->availMem by the amount of memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. Create a palloc'd copy, + * initialize tuple/datum1/isnull1 in the target SortTuple struct, and + * decrease state->availMem by the amount of memory space consumed. (See + * batchUsed notes for details on how memory is handled when incremental + * accounting is abandoned.) + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * Function to move a caller tuple. This is usually implemented as a + * memmove() shim, but function may also perform additional fix-up of + * caller tuple where needed. Batch memory support requires the movement + * of caller tuples from one location in memory to another. + */ + void (*movetup) (void *dest, void *src, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. (Note that memtupcount only counts the tuples that are part of the + * heap --- during merge passes, memtuples[] entries beyond tapeRange are + * never in the heap and are used to hold pre-read tuples.) In state + * SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated in batch, rather than + * incrementally. This implies that incremental memory accounting has + * been abandoned. Currently, this only happens for the final on-the-fly + * merge step. Large batch allocations can store tuples (e.g. + * IndexTuples) without palloc() fragmentation and other overhead. + */ + bool batchUsed; + + /* + * While building initial runs, this indicates if the replacement + * selection strategy is in use. When it isn't, then a simple hybrid + * sort-merge strategy is in use instead (runs are quicksorted). + */ + bool replaceActive; + + /* + * While building initial runs, this is the current output run number + * (starting at RUN_FIRST). Afterwards, it is the number of initial runs + * we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * These variables are only used during merge passes. mergeactive[i] is + * true if we are reading an input run from (actual) tape number i and + * have not yet exhausted that run. mergenext[i] is the memtuples index + * of the next pre-read tuple (next to be loaded into the heap) for tape + * i, or 0 if we are out of pre-read tuples. mergelast[i] similarly + * points to the last pre-read tuple from each tape. 
mergeavailslots[i] + * is the number of unused memtuples[] slots reserved for tape i, and + * mergeavailmem[i] is the amount of unused space allocated for tape i. + * mergefreelist and mergefirstfree keep track of unused locations in the + * memtuples[] array. The memtuples[].tupindex fields link together + * pre-read tuples for each tape as well as recycled locations in + * mergefreelist. It is OK to use 0 as a null link in these lists, because + * memtuples[0] is part of the merge heap and is never a pre-read tuple. + */ + bool *mergeactive; /* active input run source? */ + int *mergenext; /* first preread tuple for each source */ + int *mergelast; /* last preread tuple for each source */ + int *mergeavailslots; /* slots left for prereading each tape */ + int64 *mergeavailmem; /* availMem for prereading each tape */ + int mergefreelist; /* head of freelist of recycled slots */ + int mergefirstfree; /* first slot never used in this merge */ + + /* + * Per-tape batch state, when final on-the-fly merge consumes memory from + * just a few large allocations. + * + * Aside from the general benefits of performing fewer individual retail + * palloc() calls, this also helps make merging more cache efficient, + * since each tape's tuples must naturally be accessed sequentially (in + * sorted order). + */ + int64 spacePerTape; /* Space (memory) for tuples (not slots) */ + char **mergetuples; /* Each tape's memory allocation */ + char **mergecurrent; /* Current offset into each tape's memory */ + char **mergetail; /* Last item's start point for each tape */ + char **mergeoverflow; /* Retail palloc() "overflow" for each tape */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. 
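
In short, an abbreviated-key comparison first compares the cheap fixed-size proxies held in datum1 and falls through to the authoritative full comparator only on ties. A standalone sketch using a naive pack-the-first-8-bytes abbreviation; real opclasses supply their own converters, so treat the names and packing here as illustrative only:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint64_t
abbrev_stub(const char *s)
{
    uint64_t v = 0;

    for (int i = 0; i < 8 && s[i] != '\0'; i++)
        v |= (uint64_t) (unsigned char) s[i] << (56 - 8 * i);
    return v;
}

static int
abbrev_cmp_stub(const char *a, const char *b)
{
    uint64_t aa = abbrev_stub(a);
    uint64_t bb = abbrev_stub(b);

    if (aa != bb)
        return (aa > bb) - (aa < bb);   /* resolved from datum1 alone */
    return strcmp(a, b);                /* tie: fall back to full comparator */
}

int
main(void)
{
    printf("alpha vs beta: %d\n", abbrev_cmp_stub("alpha", "beta"));
    printf("postgres vs postgresql: %d\n",
           abbrev_cmp_stub("postgres", "postgresql"));  /* 8-byte tie */
    return 0;
}
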
+ */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 hash_mask; /* mask for sortable part of hash code */ + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->batchUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. 
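
The layout those rules prescribe can be shown with a toy serializer: a leading unsigned int counting itself plus the data (so it is never zero), the data itself, and, only when random access is required, a trailing copy of the length word. This is a standalone sketch, not the module's writetup routines:

#include <stdio.h>
#include <string.h>

static size_t
writetup_stub(char *buf, const char *data, unsigned int datalen,
              int randomAccess)
{
    unsigned int total = (unsigned int) sizeof(unsigned int) + datalen;
    char        *p = buf;

    memcpy(p, &total, sizeof(total));   /* leading length word (never 0) */
    p += sizeof(total);
    memcpy(p, data, datalen);           /* tuple body */
    p += datalen;
    if (randomAccess)
    {
        memcpy(p, &total, sizeof(total));   /* trailing copy: read-backwards */
        p += sizeof(total);
    }
    return (size_t) (p - buf);
}

int
main(void)
{
    char buf[64];

    printf("forward-only: %zu bytes on tape\n", writetup_stub(buf, "abc", 3, 0));
    printf("random-access: %zu bytes on tape\n", writetup_stub(buf, "abc", 3, 1));
    return 0;
}
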
+ * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static bool useselection(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state); +static void selectnewtape(Tuplesortstate *state); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state, bool finalMergeBatch); +static void batchmemtuples(Tuplesortstate *state); +static void mergebatch(Tuplesortstate *state, int64 spacePerTape); +static void mergebatchone(Tuplesortstate *state, int srcTape, + SortTuple *stup, bool *should_free); +static void mergebatchfreetape(Tuplesortstate *state, int srcTape, + SortTuple *rtup, bool *should_free); +static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen); +static void mergepreread(Tuplesortstate *state); +static void mergeprereadone(Tuplesortstate *state, int srcTape); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void dumpbatch(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + int tupleindex, bool checkIndex); +static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_heap(void *dest, void *src, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void 
readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_cluster(void *dest, void *src, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_index(void *dest, void *src, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_datum(void *dest, void *src, unsigned int len); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. 
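
The calling sequence just described, written out for a single-column int4 datum sort. This is a schematic that assumes a backend environment and the 9.6-era API vendored in this file (in particular the four-argument tuplesort_getdatum); exact signatures differ across the PostgreSQL versions this module tracks:

#include "postgres.h"

#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "utils/tuplesort.h"

static void
datum_sort_example(void)
{
    Tuplesortstate *ts;
    Datum       val;
    bool        isnull;

    /* begin: one int4 column, ascending, no random access needed */
    ts = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
                               false, work_mem, false);

    /* put: feed unsorted input */
    tuplesort_putdatum(ts, Int32GetDatum(3), false);
    tuplesort_putdatum(ts, Int32GetDatum(1), false);
    tuplesort_putdatum(ts, Int32GetDatum(2), false);

    /* performsort: no more input; sort, spilling to tape if needed */
    tuplesort_performsort(ts);

    /* get: drain in ascending order (1, 2, 3) */
    while (tuplesort_getdatum(ts, true, &val, &isnull))
        elog(LOG, "next: %d", DatumGetInt32(val));

    /* end: release memory and any temp files */
    tuplesort_end(ts);
}
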
+ */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + state->allowedMem = workMem * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->batchUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = RUN_FIRST; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + state->movetup = movetup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->movetup = movetup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->movetup = movetup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey_nodata(indexRel); + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 hash_mask, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c", + hash_mask, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->movetup = movetup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->hash_mask = hash_mask; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f');
+#endif
+
+    state->nKeys = 1;           /* always a one-column sort */
+
+    TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
+                                false,  /* no unique check */
+                                1,
+                                workMem,
+                                randomAccess);
+
+    state->comparetup = comparetup_datum;
+    state->copytup = copytup_datum;
+    state->writetup = writetup_datum;
+    state->readtup = readtup_datum;
+    state->movetup = movetup_datum;
+    state->abbrevNext = 10;
+
+    state->datumType = datumType;
+
+    /* lookup necessary attributes of the datum type */
+    get_typlenbyval(datumType, &typlen, &typbyval);
+    state->datumTypeLen = typlen;
+    state->tuples = !typbyval;
+
+    /* Prepare SortSupport data */
+    state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
+
+    state->sortKeys->ssup_cxt = CurrentMemoryContext;
+    state->sortKeys->ssup_collation = sortCollation;
+    state->sortKeys->ssup_nulls_first = nullsFirstFlag;
+
+    /*
+     * Abbreviation is possible here only for by-reference types.  In theory,
+     * a pass-by-value datatype could have an abbreviated form that is cheaper
+     * to compare.  In a tuple sort, we could support that, because we can
+     * always extract the original datum from the tuple if needed.  Here, we
+     * can't, because a datum sort only stores a single copy of the datum; the
+     * "tuple" field of each sortTuple is NULL.
+     */
+    state->sortKeys->abbreviate = !typbyval;
+
+    PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
+
+    /*
+     * The "onlyKey" optimization cannot be used with abbreviated keys, since
+     * tie-breaker comparisons may be required.  Typically, the optimization
+     * is only of value to pass-by-value types anyway, whereas abbreviated
+     * keys are typically only of value to pass-by-reference types.
+     */
+    if (!state->sortKeys->abbrev_converter)
+        state->onlyKey = state->sortKeys;
+
+    MemoryContextSwitchTo(oldcontext);
+
+    return state;
+}
+
+/*
+ * tuplesort_set_bound
+ *
+ *    Advise tuplesort that at most the first N result tuples are required.
+ *
+ * Must be called before inserting any tuples.  (Actually, we could allow it
+ * as long as the sort hasn't spilled to disk, but there seems no need for
+ * delayed calls at the moment.)
+ *
+ * This is a hint only.  The tuplesort may still return more tuples than
+ * requested.
+ */
+void
+tuplesort_set_bound(Tuplesortstate *state, int64 bound)
+{
+    /* Assert we're called before loading any tuples */
+    Assert(state->status == TSS_INITIAL);
+    Assert(state->memtupcount == 0);
+    Assert(!state->bounded);
+
+#ifdef DEBUG_BOUNDED_SORT
+    /* Honor GUC setting that disables the feature (for easy testing) */
+    if (!optimize_bounded_sort)
+        return;
+#endif
+
+    /* We want to be able to compute bound * 2, so limit the setting */
+    if (bound > (int64) (INT_MAX / 2))
+        return;
+
+    state->bounded = true;
+    state->bound = (int) bound;
+
+    /*
+     * Bounded sorts are not an effective target for abbreviated key
+     * optimization.  Disable by setting state to be consistent with no
+     * abbreviation support.
+     */
+    state->sortKeys->abbrev_converter = NULL;
+    if (state->sortKeys->abbrev_full_comparator)
+        state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+    /* Not strictly necessary, but be tidy */
+    state->sortKeys->abbrev_abort = NULL;
+    state->sortKeys->abbrev_full_comparator = NULL;
+}
+
+/*
+ * tuplesort_end
+ *
+ *    Release resources and clean up.
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage.  Be careful not to attempt to use or free such
+ * pointers afterwards!
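
The INT_MAX / 2 clamp above exists because the bounded-sort heuristic later compares memtupcount, an int, against bound * 2; without the clamp that product could overflow. A small standalone demonstration of the guard:

#include <stdio.h>
#include <limits.h>
#include <stdint.h>

int
main(void)
{
    int64_t bounds[] = {100, (int64_t) INT_MAX / 2 + 1};

    for (int i = 0; i < 2; i++)
    {
        int64_t bound = bounds[i];

        if (bound > (int64_t) (INT_MAX / 2))
            printf("bound %lld: too large, sort stays unbounded\n",
                   (long long) bound);
        else
            printf("bound %lld: heapsort kicks in past %lld tuples\n",
                   (long long) bound, (long long) bound * 2);
    }
    return 0;
}
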
+ */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "external sort ended, %ld disk blocks used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "internal sort ended, %ld KB used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return TRUE if we were able to enlarge the array, FALSE if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. 
In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
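
grow_memtuples() above follows a two-phase policy: keep doubling while no more than half of allowedMem is in use, then make one final increase proportional to the remaining budget and stop. A standalone numeric sketch, assuming a 4 MB budget and an assumed average per-tuple footprint:

#include <stdio.h>

int
main(void)
{
    double allowedMem = 4.0 * 1024 * 1024;  /* 4 MB budget (stand-in) */
    double perSlot = 24.0 + 200.0;          /* slot plus observed tuple bytes */
    long   memtupsize = 1024;               /* initial array length */
    int    growmemtuples = 1;

    while (growmemtuples)
    {
        double memNowUsed = memtupsize * perSlot;

        if (memNowUsed <= allowedMem / 2)
            memtupsize *= 2;                /* doubling phase */
        else
        {
            /* final, proportional increment; then stop growing */
            double grow_ratio = allowedMem / memNowUsed;

            memtupsize = (long) (memtupsize * grow_ratio);
            growmemtuples = 0;
        }
        printf("memtupsize -> %ld\n", memtupsize);
    }
    return 0;
}
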
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
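
The datum1/isnull1/tuple convention used by the datum routines, reduced to plain C: pass-by-value data lives directly in datum1, while pass-by-reference data is copied and datum1 merely aliases the copy. The _stub names are illustrative stand-ins:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

typedef uintptr_t DatumStub;

struct sorttuple_stub
{
    void     *tuple;            /* separately allocated copy, or NULL */
    DatumStub datum1;
    int       isnull1;
};

static struct sorttuple_stub
put_int(long v)                 /* pass-by-value case */
{
    struct sorttuple_stub s = {NULL, (DatumStub) v, 0};

    return s;
}

static struct sorttuple_stub
put_text(const char *v)         /* pass-by-reference case */
{
    struct sorttuple_stub s;

    s.tuple = strdup(v);        /* caller's value is copied */
    s.datum1 = (DatumStub) s.tuple;
    s.isnull1 = 0;
    return s;
}

int
main(void)
{
    struct sorttuple_stub a = put_int(42);
    struct sorttuple_stub b = put_text("hello");

    printf("by-value: tuple=%p datum1=%lu\n", a.tuple,
           (unsigned long) a.datum1);
    printf("by-ref:   tuple=%p (datum1 aliases it)\n", b.tuple);
    free(b.tuple);
    return 0;
}
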
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
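
The bounded-heap strategy in that paragraph is an ordinary top-N filter: keep a max-heap of the N smallest tuples seen so far, and reject any newcomer that does not beat the heap top at the cost of a single comparison (the real code tests <= because the sort direction is reversed while the heap is bounded). A standalone sketch for N = 3 over ints:

#include <stdio.h>

#define BOUND 3

static int heap[BOUND];
static int heapsize = 0;

static void
sift_down(void)
{
    int i = 0;

    for (;;)
    {
        int l = 2 * i + 1;
        int r = l + 1;
        int big = i;

        if (l < heapsize && heap[l] > heap[big])
            big = l;
        if (r < heapsize && heap[r] > heap[big])
            big = r;
        if (big == i)
            return;
        {
            int tmp = heap[i];

            heap[i] = heap[big];
            heap[big] = tmp;
        }
        i = big;
    }
}

static void
offer(int v)
{
    if (heapsize < BOUND)
    {
        int i = heapsize++;

        heap[i] = v;
        while (i > 0 && heap[i] > heap[(i - 1) / 2])    /* sift up */
        {
            int tmp = heap[i];

            heap[i] = heap[(i - 1) / 2];
            heap[(i - 1) / 2] = tmp;
            i = (i - 1) / 2;
        }
    }
    else if (v < heap[0])       /* beats the worst of the current best-N */
    {
        heap[0] = v;
        sift_down();
    }
    /* else: discarded after a single comparison */
}

int
main(void)
{
    int in[] = {9, 4, 7, 1, 8, 3, 6};

    for (int i = 0; i < 7; i++)
        offer(in[i]);
    for (int i = 0; i < heapsize; i++)
        printf("%d ", heap[i]);     /* the 3 smallest, in heap order */
    printf("\n");
    return 0;
}
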
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state); + + /* + * Dump tuples until we are back under the limit. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, sift up, insert new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_siftup(state, false); + tuplesort_heap_insert(state, tuple, 0, false); + } + break; + + case TSS_BUILDRUNS: + + /* + * Insert the tuple into the heap, with run number currentRun if + * it can go into the current run, else HEAP_RUN_NEXT. The tuple + * can go into the current run if it is >= the first + * not-yet-output tuple. (Actually, it could go into the current + * run if it is >= the most recently output tuple ... but that + * would require keeping around the tuple we last output, and it's + * simplest to let writetup free each tuple as soon as it's + * written.) + * + * Note that this only applies when: + * + * - currentRun is RUN_FIRST + * + * - Replacement selection is in use (typically it is never used). + * + * When these two conditions are not both true, all tuples are + * appended indifferently, much like the TSS_INITIAL case. + * + * There should always be room to store the incoming tuple. + */ + Assert(!state->replaceActive || state->memtupcount > 0); + if (state->replaceActive && + COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) + { + Assert(state->currentRun == RUN_FIRST); + + /* + * Insert tuple into first, fully heapified run. + * + * Unlike classic replacement selection, which this module was + * previously based on, only RUN_FIRST tuples are fully + * heapified. Any second/next run tuples are appended + * indifferently. While HEAP_RUN_NEXT tuples may be sifted + * out of the way of first run tuples, COMPARETUP() will never + * be called for the run's tuples during sifting (only our + * initial COMPARETUP() call is required for the tuple, to + * determine that the tuple does not belong in RUN_FIRST). + */ + tuplesort_heap_insert(state, tuple, state->currentRun, true); + } + else + { + /* + * Tuple was determined to not belong to heapified RUN_FIRST, + * or replacement selection not in play. Append the tuple to + * memtuples indifferently. + * + * dumptuples() does not trust that the next run's tuples are + * heapified. Anything past the first run will always be + * quicksorted even when replacement selection is initially + * used. 
(When it's never used, every tuple still takes this + * path.) + */ + tuple->tupindex = HEAP_RUN_NEXT; + state->memtuples[state->memtupcount++] = *tuple; + } + + /* + * If we are over the memory limit, dump tuples till we're under. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort starting: %s", + pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory. Just qsort 'em and we're done. + */ + tuplesort_sort_memtuples(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess, one run per tape). Note that + * mergeruns sets the correct state->status. 
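+			 * (randomAccess callers need the result materialized on a
+			 * single tape so that mark/restore and rescan can reposition
+			 * within it; see tuplesort_markpos() and friends below.)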
+ */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort done (except %d-way final merge): %s", + state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort done: %s", + pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns FALSE if no more tuples. + * If *should_free is set, the caller must pfree stup.tuple when done with it. + * Otherwise, caller should not use tuple following next call here. + * + * Note: Public tuplesort fetch routine callers cannot rely on tuple being + * allocated in their own memory context when should_free is TRUE. It may be + * necessary to create a new copy of the tuple to meet the requirements of + * public fetch routine callers. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup, bool *should_free) +{ + unsigned int tuplen; + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->batchUsed); + *should_free = false; + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(!state->batchUsed); + *should_free = true; + if (forward) + { + if (state->eof_reached) + return false; + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int))) + return false; + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. 
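+				 * (This works because, for random-access sorts, each
+				 * tuple on tape is both preceded and followed by its
+				 * length word, so the scan can backspace over either.)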
+ */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int))) + return false; + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int))) + { + /* + * If that fails, presumably the prev tuple is the first + * in the file. Back up so that it becomes next to read + * in forward direction (not obviously right, but that is + * what in-memory case does). + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + sizeof(unsigned int))) + elog(ERROR, "bogus tuple length in backward scan"); + return false; + } + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen)) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + return true; + + case TSS_FINALMERGE: + Assert(forward); + Assert(state->batchUsed || !state->tuples); + /* For now, assume tuple is stored in tape's batch memory */ + *should_free = false; + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + int tupIndex; + SortTuple *newtup; + + /* + * Returned tuple is still counted in our memory space most of + * the time. See mergebatchone() for discussion of why caller + * may occasionally be required to free returned tuple, and + * how preread memory is managed with regard to edge cases + * more generally. + */ + *stup = state->memtuples[0]; + tuplesort_heap_siftup(state, false); + if ((tupIndex = state->mergenext[srcTape]) == 0) + { + /* + * out of preloaded data on this tape, try to read more + * + * Unlike mergeonerun(), we only preload from the single + * tape that's run dry, though not before preparing its + * batch memory for a new round of sequential consumption. + * See mergepreread() comments. + */ + if (state->batchUsed) + mergebatchone(state, srcTape, stup, should_free); + + mergeprereadone(state, srcTape); + + /* + * if still no data, we've reached end of run on this tape + */ + if ((tupIndex = state->mergenext[srcTape]) == 0) + { + /* Free tape's buffer, avoiding dangling pointer */ + if (state->batchUsed) + mergebatchfreetape(state, srcTape, stup, should_free); + return true; + } + } + /* pull next preread tuple from list, insert in heap */ + newtup = &state->memtuples[tupIndex]; + state->mergenext[srcTape] = newtup->tupindex; + if (state->mergenext[srcTape] == 0) + state->mergelast[srcTape] = 0; + tuplesort_heap_insert(state, newtup, srcTape, false); + /* put the now-unused memtuples entry on the freelist */ + newtup->tupindex = state->mergefreelist; + state->mergefreelist = tupIndex; + state->mergeavailslots[srcTape]++; + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return TRUE; else, clear the slot + * and return FALSE. 
+ * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * The slot receives a tuple that's been copied into the caller's memory + * context, so that it will stay valid regardless of future manipulations of + * the tuplesort's state (up to and including deleting the tuplesort). + * This differs from similar routines for other types of tuplesorts. + */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + bool should_free; + + if (!tuplesort_gettuple_common(state, forward, &stup, &should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + /* + * Callers rely on tuple being in their own memory context, which is + * not guaranteed by tuplesort_gettuple_common(), even when should_free + * is set to TRUE. We must always copy here, since our interface does + * not allow callers to opt into arrangement where tuple memory can go + * away on the next call here, or after tuplesort_end() is called. + */ + ExecStoreMinimalTuple(heap_copy_minimal_tuple((MinimalTuple) stup.tuple), + slot, true); + + /* + * Free local copy if needed. It would be very invasive to get + * tuplesort_gettuple_common() to allocate tuple in caller's context + * for us, so we just do this instead. + */ + if (should_free) + pfree(stup.tuple); + + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. If *should_free is set, the + * caller must pfree the returned tuple when done with it. + * If it is not set, caller should not use tuple following next + * call here. It's never okay to use it after tuplesort_end(). + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup, should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. If *should_free is set, the + * caller must pfree the returned tuple when done with it. + * If it is not set, caller should not use tuple following next + * call here. It's never okay to use it after tuplesort_end(). + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward, + bool *should_free) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup, should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns FALSE if no more datums. 
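+ *
+ * A typical caller loop looks like this (sketch only; process_datum is a
+ * stand-in for the caller's own code):
+ *
+ *		Datum	val;
+ *		bool	isnull;
+ *
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
+ *			process_datum(val, isnull);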
+ * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. + */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + bool should_free; + + if (!tuplesort_gettuple_common(state, forward, &stup, &should_free)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* + * Callers rely on datum being in their own memory context, which is + * not guaranteed by tuplesort_gettuple_common(), even when should_free + * is set to TRUE. We must always copy here, since our interface does + * not allow callers to opt into arrangement where tuple memory can go + * away on the next call here, or after tuplesort_end() is called. + * + * Use stup.tuple because stup.datum1 may be an abbreviation. + */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + + /* + * Free local copy if needed. It would be very invasive to get + * tuplesort_gettuple_common() to allocate tuple in caller's context + * for us, so we just do this instead. + */ + if (should_free) + pfree(stup.tuple); + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns TRUE if successful, FALSE if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. 
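+			 * As-is, skipping N tuples on tape costs N reads: the loop
+			 * below simply fetches and discards each one in turn.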
+ */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + bool should_free; + + if (!tuplesort_gettuple_common(state, forward, + &stup, &should_free)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + if (should_free && stup.tuple) + pfree(stup.tuple); + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* Even in minimum memory, use at least a MINORDER merge */ + mOrder = Max(mOrder, MINORDER); + + return mOrder; +} + +/* + * useselection - determine algorithm to use to sort first run. + * + * It can sometimes be useful to use the replacement selection algorithm if it + * results in one large run, and there is little available workMem. See + * remarks on RUN_SECOND optimization within dumptuples(). + */ +static bool +useselection(Tuplesortstate *state) +{ + /* + * memtupsize might be noticeably higher than memtupcount here in atypical + * cases. It seems slightly preferable to not allow recent outliers to + * impact this determination. Note that caller's trace_sort output + * reports memtupcount instead. + */ + if (state->memtupsize <= replacement_sort_tuples) + return true; + + return false; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we don't have room to sort in memory. + */ +static void +inittapes(Tuplesortstate *state) +{ + int maxTapes, + j; + int64 tapeSpace; + + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + + /* + * We must have at least 2*maxTapes slots in the memtuples[] array, else + * we'd not have room for merge heap plus preread. It seems unlikely that + * this case would ever occur, but be safe. + */ + maxTapes = Min(maxTapes, state->memtupsize / 2); + + state->maxTapes = maxTapes; + state->tapeRange = maxTapes - 1; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to external sort with %d tapes: %s", + maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) 
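+	 *
+	 * (Illustration, assuming the usual 8kB BLCKSZ and the 9.6-era
+	 * TAPE_BUFFER_OVERHEAD of three blocks per tape: a 4MB workMem yields
+	 * a merge order of about 14, hence maxTapes = 15, reserving roughly
+	 * 15 * 24kB = 360kB of buffer space here.)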
+ */ + tapeSpace = (int64) maxTapes *TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. + */ + PrepareTempTablespaces(); + + /* + * Create the tape set and allocate the per-tape data arrays. + */ + state->tapeset = LogicalTapeSetCreate(maxTapes); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->mergenext = (int *) palloc0(maxTapes * sizeof(int)); + state->mergelast = (int *) palloc0(maxTapes * sizeof(int)); + state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int)); + state->mergeavailmem = (int64 *) palloc0(maxTapes * sizeof(int64)); + state->mergetuples = (char **) palloc0(maxTapes * sizeof(char *)); + state->mergecurrent = (char **) palloc0(maxTapes * sizeof(char *)); + state->mergetail = (char **) palloc0(maxTapes * sizeof(char *)); + state->mergeoverflow = (char **) palloc0(maxTapes * sizeof(char *)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* + * Give replacement selection a try based on user setting. There will be + * a switch to a simple hybrid sort-merge strategy after the first run + * (iff we could not output one long run). + */ + state->replaceActive = useselection(state); + + if (state->replaceActive) + { + /* + * Convert the unsorted contents of memtuples[] into a heap. Each + * tuple is marked as belonging to run number zero. + * + * NOTE: we pass false for checkIndex since there's no point in + * comparing indexes in this step, even though we do intend the + * indexes to be part of the sort key... + */ + int ntuples = state->memtupcount; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "replacement selection will sort %d first run tuples", + state->memtupcount); +#endif + state->memtupcount = 0; /* make the heap empty */ + + for (j = 0; j < ntuples; j++) + { + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[j]; + + tuplesort_heap_insert(state, &stup, 0, false); + } + Assert(state->memtupcount == ntuples); + } + + state->currentRun = RUN_FIRST; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. This implements steps D3, D4 of Algorithm D. 
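+ *
+ * For example, with maxTapes = 4 (three input tapes) the target run
+ * distribution tp_fib[] evolves level by level as (1,1,1), (2,2,1),
+ * (4,3,2), (7,6,4), ... -- the generalized Fibonacci sequence that
+ * drives Knuth's polyphase merge.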
+ */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * If we produced only one initial run (quite likely if the total data + * volume is between 1X and 2X workMem when replacement selection is used, + * but something we particular count on when input is presorted), we can + * just use that tape as the finished output, rather than doing a useless + * merge. (This obvious optimization is not in Knuth's algorithm.) + */ + if (state->currentRun == RUN_SECOND) + { + state->result_tape = state->tp_tapenum[state->destTape]; + /* must freeze and rewind the finished output tape */ + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + return; + } + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewind(state->tapeset, tapenum, false); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
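+		 * Merging on-the-fly saves an entire write-and-read pass over
+		 * the data compared with materializing the result on tape first.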
+ */ + if (!state->randomAccess) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state, state->tuples); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange], + false); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange - 1], + true); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + int tupIndex; + SortTuple *tup; + int64 priorAvail, + spaceFreed; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. + */ + beginmerge(state, false); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). 
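+	 * As each tuple is written out, the memory it freed is credited back
+	 * to its source tape's mergeavailmem budget, keeping the per-tape
+	 * accounting accurate (see priorAvail/spaceFreed below).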
+ */ + while (state->memtupcount > 0) + { + /* write the tuple to destTape */ + priorAvail = state->availMem; + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + /* writetup adjusted total free space, now fix per-tape space */ + spaceFreed = state->availMem - priorAvail; + state->mergeavailmem[srcTape] += spaceFreed; + /* compact the heap */ + tuplesort_heap_siftup(state, false); + if ((tupIndex = state->mergenext[srcTape]) == 0) + { + /* out of preloaded data on this tape, try to read more */ + mergepreread(state); + /* if still no data, we've reached end of run on this tape */ + if ((tupIndex = state->mergenext[srcTape]) == 0) + continue; + } + /* pull next preread tuple from list, insert in heap */ + tup = &state->memtuples[tupIndex]; + state->mergenext[srcTape] = tup->tupindex; + if (state->mergenext[srcTape] == 0) + state->mergelast[srcTape] = 0; + tuplesort_heap_insert(state, tup, srcTape, false); + /* put the now-unused memtuples entry on the freelist */ + tup->tupindex = state->mergefreelist; + state->mergefreelist = tupIndex; + state->mergeavailslots[srcTape]++; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated, but AllocSetFree will have put those chunks of memory on + * particular free lists, bucketed by size class. Thus, although all of + * that memory is free, it is effectively fragmented. Resetting the + * context gets us out from under that problem. + */ + MemoryContextReset(state->tuplecontext); + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished %d-way merge step: %s", state->activeTapes, + pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, load + * as many tuples as we can from each active input tape, and finally + * fill the merge heap with the first tuple from each active tape. + * + * finalMergeBatch indicates if this is the beginning of a final on-the-fly + * merge where a batched allocation of tuple memory is required. 
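+ *
+ * Preread space is divided evenly among the active tapes: for example,
+ * with 6 active tapes and 960 free memtuples[] slots, each tape gets 160
+ * slots and a 1/6 share of availMem.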
+ */ +static void +beginmerge(Tuplesortstate *state, bool finalMergeBatch) +{ + int activeTapes; + int tapenum; + int srcTape; + int slotsPerTape; + int64 spacePerTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + state->activeTapes = activeTapes; + + /* Clear merge-pass state variables */ + memset(state->mergenext, 0, + state->maxTapes * sizeof(*state->mergenext)); + memset(state->mergelast, 0, + state->maxTapes * sizeof(*state->mergelast)); + state->mergefreelist = 0; /* nothing in the freelist */ + state->mergefirstfree = activeTapes; /* 1st slot avail for preread */ + + if (finalMergeBatch) + { + /* Free outright buffers for tape never actually allocated */ + FREEMEM(state, (state->maxTapes - activeTapes) * TAPE_BUFFER_OVERHEAD); + + /* + * Grow memtuples one last time, since the palloc() overhead no longer + * incurred can make a big difference + */ + batchmemtuples(state); + } + + /* + * Initialize space allocation to let each active input tape have an equal + * share of preread space. + */ + Assert(activeTapes > 0); + slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes; + Assert(slotsPerTape > 0); + spacePerTape = MAXALIGN_DOWN(state->availMem / activeTapes); + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + if (state->mergeactive[srcTape]) + { + state->mergeavailslots[srcTape] = slotsPerTape; + state->mergeavailmem[srcTape] = spacePerTape; + } + } + + /* + * Preallocate tuple batch memory for each tape. This is the memory used + * for tuples themselves (not SortTuples), so it's never used by + * pass-by-value datum sorts. Memory allocation is performed here at most + * once per sort, just in advance of the final on-the-fly merge step. + */ + if (finalMergeBatch) + mergebatch(state, spacePerTape); + + /* + * Preread as many tuples as possible (and at least one) from each active + * tape + */ + mergepreread(state); + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + int tupIndex = state->mergenext[srcTape]; + SortTuple *tup; + + if (tupIndex) + { + tup = &state->memtuples[tupIndex]; + state->mergenext[srcTape] = tup->tupindex; + if (state->mergenext[srcTape] == 0) + state->mergelast[srcTape] = 0; + tuplesort_heap_insert(state, tup, srcTape, false); + /* put the now-unused memtuples entry on the freelist */ + tup->tupindex = state->mergefreelist; + state->mergefreelist = tupIndex; + state->mergeavailslots[srcTape]++; + +#ifdef TRACE_SORT + if (trace_sort && finalMergeBatch) + { + int64 perTapeKB = (spacePerTape + 1023) / 1024; + int64 usedSpaceKB; + int usedSlots; + + /* + * Report how effective batchmemtuples() was in balancing the + * number of slots against the need for memory for the + * underlying tuples (e.g. IndexTuples). 
The big preread of + * all tapes when switching to FINALMERGE state should be + * fairly representative of memory utilization during the + * final merge step, and in any case is the only point at + * which all tapes are guaranteed to have depleted either + * their batch memory allowance or slot allowance. Ideally, + * both will be completely depleted for every tape by now. + */ + usedSpaceKB = (state->mergecurrent[srcTape] - + state->mergetuples[srcTape] + 1023) / 1024; + usedSlots = slotsPerTape - state->mergeavailslots[srcTape]; + + elog(LOG, "tape %d initially used " INT64_FORMAT " KB of " + INT64_FORMAT " KB batch (%2.3f) and %d out of %d slots " + "(%2.3f)", srcTape, + usedSpaceKB, perTapeKB, + (double) usedSpaceKB / (double) perTapeKB, + usedSlots, slotsPerTape, + (double) usedSlots / (double) slotsPerTape); + } +#endif + } + } +} + +/* + * batchmemtuples - grow memtuples without palloc overhead + * + * When called, availMem should be approximately the amount of memory we'd + * require to allocate memtupsize - memtupcount tuples (not SortTuples/slots) + * that were allocated with palloc() overhead, and in doing so use up all + * allocated slots. However, though slots and tuple memory is in balance + * following the last grow_memtuples() call, that's predicated on the observed + * average tuple size for the "final" grow_memtuples() call, which includes + * palloc overhead. During the final merge pass, where we will arrange to + * squeeze out the palloc overhead, we might need more slots in the memtuples + * array. + * + * To make that happen, arrange for the amount of remaining memory to be + * exactly equal to the palloc overhead multiplied by the current size of + * the memtuples array, force the grow_memtuples flag back to true (it's + * probably but not necessarily false on entry to this routine), and then + * call grow_memtuples. This simulates loading enough tuples to fill the + * whole memtuples array and then having some space left over because of the + * elided palloc overhead. We expect that grow_memtuples() will conclude that + * it can't double the size of the memtuples array but that it can increase + * it by some percentage; but if it does decide to double it, that just means + * that we've never managed to use many slots in the memtuples array, in which + * case doubling it shouldn't hurt anything anyway. + */ +static void +batchmemtuples(Tuplesortstate *state) +{ + int64 refund; + int64 availMemLessRefund; + int memtupsize = state->memtupsize; + + /* Caller error if we have no tapes */ + Assert(state->activeTapes > 0); + + /* For simplicity, assume no memtuples are actually currently counted */ + Assert(state->memtupcount == 0); + + /* + * Refund STANDARDCHUNKHEADERSIZE per tuple. + * + * This sometimes fails to make memory use perfectly balanced, but it + * should never make the situation worse. Note that Assert-enabled builds + * get a larger refund, due to a varying STANDARDCHUNKHEADERSIZE. + */ + refund = memtupsize * STANDARDCHUNKHEADERSIZE; + availMemLessRefund = state->availMem - refund; + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the batch allocation size could be calculated as negative, causing + * havoc. Hence, if availMemLessRefund is negative at this point, we must + * do nothing. Moreover, if it's positive but rather small, there's + * little point in proceeding because we could only increase memtuples by + * a small amount, not worth the cost of the repalloc's. 
We somewhat + * arbitrarily set the threshold at ALLOCSET_DEFAULT_INITSIZE per tape. + * (Note that this does not represent any assumption about tuple sizes.) + */ + if (availMemLessRefund <= + (int64) state->activeTapes * ALLOCSET_DEFAULT_INITSIZE) + return; + + /* + * To establish balanced memory use after refunding palloc overhead, + * temporarily have our accounting indicate that we've allocated all + * memory we're allowed to less that refund, and call grow_memtuples() to + * have it increase the number of slots. + */ + state->growmemtuples = true; + USEMEM(state, availMemLessRefund); + (void) grow_memtuples(state); + state->growmemtuples = false; + /* availMem must stay accurate for spacePerTape calculation */ + FREEMEM(state, availMemLessRefund); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + +#ifdef TRACE_SORT + if (trace_sort) + { + Size OldKb = (memtupsize * sizeof(SortTuple) + 1023) / 1024; + Size NewKb = (state->memtupsize * sizeof(SortTuple) + 1023) / 1024; + + elog(LOG, "grew memtuples %1.2fx from %d (%zu KB) to %d (%zu KB) for final merge", + (double) NewKb / (double) OldKb, + memtupsize, OldKb, + state->memtupsize, NewKb); + } +#endif +} + +/* + * mergebatch - initialize tuple memory in batch + * + * This allows sequential access to sorted tuples buffered in memory from + * tapes/runs on disk during a final on-the-fly merge step. Note that the + * memory is not used for SortTuples, but for the underlying tuples (e.g. + * MinimalTuples). + * + * Note that when batch memory is used, there is a simple division of space + * into large buffers (one per active tape). The conventional incremental + * memory accounting (calling USEMEM() and FREEMEM()) is abandoned. Instead, + * when each tape's memory budget is exceeded, a retail palloc() "overflow" is + * performed, which is then immediately detected in a way that is analogous to + * LACKMEM(). This keeps each tape's use of memory fair, which is always a + * goal. + */ +static void +mergebatch(Tuplesortstate *state, int64 spacePerTape) +{ + int srcTape; + + Assert(state->activeTapes > 0); + Assert(state->tuples); + + /* + * For the purposes of tuplesort's memory accounting, the batch allocation + * is special, and regular memory accounting through USEMEM() calls is + * abandoned (see mergeprereadone()). + */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + char *mergetuples; + + if (!state->mergeactive[srcTape]) + continue; + + /* Allocate buffer for each active tape */ + mergetuples = MemoryContextAllocHuge(state->tuplecontext, + spacePerTape); + + /* Initialize state for tape */ + state->mergetuples[srcTape] = mergetuples; + state->mergecurrent[srcTape] = mergetuples; + state->mergetail[srcTape] = mergetuples; + state->mergeoverflow[srcTape] = NULL; + } + + state->batchUsed = true; + state->spacePerTape = spacePerTape; +} + +/* + * mergebatchone - prepare batch memory for one merge input tape + * + * This is called following the exhaustion of preread tuples for one input + * tape. All that actually occurs is that the state for the source tape is + * reset to indicate that all memory may be reused. + * + * This routine must deal with fixing up the tuple that is about to be returned + * to the client, due to "overflow" allocations. 
+ */ +static void +mergebatchone(Tuplesortstate *state, int srcTape, SortTuple *rtup, + bool *should_free) +{ + Assert(state->batchUsed); + + /* + * Tuple about to be returned to caller ("stup") is final preread tuple + * from tape, just removed from the top of the heap. Special steps around + * memory management must be performed for that tuple, to make sure it + * isn't overwritten early. + */ + if (!state->mergeoverflow[srcTape]) + { + Size tupLen; + + /* + * Mark tuple buffer range for reuse, but be careful to move final, + * tail tuple to start of space for next run so that it's available to + * caller when stup is returned, and remains available at least until + * the next tuple is requested. + */ + tupLen = state->mergecurrent[srcTape] - state->mergetail[srcTape]; + state->mergecurrent[srcTape] = state->mergetuples[srcTape]; + MOVETUP(state->mergecurrent[srcTape], state->mergetail[srcTape], + tupLen); + + /* Make SortTuple at top of the merge heap point to new tuple */ + rtup->tuple = (void *) state->mergecurrent[srcTape]; + + state->mergetail[srcTape] = state->mergecurrent[srcTape]; + state->mergecurrent[srcTape] += tupLen; + } + else + { + /* + * Handle an "overflow" retail palloc. + * + * This is needed when we run out of tuple memory for the tape. + */ + state->mergecurrent[srcTape] = state->mergetuples[srcTape]; + state->mergetail[srcTape] = state->mergetuples[srcTape]; + + if (rtup->tuple) + { + Assert(rtup->tuple == (void *) state->mergeoverflow[srcTape]); + /* Caller should free palloc'd tuple */ + *should_free = true; + } + state->mergeoverflow[srcTape] = NULL; + } +} + +/* + * mergebatchfreetape - handle final clean-up for batch memory once tape is + * about to become exhausted + * + * All tuples are returned from tape, but a single final tuple, *rtup, is to be + * passed back to caller. Free tape's batch allocation buffer while ensuring + * that the final tuple is managed appropriately. + */ +static void +mergebatchfreetape(Tuplesortstate *state, int srcTape, SortTuple *rtup, + bool *should_free) +{ + Assert(state->batchUsed); + Assert(state->status == TSS_FINALMERGE); + + /* + * Tuple may or may not already be an overflow allocation from + * mergebatchone() + */ + if (!*should_free && rtup->tuple) + { + /* + * Final tuple still in tape's batch allocation. + * + * Return palloc()'d copy to caller, and have it freed in a similar + * manner to overflow allocation. Otherwise, we'd free batch memory + * and pass back a pointer to garbage. Note that we deliberately + * allocate this in the parent tuplesort context, to be on the safe + * side. + */ + Size tuplen; + void *oldTuple = rtup->tuple; + + tuplen = state->mergecurrent[srcTape] - state->mergetail[srcTape]; + rtup->tuple = MemoryContextAlloc(state->sortcontext, tuplen); + MOVETUP(rtup->tuple, oldTuple, tuplen); + *should_free = true; + } + + /* Free spacePerTape-sized buffer */ + pfree(state->mergetuples[srcTape]); +} + +/* + * mergebatchalloc - allocate memory for one tuple using a batch memory + * "logical allocation". + * + * This is used for the final on-the-fly merge phase only. READTUP() routines + * receive memory from here in place of palloc() and USEMEM() calls. + * + * Tuple tapenum is passed, ensuring each tape's tuples are stored in sorted, + * contiguous order (while allowing safe reuse of memory made available to + * each tape). This maximizes locality of access as tuples are returned by + * final merge. + * + * Caller must not subsequently attempt to free memory returned here. 
In + * general, only mergebatch* functions know about how memory returned from + * here should be freed, and this function's caller must ensure that batch + * memory management code will definitely have the opportunity to do the right + * thing during the final on-the-fly merge. + */ +static void * +mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen) +{ + Size reserve_tuplen = MAXALIGN(tuplen); + char *ret; + + /* Should overflow at most once before mergebatchone() call: */ + Assert(state->mergeoverflow[tapenum] == NULL); + Assert(state->batchUsed); + + /* It should be possible to use precisely spacePerTape memory at once */ + if (state->mergecurrent[tapenum] + reserve_tuplen <= + state->mergetuples[tapenum] + state->spacePerTape) + { + /* + * Usual case -- caller is returned pointer into its tape's buffer, + * and an offset from that point is recorded as where tape has + * consumed up to for current round of preloading. + */ + ret = state->mergetail[tapenum] = state->mergecurrent[tapenum]; + state->mergecurrent[tapenum] += reserve_tuplen; + } + else + { + /* + * Allocate memory, and record as tape's overflow allocation. This + * will be detected quickly, in a similar fashion to a LACKMEM() + * condition, and should not happen again before a new round of + * preloading for caller's tape. Note that we deliberately allocate + * this in the parent tuplesort context, to be on the safe side. + * + * Sometimes, this does not happen because merging runs out of slots + * before running out of memory. + */ + ret = state->mergeoverflow[tapenum] = + MemoryContextAlloc(state->sortcontext, tuplen); + } + + return ret; +} + +/* + * mergepreread - load tuples from merge input tapes + * + * This routine exists to improve sequentiality of reads during a merge pass, + * as explained in the header comments of this file. Load tuples from each + * active source tape until the tape's run is exhausted or it has used up + * its fair share of available memory. In any case, we guarantee that there + * is at least one preread tuple available from each unexhausted input tape. + * + * We invoke this routine at the start of a merge pass for initial load, + * and then whenever any tape's preread data runs out. Note that we load + * as much data as possible from all tapes, not just the one that ran out. + * This is because logtape.c works best with a usage pattern that alternates + * between reading a lot of data and writing a lot of data, so whenever we + * are forced to read, we should fill working memory completely. + * + * In FINALMERGE state, we *don't* use this routine, but instead just preread + * from the single tape that ran dry. There's no read/write alternation in + * that state and so no point in scanning through all the tapes to fix one. + * (Moreover, there may be quite a lot of inactive tapes in that state, since + * we might have had many fewer runs than tapes. In a regular tape-to-tape + * merge we can expect most of the tapes to be active. Plus, only + * FINALMERGE state has to consider memory management for a batch + * allocation.) + */ +static void +mergepreread(Tuplesortstate *state) +{ + int srcTape; + + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + mergeprereadone(state, srcTape); +} + +/* + * mergeprereadone - load tuples from one merge input tape + * + * Read tuples from the specified tape until it has used up its free memory + * or array slots; but ensure that we have at least one tuple, if any are + * to be had. 
+ */ +static void +mergeprereadone(Tuplesortstate *state, int srcTape) +{ + unsigned int tuplen; + SortTuple stup; + int tupIndex; + int64 priorAvail, + spaceUsed; + + if (!state->mergeactive[srcTape]) + return; /* tape's run is already exhausted */ + + /* + * Manage per-tape availMem. Only actually matters when batch memory not + * in use. + */ + priorAvail = state->availMem; + state->availMem = state->mergeavailmem[srcTape]; + + /* + * When batch memory is used if final on-the-fly merge, only mergeoverflow + * test is relevant; otherwise, only LACKMEM() test is relevant. + */ + while ((state->mergeavailslots[srcTape] > 0 && + state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) || + state->mergenext[srcTape] == 0) + { + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + break; + } + READTUP(state, &stup, srcTape, tuplen); + /* find a free slot in memtuples[] for it */ + tupIndex = state->mergefreelist; + if (tupIndex) + state->mergefreelist = state->memtuples[tupIndex].tupindex; + else + { + tupIndex = state->mergefirstfree++; + Assert(tupIndex < state->memtupsize); + } + state->mergeavailslots[srcTape]--; + /* store tuple, append to list for its tape */ + stup.tupindex = 0; + state->memtuples[tupIndex] = stup; + if (state->mergelast[srcTape]) + state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex; + else + state->mergenext[srcTape] = tupIndex; + state->mergelast[srcTape] = tupIndex; + } + /* update per-tape and global availmem counts */ + spaceUsed = state->mergeavailmem[srcTape] - state->availMem; + state->mergeavailmem[srcTape] = state->availMem; + state->availMem = priorAvail - spaceUsed; +} + +/* + * dumptuples - remove tuples from memtuples and write to tape + * + * This is used during initial-run building, but not during merging. + * + * When alltuples = false and replacement selection is still active, dump + * only enough tuples to get under the availMem limit (and leave at least + * one tuple in memtuples, since puttuple will then assume it is a heap that + * has a tuple to compare to). We always insist there be at least one free + * slot in the memtuples[] array. + * + * When alltuples = true, dump everything currently in memory. (This + * case is only used at end of input data, although in practice only the + * first run could fail to dump all tuples when we LACKMEM(), and only + * when replacement selection is active.) + * + * If, when replacement selection is active, we see that the tuple run + * number at the top of the heap has changed, start a new run. This must be + * the first run, because replacement selection is always abandoned for all + * further runs. + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + while (alltuples || + (LACKMEM(state) && state->memtupcount > 1) || + state->memtupcount >= state->memtupsize) + { + if (state->replaceActive) + { + /* + * Still holding out for a case favorable to replacement + * selection. Still incrementally spilling using heap. + * + * Dump the heap's frontmost entry, and sift up to remove it from + * the heap. 
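+			 * Because the heap is ordered on (run number, sort key), the
+			 * frontmost entry is the smallest not-yet-written tuple of
+			 * the current run, so tuples reach the tape already sorted.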
+ */ + Assert(state->memtupcount > 0); + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[0]); + tuplesort_heap_siftup(state, true); + } + else + { + /* + * Once committed to quicksorting runs, never incrementally spill + */ + dumpbatch(state, alltuples); + break; + } + + /* + * If top run number has changed, we've finished the current run (this + * can only be the first run), and will no longer spill incrementally. + */ + if (state->memtupcount == 0 || + state->memtuples[0].tupindex == HEAP_RUN_NEXT) + { + markrunend(state, state->tp_tapenum[state->destTape]); + Assert(state->currentRun == RUN_FIRST); + state->currentRun++; + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished incrementally writing %s run %d to tape %d: %s", + (state->memtupcount == 0) ? "only" : "first", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Done if heap is empty, which is possible when there is only one + * long run. + */ + Assert(state->currentRun == RUN_SECOND); + if (state->memtupcount == 0) + { + /* + * Replacement selection best case; no final merge required, + * because there was only one initial run (second run has no + * tuples). See RUN_SECOND case in mergeruns(). + */ + break; + } + + /* + * Abandon replacement selection for second run (as well as any + * subsequent runs). + */ + state->replaceActive = false; + + /* + * First tuple of next run should not be heapified, and so will + * bear placeholder run number. In practice this must actually be + * the second run, which just became the currentRun, so we're + * clear to quicksort and dump the tuples in batch next time + * memtuples becomes full. + */ + Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT); + selectnewtape(state); + } + } +} + +/* + * dumpbatch - sort and dump all memtuples, forming one run on tape + * + * Second or subsequent runs are never heapified by this module (although + * heapification still respects run number differences between the first and + * second runs), and a heap (replacement selection priority queue) is often + * avoided in the first place. + */ +static void +dumpbatch(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergepreread()/mergeprereadone() are prepared for 0 tuple runs, and + * will reliably mark the tape inactive for the merge when called from + * beginmerge(). This case is therefore similar to the case where + * mergeonerun() finds a dummy run for the tape, and so doesn't need to + * merge a run from the tape (or conceptually "merges" the dummy run, if + * you prefer). According to Knuth, Algorithm D "isn't strictly optimal" + * in its method of distribution and dummy run assignment; this edge case + * seems very unlikely to make that appreciably worse. 
+ */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in allocation patterns due to the use of batch memory. + * Fragmentation due to AllocSetFree's bucketing by size class might be + * particularly bad if this step wasn't taken. + */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished writing run %d to tape %d: %s", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewind(state->tapeset, + state->result_tape, + false); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = 
state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + if (!LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset)) + elog(ERROR, "tuplesort_restorepos failed"); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + * spaceUsed is measured in kilobytes. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + const char **sortMethod, + const char **spaceType, + long *spaceUsed) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + *spaceType = "Disk"; + *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + *spaceType = "Memory"; + *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + *sortMethod = "top-N heapsort"; + else + *sortMethod = "quicksort"; + break; + case TSS_SORTEDONTAPE: + *sortMethod = "external sort"; + break; + case TSS_FINALMERGE: + *sortMethod = "external merge"; + break; + default: + *sortMethod = "still in progress"; + break; + } +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + * + * Compare two SortTuples. If checkIndex is true, use the tuple index + * as the front of the sort key; otherwise, no. + * + * Note that for checkIndex callers, the heap invariant is never + * maintained beyond the first run, and so there are no COMPARETUP() + * calls needed to distinguish tuples in HEAP_RUN_NEXT. + */ + +#define HEAPCOMPARE(tup1,tup2) \ + (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \ + (tup1)->tupindex == HEAP_RUN_NEXT) ? \ + ((tup1)->tupindex) - ((tup2)->tupindex) : \ + COMPARETUP(state, tup1, tup2)) + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + * + * We assume that all entries in a bounded heap will always have tupindex + * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse + * the direction of comparison for tupindexes. 
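+ *
+ * For example, with bound = 3 and ascending-sort input 5, 1, 4, 2: the
+ * first three tuples are inserted into the reversed-direction heap, with
+ * the largest value (5) at the root; when 2 arrives it is inserted and the
+ * root is discarded, leaving the three smallest values seen so far
+ * (1, 2, 4).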
+ */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount >= state->bound && + COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + /* New tuple would just get thrown out, so skip it */ + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup, 0, false); + + /* If heap too full, discard largest entry */ + if (state->memtupcount > state->bound) + { + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_siftup(state, false); + } + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + + /* + * We can unheapify in place because each sift-up will remove the largest + * entry, which we can promptly store in the newly freed slot at the end. + * Once we're down to a single-entry heap, we're done. + */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_siftup(state, false); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts. Quicksort is also generally + * preferred to replacement selection for generating runs during external sort + * operations, although replacement selection is sometimes used for the first + * run. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: we assume *tuple is a temporary variable that can be scribbled on. + * For some callers, tuple actually points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + int tupleindex, bool checkIndex) +{ + SortTuple *memtuples; + int j; + + /* + * Save the tupleindex --- see notes above about writing on *tuple. 
It's a + * historical artifact that tupleindex is passed as a separate argument + * and not in *tuple, but it's notationally convenient so let's leave it + * that way. + */ + tuple->tupindex = tupleindex; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + Assert(!checkIndex || tupleindex == RUN_FIRST); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * The tuple at state->memtuples[0] has been removed from the heap. + * Decrement memtupcount, and sift up to maintain the heap invariant. + */ +static void +tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + unsigned int i, + n; + + Assert(!checkIndex || state->currentRun == RUN_FIRST); + if (--state->memtupcount <= 0) + return; + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. + */ + n = state->memtupcount; + tuple = &memtuples[n]; /* tuple that must be reinserted */ + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. Allocate + * memory and account for that, or consume from tape's batch + * allocation. + * + * Memory returned here in the final on-the-fly merge case is recycled + * from tape's batch allocation. Otherwise, callers must pfree() or + * reset tuple child memory context, and account for that with a + * FREEMEM(). Currently, this only ever needs to happen in WRITETUP() + * routines. + */ +static void * +readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen) +{ + if (state->batchUsed) + { + /* + * No USEMEM() call, because during final on-the-fly merge accounting + * is based on tape-private state. ("Overflow" allocations are + * detected as an indication that a new round or preloading is + * required. 
Preloading marks existing contents of tape's batch buffer + * for reuse.) + */ + return mergebatchalloc(state, tapenum, tuplen); + } + else + { + char *ret; + + /* Batch allocation yet to be performed */ + ret = MemoryContextAlloc(state->tuplecontext, tuplen); + USEMEM(state, GetMemoryChunkSpace(ret)); + return ret; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + +static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). 
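+     *
+     * (Illustration: for text under the "C" collation, the abbreviated key
+     * is roughly the first sizeof(Datum) bytes of the string packed into
+     * datum1, so 'apple' and 'apricot' already order correctly on a cheap
+     * integer-style comparison of datum1 alone; only tuples whose
+     * abbreviations compare equal fall through to the full comparator.)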
+ */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tapenum, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +static void +movetup_heap(void *dest, void *src, unsigned int len) +{ + memmove(dest, src, len); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
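+     *
+     * (For instance, with an expression index on lower(name), lower(name)
+     * is evaluated for each of the two heap tuples into the
+     * l_index_values[] / r_index_values[] arrays below, and the remaining
+     * sort keys are then compared on those computed values.)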
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_KeyAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + tapenum, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +static void +movetup_cluster(void *dest, void *src, unsigned int len) +{ + HeapTuple tuple; + + memmove(dest, src, len); + + /* Repoint the HeapTupleData header */ + tuple = (HeapTuple) dest; + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); +} + + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
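+     *
+     * (Concretely: if enforceUnique is set and two tuples are equal on
+     * every key column with no NULLs, an ERRCODE_UNIQUE_VIOLATION error is
+     * raised below instead of returning 0; otherwise fully-equal keys are
+     * ordered by ItemPointer, which tends to keep index scans in physical
+     * heap order.)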
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + uint32 hash1; + uint32 hash2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; + Assert(!b->isnull1); + hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; + + if (hash1 > hash2) + return 1; + else if (hash1 < hash2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tapenum, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +static void +movetup_index(void *dest, void *src, unsigned int len) +{ + memmove(dest, src, len); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tapenum, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +static void +movetup_datum(void *dest, void *src, unsigned int len) +{ + memmove(dest, src, len); +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} diff --git a/t/001_wal.pl b/t/001_wal.pl index 053aecadb5..a169683ee6 100644 --- a/t/001_wal.pl +++ b/t/001_wal.pl @@ -1,10 +1,31 @@ # Test generic xlog record work for rum index replication. use strict; use warnings; -use PostgresNode; -use TestLib; use Test::More tests => 31; +my $pg_15_modules; + +BEGIN +{ + $pg_15_modules = eval + { + require PostgreSQL::Test::Cluster; + require PostgreSQL::Test::Utils; + return 1; + }; + + unless (defined $pg_15_modules) + { + $pg_15_modules = 0; + + require PostgresNode; + require TestLib; + } +} + +note('PostgreSQL 15 modules are used: ' . ($pg_15_modules ? 'yes' : 'no')); + + my $node_master; my $node_standby; @@ -13,10 +34,23 @@ sub test_index_replay { my ($test_name) = @_; + # Check server version + my $server_version = $node_master->safe_psql("postgres", "SELECT current_setting('server_version_num');") + 0; + # Wait for standby to catch up my $applname = $node_standby->name; - my $caughtup_query = - "SELECT pg_current_xlog_location() <= write_location FROM pg_stat_replication WHERE application_name = '$applname';"; + my $caughtup_query; + + if ($server_version < 100000) + { + $caughtup_query = + "SELECT pg_current_xlog_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$applname';"; + } + else + { + $caughtup_query = + "SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$applname';"; + } $node_master->poll_query_until('postgres', $caughtup_query) or die "Timed out while waiting for standby 1 to catch up"; @@ -37,7 +71,23 @@ sub test_index_replay } # Initialize master node -$node_master = get_new_node('master'); + +# Create node. +# Older versions of PostgreSQL modules use get_new_node function. +# Newer use standard perl object constructor syntax. +# Also applies for node_standby (below). 
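+# For example, the two spellings are:
+#
+#   $node_master = PostgreSQL::Test::Cluster->new('master');   # PG 15+
+#   $node_master = PostgresNode::get_new_node('master');       # pre-15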
+eval +{ + if ($pg_15_modules) + { + $node_master = PostgreSQL::Test::Cluster->new("master"); + } + else + { + $node_master = PostgresNode::get_new_node("master"); + } +}; + $node_master->init(allows_streaming => 1); $node_master->start; my $backup_name = 'my_backup'; @@ -46,7 +96,18 @@ sub test_index_replay $node_master->backup($backup_name); # Create streaming standby linking to master -$node_standby = get_new_node('standby'); +eval +{ + if ($pg_15_modules) + { + $node_standby = PostgreSQL::Test::Cluster->new("standby"); + } + else + { + $node_standby = PostgresNode::get_new_node("standby"); + } +}; + $node_standby->init_from_backup($node_master, $backup_name, has_streaming => 1); $node_standby->start; @@ -58,7 +119,7 @@ sub test_index_replay to_tsvector('simple', array_to_string(array( select substr('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', trunc(random() * 52)::integer + 1, 1) FROM generate_series(i, i + 4)), '')) - FROM generate_series(1,100000) i;"); + FROM generate_series(1,16000) i;"); $node_master->psql("postgres", "CREATE INDEX rumidx ON tst USING rum (t rum_tsvector_ops);"); # Test that queries give same result diff --git a/t/002_pglist.pl b/t/002_pglist.pl new file mode 100644 index 0000000000..7b2d76c058 --- /dev/null +++ b/t/002_pglist.pl @@ -0,0 +1,207 @@ +# Test RUM index with big base 'pglist'. +use strict; +use warnings; +use Config; +use Test::More; + +plan skip_all => 'This test requires downloading a 1GB archive. ' . + 'The unpacked file weighs almost 3GB. ' . + 'Perform only if the big_values is enabled in PG_TEST_EXTRA' + unless $ENV{PG_TEST_EXTRA} && $ENV{PG_TEST_EXTRA} =~ /\bbig_values\b/; + +plan tests => 4; + +my $node; + +# Utility function + +sub file_exists +{ + my ($file) = @_; + return -e $file; +} + +# Check the existence of the test base, install if necessary + +sub install_pglist +{ + my $dir = Cwd->getcwd; #current directory + + my %config = ( + #directory with pglist dump must be inside the current directory + pglist_tmp_dir => $dir . '/pglist_tmp/', + dump_name => 'pglist-28-04-16.dump', + dump_url => 'http://www.sai.msu.su/~megera/postgres/files/pglist-28-04-16.dump.gz', + pglist_archive => $dir . '/pglist_tmp/' . 'pglist-28-04-16.dump.gz', + ); + + my $path_to_dump = $config{pglist_tmp_dir} . $config{dump_name}; + + if (file_exists($path_to_dump)) + { + note($config{dump_name} . ' already installed'); + } + else + { + # Create folder /contrib/rum/pglist_tmp if not already exists + mkdir($config{pglist_tmp_dir}, 0700) + unless file_exists($config{pglist_tmp_dir}); + + # Download archive pglist-28-04-16.dump.gz if not already exists + unless (file_exists($config{pglist_archive})) + { + note('Downloading pglist dump in ' . $config{pglist_archive}); + + # Flag "-nv" allows us to avoid frequent messages + # about the download status in the log. + # But it can be enabled for debugging purposes. + system("wget -P $config{pglist_tmp_dir} -nv $config{dump_url}") == 0 + or die "Couldn't get archive by link: $?"; + } + + # Unzip the dump. Delete archive to save memory + system("gzip -d $config{pglist_archive}") == 0 + or die "Couldn't extract archive: $?"; + + file_exists($path_to_dump) + or die "Failed to get " . $config{dump_name}; + + note($config{dump_name} . ' is ready to use'); + } + + $node->psql("postgres", "CREATE DATABASE pglist"); + $node->psql("postgres", "CREATE ROLE oleg"); + my $command = "'" . $path_to_dump . "'"; + my $result = $node->psql("pglist", '\i ' . 
$command); +} + +# Tests SELECT constructions to 'pglist' base + +sub test_select +{ + note("Creating index 'rumidx_orderby_sent'"); + + $node->safe_psql("pglist", "CREATE INDEX rumidx_orderby_sent ON pglist " . + "USING rum (fts rum_tsvector_timestamp_ops, sent) " . + "WITH (attach=sent, to=fts, order_by_attach=t)"); + + note("Test ORDER BY timestamp"); + + my $result1 = $node->safe_psql("pglist", + "SELECT sent, subject FROM pglist WHERE fts @@ " . + "to_tsquery('english', 'backend <-> crushed') " . + "ORDER BY sent <=| '2016-01-01 00:01' LIMIT 5"); + + is($result1, '1999-06-02 11:52:46|Re: [HACKERS] PID of backend'); + + note("Test tsvector filter"); + + my $result2 = $node->safe_psql("pglist", + "SELECT count(*) FROM pglist " . + "WHERE fts @@ to_tsquery('english', 'tom & lane')"); + + is($result2, '222813'); + + $node->safe_psql("pglist", "DROP INDEX rumidx_orderby_sent"); +} + +sub test_order_by +{ + note("Creating index 'pglist_rum_idx'"); + + $node->safe_psql("pglist", + "CREATE INDEX pglist_rum_idx ON pglist " . + "USING rum (fts rum_tsvector_ops)"); + + note("Test ORDER BY tsvector"); + + my $result3 = $node->safe_psql("pglist", + "SELECT id FROM pglist " . + "WHERE fts @@ to_tsquery('english', 'postgres:*') " . + "ORDER BY fts <=> " . + "to_tsquery('english', 'postgres:*') LIMIT 9"); + + is((split(" ", $result3))[0], '816114'); + + # Autovacuum after large update, with active RUM index crashes postgres + note("Test Issue #19"); + + my $stderr; + $node->safe_psql("pglist", "DELETE FROM pglist WHERE id < 100000"); + $node->safe_psql("pglist", "vacuum", stderr => \$stderr); + + is($stderr, undef); + + $node->safe_psql("pglist", "DROP INDEX pglist_rum_idx"); +} + +# Start backend + +my $pg_15_modules; + +BEGIN +{ + $pg_15_modules = eval + { + require PostgreSQL::Test::Cluster; + require PostgreSQL::Test::Utils; + return 1; + }; + + unless (defined $pg_15_modules) + { + $pg_15_modules = 0; + + require PostgresNode; + require TestLib; + } +} + +note('PostgreSQL 15 modules are used: ' . ($pg_15_modules ? 'yes' : 'no')); + +if ($pg_15_modules) +{ + $node = PostgreSQL::Test::Cluster->new("master"); +} +else +{ + $node = PostgresNode::get_new_node("master"); +} + +$node->init(allows_streaming => 1); +$node->append_conf("postgresql.conf", "shared_buffers='4GB'\n" . + "maintenance_work_mem='2GB'\n" . + "max_wal_size='2GB'\n" . + "work_mem='50MB'"); +$node->start; + +# Check the existence of the pglist base + +note('Check the existence of the pglist base...'); +my $check_pglist = $node->psql('postgres', "SELECT count(*) FROM pg_database " . + "WHERE datistemplate = false AND " . 
+ "datname = 'pglist'"); +if ($check_pglist == 1) +{ + note("pglist already exists"); +} +else +{ + note("Create pglist database"); + install_pglist(); +} + +$node->psql("pglist", "CREATE EXTENSION rum"); +note('Setup is completed successfully'); + +eval +{ + test_select(); + test_order_by(); + $node->stop(); + done_testing(); + 1; +} or do { + note('Something went wrong: $@\n'); +}; + diff --git a/tests/README.md b/tests/README.md deleted file mode 100644 index de04c4d617..0000000000 --- a/tests/README.md +++ /dev/null @@ -1,14 +0,0 @@ -## Running tests - -Install testgres: - -``` -pip install testgres -``` - -Run command: - -``` -python -m unittest pglist_tests -``` - diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/pglist_tests.py b/tests/pglist_tests.py deleted file mode 100644 index bb1ce8b8f5..0000000000 --- a/tests/pglist_tests.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding: utf-8 -""" - Test RUM index with big base 'pglist' - - Copyright (c) 2015-2016, Postgres Professional -""" -import unittest -import os -import sys -import gzip -import testgres as tg - -if sys.version_info[0] < 3: - import urllib as request -else: - import urllib.request as request - -from os.path import expanduser - - -class PglistTests(unittest.TestCase): - - def setUp(self): - current_dir = os.path.dirname(os.path.abspath(__file__)) - - self.node = tg.get_new_node("pglist", - os.path.join(current_dir, "tmp_install")) - try: - self.node.init() - self.node.append_conf("postgresql.conf", - "shared_buffers='4GB'\n" - "maintenance_work_mem='2GB'\n" - "max_wal_size='2GB'\n" - "work_mem='50MB'") - self.node.start() - except Exception as e: - self.printlog(os.path.join(self.node.logs_dir, "postgresql.log")) - raise e - - def tearDown(self): - tg.stop_all() - - def init_pglist_data(self, node): - # Check if 'pglist' base exists - bases = node.execute("postgres", - "SELECT count(*) FROM pg_database " - "WHERE datistemplate = false AND " - " datname = 'pglist'") - if bases[0][0] != 0: - return - - # Check if 'pglist' dump exists - home = expanduser("~") - pglist_dump = os.path.join(home, "pglist-28-04-16.dump") - if not os.path.isfile(pglist_dump): - pglist_dumpgz = pglist_dump + ".gz" - if not os.path.isfile(pglist_dumpgz): - print("Downloading: {0}".format(pglist_dumpgz)) - request.urlretrieve( - "http://www.sai.msu.su/~megera/postgres/files/pglist-28-04-16.dump.gz", - pglist_dumpgz) - - print("Decompressing: {0}".format(pglist_dumpgz)) - gz = gzip.open(pglist_dumpgz, 'rb') - with open(pglist_dump, 'wb') as f: - f.write(gz.read()) - - os.remove(pglist_dumpgz) - - # Restore dump file - print("Restoring 'pglist'") - node.safe_psql("postgres", "CREATE DATABASE pglist") - node.psql("pglist", filename=pglist_dump) - - node.safe_psql("pglist", "CREATE EXTENSION rum") - - def printlog(self, logfile): - with open(logfile, 'r') as log: - for line in log.readlines(): - print(line) - - def test_order_by(self): - """Tests SELECT constructions to 'pglist' base""" - try: - self.init_pglist_data(self.node) - - print("Creating index 'rumidx_orderby_sent'") - - self.node.safe_psql( - "pglist", - "CREATE INDEX rumidx_orderby_sent ON pglist USING rum (" - " fts rum_tsvector_timestamp_ops, sent) " - " WITH (attach=sent, to=fts, order_by_attach=t)") - - print("Running tests") - - self.assertEqual( - self.node.safe_psql( - "pglist", - "SELECT sent, subject " - " FROM pglist " - " WHERE fts @@ " - " to_tsquery('english', 'backend <-> crushed') " - 
" ORDER BY sent <=| '2016-01-01 00:01' LIMIT 5" - ), - b'1999-06-02 11:52:46|Re: [HACKERS] PID of backend\n' - ) - - self.assertEqual( - self.node.safe_psql( - "pglist", - "SELECT count(*) FROM pglist " - "WHERE fts @@ to_tsquery('english', 'tom & lane')" - ), - b'222813\n' - ) - - self.node.safe_psql("pglist", "DROP INDEX rumidx_orderby_sent"); - - print("Creating index 'pglist_rum_idx'") - - self.node.safe_psql( - "pglist", - "CREATE INDEX pglist_rum_idx ON pglist USING rum (" - " fts rum_tsvector_ops)") - - print("Running tests") - - self.assertEqual( - self.node.execute( - "pglist", - "SELECT id FROM pglist " - "WHERE fts @@ to_tsquery('english', 'postgres:*') " - "ORDER BY fts <=> to_tsquery('english', 'postgres:*') " - "LIMIT 9" - )[0][0], - 816114 - ) - - self.node.safe_psql("pglist", "DROP INDEX pglist_rum_idx"); - - except Exception as e: - self.printlog(os.path.join(self.node.logs_dir, "postgresql.log")) - raise e - -if __name__ == "__main__": - unittest.main() diff --git a/travis/Dockerfile.in b/travis/Dockerfile.in new file mode 100644 index 0000000000..3101da42a8 --- /dev/null +++ b/travis/Dockerfile.in @@ -0,0 +1,33 @@ +FROM postgres:${PG_VERSION}-alpine + +# Install dependencies +RUN apk add --no-cache \ + linux-headers \ + openssl curl \ + perl perl-ipc-run perl-dev perl-app-cpanminus perl-dbi \ + make musl-dev gcc bison flex coreutils \ + zlib-dev libedit-dev \ + pkgconf icu-dev clang clang15 clang19 clang-analyzer; + +# Environment +ENV LANG=C.UTF-8 PGDATA=/pg/data + +# Make directories +RUN mkdir -p ${PGDATA} && \ + mkdir -p /pg/testdir + +COPY run_tests.sh /run.sh +RUN chmod 755 /run.sh + +COPY . /pg/testdir +WORKDIR /pg/testdir + +# Grant privileges +RUN chown postgres:postgres ${PGDATA} && \ + chown -R postgres:postgres /pg/testdir && \ + chown postgres:postgres /usr/local/include/postgresql/server/ && \ + chmod a+rwx /usr/local/lib/postgresql && \ + chmod a+rwx /usr/local/share/postgresql/extension + +USER postgres +ENTRYPOINT LEVEL=${LEVEL} /run.sh diff --git a/travis/docker-compose.yml b/travis/docker-compose.yml new file mode 100644 index 0000000000..0544d8597d --- /dev/null +++ b/travis/docker-compose.yml @@ -0,0 +1,3 @@ +services: + tests: + build: . diff --git a/travis/mk_dockerfile.sh b/travis/mk_dockerfile.sh new file mode 100755 index 0000000000..9108d2c68d --- /dev/null +++ b/travis/mk_dockerfile.sh @@ -0,0 +1,16 @@ +if [ -z ${PG_VERSION+x} ]; then + echo PG_VERSION is not set! 
+	exit 1
+fi
+
+if [ -z ${LEVEL+x} ]; then
+	LEVEL=standard
+fi
+
+echo PG_VERSION=${PG_VERSION}
+echo LEVEL=${LEVEL}
+
+sed \
+	-e 's/${PG_VERSION}/'${PG_VERSION}/g \
+	-e 's/${LEVEL}/'${LEVEL}/g \
+Dockerfile.in > Dockerfile
diff --git a/travis/run_tests.sh b/travis/run_tests.sh
new file mode 100644
index 0000000000..37bba84d64
--- /dev/null
+++ b/travis/run_tests.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+
+#
+# Copyright (c) 2019-2024, Postgres Professional
+#
+# supported levels:
+# * standard
+# * hardcore
+#
+
+set -ux
+status=0
+
+
+# rebuild PostgreSQL with cassert support
+if [ "$LEVEL" = "hardcore" ]; then
+
+	set -e
+
+	CUSTOM_PG_BIN=$PWD/pg_bin
+	CUSTOM_PG_SRC=$PWD/postgresql
+
+	# here PG_VERSION is provided by postgres:X-alpine docker image
+	curl "https://ftp.postgresql.org/pub/source/v$PG_VERSION/postgresql-$PG_VERSION.tar.bz2" -o postgresql.tar.bz2
+	echo "$PG_SHA256 *postgresql.tar.bz2" | sha256sum -c -
+
+	mkdir $CUSTOM_PG_SRC
+
+	tar \
+		--extract \
+		--file postgresql.tar.bz2 \
+		--directory $CUSTOM_PG_SRC \
+		--strip-components 1
+
+	cd $CUSTOM_PG_SRC
+
+	# enable additional options
+	./configure \
+		CFLAGS='-fno-omit-frame-pointer' \
+		--enable-cassert \
+		--enable-tap-tests \
+		--prefix=$CUSTOM_PG_BIN \
+		--quiet
+
+	time make -s -j$(nproc) && make -s install
+
+	# override default PostgreSQL instance
+	export PATH=$CUSTOM_PG_BIN/bin:$PATH
+	export LD_LIBRARY_PATH=$CUSTOM_PG_BIN/lib
+
+	# show pg_config path (just in case)
+	which pg_config
+
+	cd -
+
+	set +e
+fi
+
+# show pg_config just in case
+pg_config
+
+# perform code checks if asked to
+if [ "$LEVEL" = "hardcore" ]; then
+
+	# perform static analysis
+	scan-build --status-bugs \
+		-disable-checker core.UndefinedBinaryOperatorResult \
+		-disable-checker core.DivideZero \
+		-disable-checker deadcode.DeadStores \
+		make USE_PGXS=1 || status=$?
+
+	# something's wrong, exit now!
+	if [ $status -ne 0 ]; then exit 1; fi
+
+	# don't forget to "make clean"
+	make USE_PGXS=1 clean
+fi
+
+
+# build and install extension (using PG_CPPFLAGS and SHLIB_LINK for gcov)
+make USE_PGXS=1 PG_CPPFLAGS="-coverage" SHLIB_LINK="-coverage" install
+
+# initialize database
+initdb -D $PGDATA
+
+# set appropriate port
+export PGPORT=55435
+echo "port = $PGPORT" >> $PGDATA/postgresql.conf
+
+# start the freshly initialized cluster
+pg_ctl start -l /tmp/postgres.log -w || status=$?
+
+# something's wrong, exit now!
+if [ $status -ne 0 ]; then cat /tmp/postgres.log; exit 1; fi
+
+# run regression tests
+export PG_REGRESS_DIFF_OPTS="-w -U3"	# for alpine's diff (BusyBox)
+make USE_PGXS=1 installcheck || status=$?
+
+# show diff if it exists
+if test -f regression.diffs; then cat regression.diffs; fi
+
+# something's wrong, exit now!
+if [ $status -ne 0 ]; then exit 1; fi
+
+# generate *.gcov files
+gcov src/*.c src/*.h
+
+
+set +ux
+
+
+# send coverage stats to Codecov
+bash <(curl -s https://codecov.io/bash)
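+
+# Local usage sketch (not part of the original scripts; assumes Docker and
+# docker-compose are available): generate a Dockerfile for one matrix entry,
+# then build and run this suite inside a container, e.g.:
+#
+#   PG_VERSION=16 LEVEL=hardcore ./mk_dockerfile.sh
+#   docker-compose build
+#   docker-compose run tests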