Add empty BRIN ranges during CREATE INDEX

When building BRIN indexes, the brinbuildCallback only advances to the
next page range when seeing a tuple that doesn't belong to the current
one. This means that the index may end up missing ranges at the end of
the table, if those pages do not contain any indexable tuples.

We tend not to have completely empty pages at the end of a relation, but
this also applies to partial indexes, where the tuples may simply not
match the index predicate. This results in inefficient scans using the
affected BRIN index - without the summaries, the page ranges have to be
read and processed, which consumes I/O and possibly also CPU time.

The existing code already added empty ranges for earlier parts of the
table, this commit makes sure we add them for the ranges at the end of
the table too.

Patch by Matthias van de Meent, with review/improvements by me.

Author: Matthias van de Meent
Reviewed-by: Tomas Vondra
Discussion: https://postgr.es/m/CAEze2WiMsPZg%3DxkvSF_jt4%3D69k6K7gz5B8V2wY3gCGZ%2B1BzCbQ%40mail.gmail.com
This commit is contained in:
Tomas Vondra 2023-12-08 17:07:30 +01:00
parent 00edb2061f
commit dae761a87e
3 changed files with 152 additions and 4 deletions

View file

@ -89,4 +89,22 @@ SELECT brin_revmap_data(decode(repeat('00', :block_size), 'hex'));
(1 row)
-- Test that partial indexes have all pages, including empty ones.
CREATE TABLE test2 (a int);
INSERT INTO test2 SELECT i FROM generate_series(1,1000) s(i);
-- No rows match the index predicate, make sure the index has the right number
-- of ranges (same as number of page ranges).
CREATE INDEX ON test2 USING brin (a) WITH (pages_per_range=1) WHERE (a IS NULL);
ANALYZE test2;
-- Does the index have one summary of the relation?
SELECT (COUNT(*) = (SELECT relpages FROM pg_class WHERE relname = 'test2')) AS ranges_do_match
FROM generate_series((SELECT (lastrevmappage + 1) FROM brin_metapage_info(get_raw_page('test2_a_idx', 0))),
(SELECT (relpages - 1) FROM pg_class WHERE relname = 'test2_a_idx')) AS pages(p),
LATERAL brin_page_items(get_raw_page('test2_a_idx', p), 'test2_a_idx') AS items;
ranges_do_match
-----------------
t
(1 row)
DROP TABLE test1;
DROP TABLE test2;

View file

@ -36,4 +36,21 @@ SELECT brin_page_items(decode(repeat('00', :block_size), 'hex'), 'test1_a_idx');
SELECT brin_metapage_info(decode(repeat('00', :block_size), 'hex'));
SELECT brin_revmap_data(decode(repeat('00', :block_size), 'hex'));
-- Test that partial indexes have all pages, including empty ones.
CREATE TABLE test2 (a int);
INSERT INTO test2 SELECT i FROM generate_series(1,1000) s(i);
-- No rows match the index predicate, make sure the index has the right number
-- of ranges (same as number of page ranges).
CREATE INDEX ON test2 USING brin (a) WITH (pages_per_range=1) WHERE (a IS NULL);
ANALYZE test2;
-- Does the index have one summary of the relation?
SELECT (COUNT(*) = (SELECT relpages FROM pg_class WHERE relname = 'test2')) AS ranges_do_match
FROM generate_series((SELECT (lastrevmappage + 1) FROM brin_metapage_info(get_raw_page('test2_a_idx', 0))),
(SELECT (relpages - 1) FROM pg_class WHERE relname = 'test2_a_idx')) AS pages(p),
LATERAL brin_page_items(get_raw_page('test2_a_idx', p), 'test2_a_idx') AS items;
DROP TABLE test1;
DROP TABLE test2;

View file

@ -53,9 +53,13 @@ typedef struct BrinBuildState
Buffer bs_currentInsertBuf;
BlockNumber bs_pagesPerRange;
BlockNumber bs_currRangeStart;
BlockNumber bs_maxRangeStart;
BrinRevmap *bs_rmAccess;
BrinDesc *bs_bdesc;
BrinMemTuple *bs_dtuple;
BrinTuple *bs_emptyTuple;
Size bs_emptyTupleLen;
MemoryContext bs_context;
} BrinBuildState;
/*
@ -82,7 +86,9 @@ typedef struct BrinOpaque
#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
BrinRevmap *revmap, BlockNumber pagesPerRange);
BrinRevmap *revmap,
BlockNumber pagesPerRange,
BlockNumber tablePages);
static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
static void terminate_brin_buildstate(BrinBuildState *state);
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
@ -94,6 +100,8 @@ static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
BrinMemTuple *dtup, const Datum *values, const bool *nulls);
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
static void brin_fill_empty_ranges(BrinBuildState *state,
BlockNumber prevRange, BlockNumber maxRange);
/*
* BRIN handler function: return IndexAmRoutine with access method parameters
@ -933,7 +941,8 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
* Initialize our state, including the deformed tuple state.
*/
revmap = brinRevmapInitialize(index, &pagesPerRange);
state = initialize_brin_buildstate(index, revmap, pagesPerRange);
state = initialize_brin_buildstate(index, revmap, pagesPerRange,
RelationGetNumberOfBlocks(heap));
/*
* Now scan the relation. No syncscan allowed here because we want the
@ -945,6 +954,17 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
/* process the final batch */
form_and_insert_tuple(state);
/*
* Backfill the final ranges with empty data.
*
* This saves us from doing what amounts to full table scans when the
* index with a predicate like WHERE (nonnull_column IS NULL), or other
* very selective predicates.
*/
brin_fill_empty_ranges(state,
state->bs_currRangeStart,
state->bs_maxRangeStart);
/* release resources */
idxtuples = state->bs_numtuples;
brinRevmapTerminate(state->bs_rmAccess);
@ -1358,9 +1378,10 @@ brinGetStats(Relation index, BrinStatsData *stats)
*/
static BrinBuildState *
initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
BlockNumber pagesPerRange)
BlockNumber pagesPerRange, BlockNumber tablePages)
{
BrinBuildState *state;
BlockNumber lastRange = 0;
state = palloc_object(BrinBuildState);
@ -1373,6 +1394,22 @@ initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
state->bs_bdesc = brin_build_desc(idxRel);
state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
/* Remember the memory context to use for an empty tuple, if needed. */
state->bs_context = CurrentMemoryContext;
state->bs_emptyTuple = NULL;
state->bs_emptyTupleLen = 0;
/*
* Calculate the start of the last page range. Page numbers are 0-based,
* so to calculate the index we need to subtract one. The integer division
* gives us the index of the page range.
*/
if (tablePages > 0)
lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
/* Now calculate the start of the next range. */
state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
return state;
}
@ -1612,7 +1649,8 @@ brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
/* first time through */
Assert(!indexInfo);
state = initialize_brin_buildstate(index, revmap,
pagesPerRange);
pagesPerRange,
InvalidBlockNumber);
indexInfo = BuildIndexInfo(index);
}
summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
@ -1982,3 +2020,78 @@ check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
return true;
}
/*
* brin_build_empty_tuple
* Maybe initialize a BRIN tuple representing empty range.
*
* Returns a BRIN tuple representing an empty page range starting at the
* specified block number. The empty tuple is initialized only once, when it's
* needed for the first time, stored in the memory context bs_context to ensure
* proper life span, and reused on following calls. All empty tuples are
* exactly the same except for the bs_blkno field, which is set to the value
* in blkno parameter.
*/
static void
brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
{
/* First time an empty tuple is requested? If yes, initialize it. */
if (state->bs_emptyTuple == NULL)
{
MemoryContext oldcxt;
BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
/* Allocate the tuple in context for the whole index build. */
oldcxt = MemoryContextSwitchTo(state->bs_context);
state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
&state->bs_emptyTupleLen);
MemoryContextSwitchTo(oldcxt);
}
else
{
/* If we already have an empty tuple, just update the block. */
state->bs_emptyTuple->bt_blkno = blkno;
}
}
/*
* brin_fill_empty_ranges
* Add BRIN index tuples representing empty page ranges.
*
* prevRange/nextRange determine for which page ranges to add empty summaries.
* Both boundaries are exclusive, i.e. only ranges starting at blkno for which
* (prevRange < blkno < nextRange) will be added to the index.
*
* If prevRange is InvalidBlockNumber, this means there was no previous page
* range (i.e. the first empty range to add is for blkno=0).
*
* The empty tuple is built only once, and then reused for all future calls.
*/
static void
brin_fill_empty_ranges(BrinBuildState *state,
BlockNumber prevRange, BlockNumber nextRange)
{
BlockNumber blkno;
/*
* If we already summarized some ranges, we need to start with the next
* one. Otherwise start from the first range of the table.
*/
blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
/* Generate empty ranges until we hit the next non-empty range. */
while (blkno < nextRange)
{
/* Did we already build the empty tuple? If not, do it now. */
brin_build_empty_tuple(state, blkno);
brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
&state->bs_currentInsertBuf,
blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
/* try next page range */
blkno += state->bs_pagesPerRange;
}
}