[PATCH] bitmap indexes
Hi.
This is a cleaned-up and rebased version of the bitmap index patch from
Gavin Sherry, later revised by Gianni Ciolli and Gabriele Bartolini, and
others including Daniel Bausch.
I've been working on this patch for a while, and have made some progress
towards (a) general fixing, and (b) a working VACUUM implementation (the
major remaining piece). Unfortunately, I've been busy moving house, and
the latter is not complete (and not in this patch).
I will continue working on the code, and I'll post updates. I expect to
have more to show in just a few days.
Nevertheless, I'm posting it for review now as I keep working. Given the
size and age of the patch, I would appreciate any comments, no matter
how nitpicky.
Thanks.
-- Abhijit
P.S. There are some brokennesses, marked with XXXes in the code. See
also /messages/by-id/20081101000154.GO27872@fune
and bmi-perf-test.tar.gz in particular.
Attachments:
bmi.difftext/x-diff; charset=us-asciiDownload
diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile
index 0e267eb..bc46939 100644
--- a/contrib/pageinspect/Makefile
+++ b/contrib/pageinspect/Makefile
@@ -1,7 +1,7 @@
# contrib/pageinspect/Makefile
MODULE_big = pageinspect
-OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o
+OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o bmfuncs.o
EXTENSION = pageinspect
DATA = pageinspect--1.1.sql pageinspect--1.0--1.1.sql \
diff --git a/contrib/pageinspect/bmfuncs.c b/contrib/pageinspect/bmfuncs.c
new file mode 100644
index 0000000..ef480d6
--- /dev/null
+++ b/contrib/pageinspect/bmfuncs.c
@@ -0,0 +1,464 @@
+/*
+ * bmfuncs.c
+ * Functions to investigate bitmap index pages.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/pageinspect/bmfuncs.c
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/bitmap.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_am.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+
+/*
+ * SQL-callable entry points defined in this file.  Each one is declared
+ * here and then registered with PG_FUNCTION_INFO_V1 below.
+ */
+extern Datum bm_metap(PG_FUNCTION_ARGS);
+extern Datum bm_page_headers(PG_FUNCTION_ARGS);
+extern Datum bm_lov_page_stats(PG_FUNCTION_ARGS);
+extern Datum bm_bmv_page_stats(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(bm_metap);
+PG_FUNCTION_INFO_V1(bm_page_headers);
+PG_FUNCTION_INFO_V1(bm_lov_page_stats);
+PG_FUNCTION_INFO_V1(bm_bmv_page_stats);
+
+#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
+#define IS_BITMAP(r) ((r)->rd_rel->relam == BITMAP_AM_OID)
+
+#define CHECK_PAGE_OFFSET_RANGE(pg, offnum) { \
+ if ( !(FirstOffsetNumber <= (offnum) && \
+ (offnum) <= PageGetMaxOffsetNumber(pg)) ) \
+ elog(ERROR, "page offset number out of range"); }
+
+/* note: BlockNumber is unsigned, hence can't be negative */
+#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \
+ if ( RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno) ) \
+ elog(ERROR, "block number %d out of range", blkno); }
+
+/*
+ * cross-call data structure for page headers SRF
+ *
+ * bm_page_headers() allocates one of these on its first call and keeps it
+ * in the FuncCallContext's user_fctx for the following calls.
+ */
+struct bm_page_headers_args
+{
+ Relation rel; /* the relation, opened with AccessShareLock on the first call */
+ uint32 blockNum; /* next block to report; starts at 0 */
+};
+
+/*-------------------------------------------------------
+ * bm_page_headers()
+ *
+ * Get the page headers of all the pages of the index
+ *
+ * Set-returning function: every call reports one block, starting with
+ * block 0, as (block number, page type, page size, pd_lower, pd_upper,
+ * pd_special, free space).  Block 0 is reported as META; any other page
+ * whose pd_special equals the page size (i.e. which has no special space)
+ * is reported as LOV, and all remaining pages as BMV.
+ *
+ * Raises an error if the caller is not a superuser or if the relation is
+ * not a bitmap index.
+ *
+ * NOTE(review): the relation is only closed on the call that returns
+ * "done"; if the executor stops fetching rows early this close is never
+ * reached -- confirm whether that needs handling.
+ *
+ * Usage: SELECT * FROM bm_page_headers('bm_idx');
+ *-------------------------------------------------------
+ */
+
+Datum
+bm_page_headers(PG_FUNCTION_ARGS)
+{
+    FuncCallContext *fctx;
+    struct bm_page_headers_args *uargs;
+
+    if (!superuser())
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 (errmsg("must be superuser to use pageinspect functions"))));
+
+    if (SRF_IS_FIRSTCALL())
+    {
+        text       *relname = PG_GETARG_TEXT_P(0);
+        RangeVar   *relrv;
+        Relation    rel;
+        TupleDesc   tupleDesc;
+        MemoryContext mctx;
+
+        fctx = SRF_FIRSTCALL_INIT();
+
+        relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+        rel = relation_openrv(relrv, AccessShareLock);
+
+        /*
+         * Validate the relation once, up front.  Doing this inside the
+         * per-call branch would skip the check altogether for a relation
+         * with zero blocks, and repeat it needlessly for every block
+         * otherwise.
+         */
+        if (!IS_INDEX(rel) || !IS_BITMAP(rel))
+            elog(ERROR, "relation \"%s\" is not a bitmap index",
+                 RelationGetRelationName(rel));
+
+        /* everything set up below must survive until the last call */
+        mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+
+        if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
+            elog(ERROR, "return type must be a row type");
+
+        uargs = palloc(sizeof(struct bm_page_headers_args));
+        uargs->rel = rel;
+        uargs->blockNum = 0;
+
+        fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
+        fctx->user_fctx = uargs;
+        fctx->max_calls = RelationGetNumberOfBlocks(rel);
+
+        MemoryContextSwitchTo(mctx);
+    }
+
+    fctx = SRF_PERCALL_SETUP();
+    uargs = fctx->user_fctx;
+
+    if (fctx->call_cntr < fctx->max_calls)
+    {
+        char       *values[7];
+        const char *page_type;
+        Buffer      buffer;
+        Page        page;
+        PageHeader  phdr;
+        HeapTuple   tuple;
+        Datum       result;
+        uint32      page_size;
+        uint32      free_size;
+        int         j = 0;
+
+        /* the relation may have shrunk since max_calls was computed */
+        CHECK_RELATION_BLOCK_RANGE(uargs->rel, uargs->blockNum);
+
+        /* hold a share lock while looking at the page contents */
+        buffer = ReadBuffer(uargs->rel, uargs->blockNum);
+        LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+        page = BufferGetPage(buffer);
+        phdr = (PageHeader) page;
+        page_size = PageGetPageSize(page);
+        free_size = PageGetFreeSpace(page);
+
+        if (uargs->blockNum == 0)
+            page_type = "META";
+        else if (page_size == phdr->pd_special)
+            page_type = "LOV";
+        else
+            page_type = "BMV";
+
+        /* the order below must match the function's OUT columns */
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%u", uargs->blockNum);
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%s", page_type);
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%u", page_size);
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%d", phdr->pd_lower);
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%d", phdr->pd_upper);
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%d", phdr->pd_special);
+        values[j] = palloc(32);
+        snprintf(values[j++], 32, "%u", free_size);
+
+        UnlockReleaseBuffer(buffer);
+
+        ++uargs->blockNum;
+
+        tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
+        result = HeapTupleGetDatum(tuple);
+
+        SRF_RETURN_NEXT(fctx, result);
+    }
+    else
+    {
+        relation_close(uargs->rel, AccessShareLock);
+        pfree(uargs);
+        SRF_RETURN_DONE(fctx);
+    }
+}
+
+/* ------------------------------------------------
+ * bm_metap()
+ *
+ * Get a bitmap index's meta-page information
+ *
+ * Returns a single row holding the bm_lov_heapId, bm_lov_indexId and
+ * bm_last_vmi_page fields read from block BM_METAPAGE of the index.
+ *
+ * Raises an error if the caller is not a superuser or if the relation is
+ * not a bitmap index.
+ *
+ * Usage: SELECT * FROM bm_metap('t1_bmkey')
+ * ------------------------------------------------
+ */
+
+Datum
+bm_metap(PG_FUNCTION_ARGS)
+{
+    text       *relname = PG_GETARG_TEXT_P(0);
+    RangeVar   *relrv;
+    Relation    rel;
+    TupleDesc   tupleDesc;
+    Buffer      buffer;
+    BMMetaPage  metad;
+    HeapTuple   tuple;
+    Datum       result;
+    char       *values[3];
+    int         j = 0;
+
+    if (!superuser())
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 (errmsg("must be superuser to use pageinspect functions"))));
+
+    if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
+        elog(ERROR, "return type must be a row type");
+
+    relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+    rel = relation_openrv(relrv, AccessShareLock);
+
+    if (!IS_INDEX(rel) || !IS_BITMAP(rel))
+        elog(ERROR, "relation \"%s\" is not a bitmap index",
+             RelationGetRelationName(rel));
+
+    /* hold a share lock while looking at the meta page contents */
+    buffer = ReadBuffer(rel, BM_METAPAGE);
+    LockBuffer(buffer, BUFFER_LOCK_SHARE);
+    metad = (BMMetaPage) PageGetContents(BufferGetPage(buffer));
+
+    /*
+     * The first two values are returned in columns of type oid, so they
+     * are formatted as unsigned numbers.
+     * NOTE(review): this presumes both fields are declared as Oid in
+     * access/bitmap.h -- confirm.
+     */
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", metad->bm_lov_heapId);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", metad->bm_lov_indexId);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%d", metad->bm_last_vmi_page);
+
+    /* the strings are private copies, so the page can be let go now */
+    UnlockReleaseBuffer(buffer);
+
+    tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
+                                   values);
+    result = HeapTupleGetDatum(tuple);
+
+    relation_close(rel, AccessShareLock);
+
+    PG_RETURN_DATUM(result);
+}
+
+/*
+ * structure for statistics regarding a single LOV page
+ *
+ * Filled in by bm_lov_page_stats() and returned to the caller as one row.
+ */
+
+typedef struct BMLOVPageStat
+{
+ uint32 blkno; /* block number that was examined */
+ uint32 page_size; /* result of PageGetPageSize() */
+ uint32 free_size; /* result of PageGetFreeSpace() */
+ uint32 max_avail; /* set to 0 and never updated by bm_lov_page_stats() */
+
+ uint32 live_items; /* items whose line pointer is not marked dead */
+ uint32 dead_items; /* items whose line pointer is marked dead */
+ uint32 avg_item_size; /* summed item size / number of items; 0 if no items */
+} BMLOVPageStat;
+
+/* -----------------------------------------------
+ * bm_lov_page_stats()
+ *
+ * Report statistics about one LOV page of a bitmap index: block number,
+ * page size, free space, max_avail (always reported as 0), the number of
+ * live and dead items and the average item size.
+ *
+ * A page is accepted as a LOV page when its pd_special equals the page
+ * size, i.e. when it has no special space.  The meta page, blocks beyond
+ * the end of the relation and pages with special space are rejected with
+ * an error, as are non-superusers and relations that are not bitmap
+ * indexes.
+ *
+ * Usage: SELECT * FROM bm_lov_page_stats('bm_idx', blkno);
+ * -----------------------------------------------
+ */
+
+Datum
+bm_lov_page_stats(PG_FUNCTION_ARGS)
+{
+    text       *relname = PG_GETARG_TEXT_P(0);
+    uint32      blkno = PG_GETARG_UINT32(1);
+    RangeVar   *relrv;
+    Relation    rel;
+    TupleDesc   tupleDesc;
+    Buffer      buffer;
+    Page        page;
+    PageHeader  phdr;
+    BMLOVPageStat stat;
+    OffsetNumber maxoff;
+    OffsetNumber off;
+    uint32      nitems;
+    int         item_size = 0;
+    char       *values[7];
+    int         j = 0;
+    HeapTuple   tuple;
+    Datum       result;
+
+    if (!superuser())
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 (errmsg("must be superuser to use pageinspect functions"))));
+
+    if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
+        elog(ERROR, "return type must be a row type");
+
+    relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+    rel = relation_openrv(relrv, AccessShareLock);
+
+    if (!IS_INDEX(rel) || !IS_BITMAP(rel))
+        elog(ERROR, "relation \"%s\" is not a bitmap index",
+             RelationGetRelationName(rel));
+
+    if (blkno == BM_METAPAGE)
+        elog(ERROR, "block %d is a meta page", BM_METAPAGE);
+
+    CHECK_RELATION_BLOCK_RANGE(rel, blkno);
+
+    /* hold a share lock while looking at the page contents */
+    buffer = ReadBuffer(rel, blkno);
+    LockBuffer(buffer, BUFFER_LOCK_SHARE);
+    page = BufferGetPage(buffer);
+    phdr = (PageHeader) page;
+
+    /* Initialise the data to be returned */
+    stat.blkno = blkno;
+    stat.page_size = PageGetPageSize(page);
+    stat.free_size = PageGetFreeSpace(page);
+    stat.max_avail = stat.live_items = stat.dead_items = stat.avg_item_size = 0;
+
+    /* LOV pages are recognised by the absence of special space */
+    if (phdr->pd_special != stat.page_size)
+        elog(ERROR, "block %u is not a LOV page", blkno);
+
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    /*
+     * count live and dead items, and add up their sizes
+     *
+     * NOTE(review): every item is read as an IndexTuple; confirm that this
+     * matches what the access method really stores on LOV pages.
+     */
+    for (off = FirstOffsetNumber; off <= maxoff; ++off)
+    {
+        ItemId      id = PageGetItemId(page, off);
+        IndexTuple  itup = (IndexTuple) PageGetItem(page, id);
+
+        item_size += IndexTupleSize(itup);
+
+        if (ItemIdIsDead(id))
+            stat.dead_items++;
+        else
+            stat.live_items++;
+    }
+
+    /* guard against dividing by zero on a page without items */
+    nitems = stat.live_items + stat.dead_items;
+    if (nitems > 0)
+        stat.avg_item_size = item_size / nitems;
+
+    /* all page accesses are done */
+    UnlockReleaseBuffer(buffer);
+
+    /* the order below must match the function's OUT columns */
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.blkno);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.page_size);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.free_size);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.max_avail);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.live_items);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.dead_items);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.avg_item_size);
+
+    tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values);
+    result = HeapTupleGetDatum(tuple);
+
+    relation_close(rel, AccessShareLock);
+
+    PG_RETURN_DATUM(result);
+}
+
+/*
+ * structure for statistics regarding a single bitmap page
+ *
+ * Filled in by bm_bmv_page_stats(); the fields under "opaque data" are
+ * copied from the BMPageOpaque found in the page's special space.
+ */
+
+typedef struct BMBMVPageStat
+{
+ uint32 blkno; /* block number that was examined */
+ uint32 page_size; /* result of PageGetPageSize() */
+ uint32 free_size; /* result of PageGetFreeSpace() */
+
+ /* opaque data */
+ uint16 bm_hrl_words_used; /* the number of words used */
+ BlockNumber bm_bitmap_next; /* the next page for this bitmap */
+ uint64 bm_last_tid_location; /* the tid location for the last bit in this page */
+ uint16 bm_page_id; /* bitmap index identifier */
+
+} BMBMVPageStat;
+
+/* -----------------------------------------------
+ * bm_bmv_page_stats()
+ *
+ * Report statistics about one bitmap vector (BMV) page of a bitmap
+ * index: block number, page size, free space and the four fields of the
+ * BMPageOpaque stored in the page's special space.
+ *
+ * A page is accepted as a BMV page when it has special space, i.e. when
+ * its pd_special differs from the page size.  The meta page, blocks
+ * beyond the end of the relation and pages without special space are
+ * rejected with an error, as are non-superusers and relations that are
+ * not bitmap indexes.
+ *
+ * Usage: SELECT * FROM bm_bmv_page_stats('bm_idx', blkno);
+ * -----------------------------------------------
+ */
+
+Datum
+bm_bmv_page_stats(PG_FUNCTION_ARGS)
+{
+    text       *relname = PG_GETARG_TEXT_P(0);
+    uint32      blkno = PG_GETARG_UINT32(1);
+    RangeVar   *relrv;
+    Relation    rel;
+    TupleDesc   tupleDesc;
+    Buffer      buffer;
+    Page        page;
+    PageHeader  phdr;
+    BMPageOpaque opaque;
+    BMBMVPageStat stat;
+    char       *values[7];
+    int         j = 0;
+    HeapTuple   tuple;
+    Datum       result;
+
+    if (!superuser())
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 (errmsg("must be superuser to use pageinspect functions"))));
+
+    if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
+        elog(ERROR, "return type must be a row type");
+
+    relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+    rel = relation_openrv(relrv, AccessShareLock);
+
+    if (!IS_INDEX(rel) || !IS_BITMAP(rel))
+        elog(ERROR, "relation \"%s\" is not a bitmap index",
+             RelationGetRelationName(rel));
+
+    if (blkno == BM_METAPAGE)
+        elog(ERROR, "block %d is a meta page", BM_METAPAGE);
+
+    CHECK_RELATION_BLOCK_RANGE(rel, blkno);
+
+    /* hold a share lock while looking at the page contents */
+    buffer = ReadBuffer(rel, blkno);
+    LockBuffer(buffer, BUFFER_LOCK_SHARE);
+    page = BufferGetPage(buffer);
+    phdr = (PageHeader) page;
+
+    /* Initialise the data to be returned */
+    stat.blkno = blkno;
+    stat.page_size = PageGetPageSize(page);
+    stat.free_size = PageGetFreeSpace(page);
+
+    /* without special space there is no opaque data to look at */
+    if (phdr->pd_special == stat.page_size)
+        elog(ERROR, "block %u is not a BMV page", blkno);
+
+    /* only now is it known that the special pointer lies inside the page */
+    opaque = (BMPageOpaque) PageGetSpecialPointer(page);
+
+    stat.bm_hrl_words_used = opaque->bm_hrl_words_used;
+    stat.bm_bitmap_next = opaque->bm_bitmap_next;
+    stat.bm_last_tid_location = opaque->bm_last_tid_location;
+    stat.bm_page_id = opaque->bm_page_id;
+
+    /* everything needed has been copied out of the page */
+    UnlockReleaseBuffer(buffer);
+
+    /* the order below must match the function's OUT columns */
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.blkno);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.page_size);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%u", stat.free_size);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%d", stat.bm_hrl_words_used);
+    values[j] = palloc(32);
+    /*
+     * Deliberately left as %d: the value is returned in an int4 column.
+     * NOTE(review): presumably an invalid "next" block then shows up as a
+     * negative number instead of failing integer input -- confirm.
+     */
+    snprintf(values[j++], 32, "%d", stat.bm_bitmap_next);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, UINT64_FORMAT, stat.bm_last_tid_location);
+    values[j] = palloc(32);
+    snprintf(values[j++], 32, "%d", stat.bm_page_id);
+
+    tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values);
+    result = HeapTupleGetDatum(tuple);
+
+    relation_close(rel, AccessShareLock);
+
+    PG_RETURN_DATUM(result);
+}
diff --git a/contrib/pageinspect/pageinspect--1.1.sql b/contrib/pageinspect/pageinspect--1.1.sql
index 22a47d5..1eb83f5 100644
--- a/contrib/pageinspect/pageinspect--1.1.sql
+++ b/contrib/pageinspect/pageinspect--1.1.sql
@@ -105,3 +105,56 @@ CREATE FUNCTION fsm_page_contents(IN page bytea)
RETURNS text
AS 'MODULE_PATHNAME', 'fsm_page_contents'
LANGUAGE C STRICT;
+
+--
+-- bm_metap()
+--
+-- Returns one row with three values read from the meta page of the
+-- bitmap index named by relname.
+--
+CREATE OR REPLACE FUNCTION bm_metap(IN relname text,
+ OUT lov_heap_oid oid,
+ OUT lov_index_oid oid,
+ OUT lov_last_page int4)
+AS 'MODULE_PATHNAME', 'bm_metap'
+LANGUAGE C STRICT;
+
+--
+-- bm_page_headers()
+--
+-- Returns one row per block of the bitmap index named by relname.  The C
+-- implementation uses the set-returning function protocol, so the result
+-- has to be declared as a set of rows.
+--
+CREATE OR REPLACE FUNCTION bm_page_headers(IN relname text,
+ OUT block_no int4,
+ OUT page_type char(16),
+ OUT page_size int4,
+ OUT pd_lower int4,
+ OUT pd_upper int4,
+ OUT pd_special int4,
+ OUT free_size int4
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'bm_page_headers'
+LANGUAGE C STRICT;
+
+--
+-- bm_lov_page_stats()
+--
+-- Returns one row of statistics for block blkno of the bitmap index named
+-- by relname; the C function raises an error if that block is not a LOV
+-- page.
+--
+CREATE OR REPLACE FUNCTION bm_lov_page_stats(IN relname text, IN blkno int4,
+ OUT block_no int4,
+ OUT page_size int4,
+ OUT free_size int4,
+ OUT max_avail int4,
+ OUT live_items int4,
+ OUT dead_items int4,
+ OUT avg_item_size int4)
+AS 'MODULE_PATHNAME', 'bm_lov_page_stats'
+LANGUAGE C STRICT;
+
+--
+-- bm_bmv_page_stats()
+--
+-- Returns one row of statistics for block blkno of the bitmap index named
+-- by relname; the C function raises an error if that block is not a BMV
+-- page.
+--
+CREATE OR REPLACE FUNCTION bm_bmv_page_stats(IN relname text, IN blkno int4,
+ OUT block_no int4,
+ OUT page_size int4,
+ OUT free_size int4,
+ OUT bm_hrl_words_used int4,
+ OUT bm_bitmap_next int4,
+ OUT bm_last_tid_location int8,
+ OUT bm_page_id int4
+) AS 'MODULE_PATHNAME', 'bm_bmv_page_stats'
+LANGUAGE C STRICT;
diff --git a/contrib/pg_xlogdump/.gitignore b/contrib/pg_xlogdump/.gitignore
index 71f8531..c4d8ef3 100644
--- a/contrib/pg_xlogdump/.gitignore
+++ b/contrib/pg_xlogdump/.gitignore
@@ -1,5 +1,6 @@
/pg_xlogdump
# Source files copied from src/backend/access/
+/bitmapdesc.c
/clogdesc.c
/dbasedesc.c
/gindesc.c
diff --git a/contrib/pg_xlogdump/rmgrdesc.c b/contrib/pg_xlogdump/rmgrdesc.c
index 13ab745..2435c0e 100644
--- a/contrib/pg_xlogdump/rmgrdesc.c
+++ b/contrib/pg_xlogdump/rmgrdesc.c
@@ -8,6 +8,7 @@
#define FRONTEND 1
#include "postgres.h"
+#include "access/bitmap.h"
#include "access/clog.h"
#include "access/gin.h"
#include "access/gist_private.h"
diff --git a/doc/src/sgml/bitmap.sgml b/doc/src/sgml/bitmap.sgml
new file mode 100644
index 0000000..7dba545
--- /dev/null
+++ b/doc/src/sgml/bitmap.sgml
@@ -0,0 +1,136 @@
+<!-- doc/src/sgml/bitmap.sgml -->
+
+<chapter id="BMI">
+<title>Bitmap Indexes</title>
+
+ <indexterm>
+ <primary>index</primary>
+ <secondary>Bitmap</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>Bitmap</primary>
+ <see>index</see>
+ </indexterm>
+
+<sect1 id="bitmap-intro">
+ <title>Introduction</title>
+
+ <para>
+ An <emphasis>on-disk bitmap</emphasis> index (or <emphasis>bit
+ vector</emphasis> index) is an index type that is particularly
+ useful for read-mostly environments, such as data warehouses and
+ decision support systems.
+ </para>
+
+ <para>
+ In its simplest form, a bitmap index consists of a vector of bits
+ (bitmap) for each of the values of an index column. The size of
+ each bitmap is the number of rows of the table.</para>
+
+ <para>Bitmap indexes normally require less disk space than B-tree
+ indexes, because of particular compression
+ algorithms. The <productname>PostgreSQL</productname> implementation
+ uses the <firstterm>Hybrid Run-Length</firstterm> (HRL) encoding mechanism. The creation
+ of a bitmap index is generally faster than the creation of a
+ B-tree index.</para>
+
+ <para>
+ <productname>PostgreSQL</productname> bitmap indexes currently
+ support only equality queries. They can be defined on one or more
+ columns (see <xref linkend="indexes-multicolumn"> for more
+ information).
+ </para>
+
+</sect1>
+
+
+<sect1 id="bitmap-procons">
+ <title>Pros and Cons of Bitmap Indexes</title>
+
+ <para>One advantage of on-disk bitmap indexes is that they can locate
+ large numbers of matches at low cost: they can be really effective
+ on queries with multiple conditions in the <literal>WHERE</literal>
+ clause.</para>
+
+ <para>Another advantage is that, under certain conditions, bitmap
+ indexes occupy far less disk space than other index types; this
+ can mean much faster performance, e.g. if less I/O is needed because
+ the whole index relation can fit in RAM.</para>
+
+ <para>The "price to pay" for these advantages is that bitmap indexes
+ are not suited for <emphasis>high cardinality columns</emphasis>:
+ they work best when the indexed attribute(s) take only a small
+ number of distinct values, as in the case
+ of <literal>boolean</literal> columns, or more generally for
+ enumerative values of low cardinality.</para>
+
+ <para>To clarify, consider the following simple example. On a table
+ having one row for each person who lives in a certain country,
+ good candidates for bitmap indexes would be the attributes
+ describing age, gender, state, etc.</para>
+
+</sect1>
+
+<sect1 id="bitmap-internals">
+ <title>Internals</title>
+
+<para>On-disk Bitmap indexes are logically organised using three data structures:</para>
+
+ <itemizedlist>
+ <listitem>
+ <para>a list of values (<literal>LOV</literal>), which contains the distinct values
+ of an attribute (or more attributes in case of a multi-column index)
+ for a particular table;</para>
+ </listitem>
+ <listitem>
+ <para>an index for the list of values structure (implemented using a B-tree index);</para>
+ </listitem>
+ <listitem>
+ <para>the real bitmap, a vector of '<literal>n</literal>' bits for each value,
+ where '<literal>n</literal>' is the number of rows (cardinality) of the indexed table.</para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ During the creation of the index, the relation is scanned and all the distinct values
+ of the column(s) of the index are inserted in the <literal>LOV</literal> table. The associated
+ B-tree index is used for fast retrieval of the distinct values.
+ </para>
+
+ <para>
+ The interesting part is how the index points to the rows - which is the ultimate goal of an index.
+ Every value in the index has a corresponding vector of bits. Every bit represents a tuple
+ in the table. If the bit for a particular tuple is set to <literal>0</literal>, that means
+ that the tuple does not match the value associated with the bitmap. If, on the other hand,
+ the bit is set to <literal>1</literal>, the row matches that value.
+ </para>
+
+ <para>
+ Bitmap indexes in <productname>PostgreSQL</productname> currently support compression
+ of the bit vectors, using an encoding mechanism known as <emphasis>Hybrid Run-Length</emphasis>.
+ The outcomes of the compression algorithm depend on several factors, including the number
+ of distinct values of the index and their distribution along the relation. In general,
+ a low number of distinct values produces better results in terms of disk usage
+ compared to B-tree indexes.
+ </para>
+
+</sect1>
+
+<sect1 id="bitmap-create">
+ <title>How to create a Bitmap index</title>
+
+ <para>
+ A Bitmap index can be created using the <xref linkend="sql-createindex" endterm="sql-createindex-title">
+ command, by specifying the <literal>bitmap</literal> method.
+ </para>
+
+ <para>
+ For instance, it is possible to create a bitmap index on the <literal>players</literal> table for the
+ <literal>position</literal> attribute using the following command:
+ <programlisting>
+CREATE INDEX idx_bm_position ON players USING bitmap (position);
+ </programlisting>
+ </para>
+
+</sect1>
+</chapter>
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index 914090d..6fbd8a8 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -94,6 +94,7 @@
<!ENTITY protocol SYSTEM "protocol.sgml">
<!ENTITY sources SYSTEM "sources.sgml">
<!ENTITY storage SYSTEM "storage.sgml">
+<!ENTITY bitmap SYSTEM "bitmap.sgml">
<!-- contrib information -->
<!ENTITY contrib SYSTEM "contrib.sgml">
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index b1c8f22..2326257 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -116,8 +116,8 @@ CREATE INDEX test1_id_index ON test1 (id);
<para>
<productname>PostgreSQL</productname> provides several index types:
- B-tree, Hash, GiST, SP-GiST and GIN. Each index type uses a different
- algorithm that is best suited to different types of queries.
+ B-tree, Hash, GiST, SP-GiST, GIN, and Bitmap. Each index type uses a
+ different algorithm that is best suited to different types of queries.
By default, the <command>CREATE INDEX</command> command creates
B-tree indexes, which fit the most common situations.
</para>
@@ -318,6 +318,31 @@ SELECT * FROM places ORDER BY location <-> point '(101,456)' LIMIT 10;
classes are available in the <literal>contrib</> collection or as separate
projects. For more information see <xref linkend="GIN">.
</para>
+
+ <para>
+ <indexterm>
+ <primary>index</primary>
+ <secondary>Bitmap</secondary>
+ </indexterm>
+ <indexterm>
+ <primary>Bitmap</primary>
+ <see>index</see>
+ </indexterm>
+
+ An on-disk bitmap index (or bit vector index) is an index type that
+ is particularly useful for read-mostly environments such as data warehouses
+ and decision support systems.
+ The advantage of on-disk bitmap indexes is that they can locate large
+ numbers of matches at low cost. They can be really effective on queries
+ with multiple conditions in the <literal>WHERE</literal> clause.
+ </para>
+
+ <para>
+ <productname>PostgreSQL</productname> bitmap indexes currently support only equality queries.
+ They can be defined on one or more columns (see <xref linkend="indexes-multicolumn"> for
+ more information). For more information see <xref linkend="BMI">.
+ </para>
+
</sect1>
@@ -353,7 +378,7 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor);
</para>
<para>
- Currently, only the B-tree, GiST and GIN index types support multicolumn
+ Currently, only the B-tree, GiST, GIN and Bitmap index types support multicolumn
indexes. Up to 32 columns can be specified. (This limit can be
altered when building <productname>PostgreSQL</productname>; see the
file <filename>pg_config_manual.h</filename>.)
@@ -411,6 +436,7 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor);
<xref linkend="indexes-bitmap-scans"> for some discussion of the
merits of different index configurations.
</para>
+
</sect1>
diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml
index 15e4ef6..f621eb5 100644
--- a/doc/src/sgml/postgres.sgml
+++ b/doc/src/sgml/postgres.sgml
@@ -246,6 +246,7 @@
&gist;
&spgist;
&gin;
+ &bitmap;
&storage;
&bki;
&planstats;
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 01faa3a..28415a7 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -57,8 +57,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ <replaceable class="parameter">name</
<para>
<productname>PostgreSQL</productname> provides the index methods
- B-tree, hash, GiST, SP-GiST, and GIN. Users can also define their own index
- methods, but that is fairly complicated.
+ B-tree, hash, GiST, SP-GiST, GIN, and Bitmap. Users can also define their
+ own index methods, but that is fairly complicated.
</para>
<para>
@@ -154,8 +154,9 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ <replaceable class="parameter">name</
<para>
The name of the index method to be used. Choices are
<literal>btree</literal>, <literal>hash</literal>,
- <literal>gist</literal>, <literal>spgist</> and <literal>gin</>.
- The default method is <literal>btree</literal>.
+ <literal>gist</literal>, <literal>spgist</>, <literal>gin</>, and
+ <literal>bitmap</literal>. The default method is
+ <literal>btree</literal>.
</para>
</listitem>
</varlistentry>
@@ -479,7 +480,7 @@ Indexes:
</caution>
<para>
- Currently, only the B-tree, GiST and GIN index methods support
+ Currently, only the B-tree, GiST, GIN and Bitmap index methods support
multicolumn indexes. Up to 32 fields can be specified by default.
(This limit can be altered when building
<productname>PostgreSQL</productname>.) Only B-tree currently
@@ -609,6 +610,20 @@ CREATE INDEX code_idx ON films (code) TABLESPACE indexspace;
</para>
<para>
+ To create a Bitmap index on the column <literal>media_type</literal> in
+ the table <literal>films</literal>:
+<programlisting>
+CREATE INDEX idx_bm_media_type ON films USING bitmap (media_type);
+</programlisting>
+ </para>
+
+<!--
+<comment>
+Is this example correct?
+</comment>
+-->
+
+ <para>
To create a GiST index on a point attribute so that we
can efficiently use box operators on the result of the
conversion function:
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index c32088f..f0e23d6 100644
--- a/src/backend/access/Makefile
+++ b/src/backend/access/Makefile
@@ -8,6 +8,6 @@ subdir = src/backend/access
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam
+SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam bitmap
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/bitmap/Makefile b/src/backend/access/bitmap/Makefile
new file mode 100644
index 0000000..8cb7770
--- /dev/null
+++ b/src/backend/access/bitmap/Makefile
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/bitmap
+#
+# IDENTIFICATION
+# src/backend/access/bitmap/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/bitmap
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = bitmap.o bitmapattutil.o bitmapinsert.o bitmappages.o \
+ bitmapsearch.o bitmaputil.o bitmapxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/bitmap/README b/src/backend/access/bitmap/README
new file mode 100644
index 0000000..b90f833
--- /dev/null
+++ b/src/backend/access/bitmap/README
@@ -0,0 +1,138 @@
+src/backend/access/bitmap/README
+
+This directory contains an implementation of an on-disk bitmap index.
+
+An on-disk bitmap index consists of bitmap vectors, one for each
+distinct key value. Each vector is a (compressed) map of locations in the
+underlying heap where the key value occurs.
+
+The advantage of on-disk bitmap indexes is that they can locate large numbers
+of matches at low cost. When compressed, they are also very small. For
+low-cardinality data (less than 50,000 distinct values), on-disk bitmap
+indexes are much less expensive to construct than b-trees.
+
+Hybrid Run-Length (HRL) equality encoding bitmap index
+------------------------------------------------------
+
+HRL is the bitmap encoding mechanism used in this implementation. In HRL,
+each vector is represented in two sections: the header section and the
+content section. The header section contains bits, each of which
+corresponds to a word in the content section. If a bit in the header
+section is 1, then the corresponding word in the content section is a
+compressed word; if the bit is 0, then the corresponding word is not a
+compressed word.
+
+For a compressed word in the content section, the first bit in this word
+indicates whether 1s or 0s are compressed. The rest of the bits represent the
+value of "<the number of bits>/<word size>".
+
+Consider this example. Assume that there is an uncompressed bitmap vector:
+
+ 00000000 00000000 01000000 11111111 11111111 11111111
+
+If the size of a word is set to 8, then an HRL compressed form for
+this bitmap vector is as follows:
+
+ header section: 101
+ content section: 00000010 01000000 10000011
+
+Consider the first word in the content section "00000010". The header
+section tells us that this is a compressed word. As the word represents
+the number two, this word tells us that it compresses 16 bits
+(i.e., 2 * 8 = 16). As the first bit is zero, it is compressing zeroed bits.
+
+The second word is uncompressed.
+
+The third word is compressed and its first bit is set to one. As such
+it compresses ones. As 0011 evaluates to three, this compressed word
+represents 24 bits of ones (3 * 8 = 24).
+
+The insertion algorithm
+-----------------------
+
+The distinct values are stored as an array of "LOV items" on "LOV pages"
+(LOV stands for List of Values). LOV items also store some vector meta data.
+To deal with high-cardinality cases, we also create an internal heap and a
+btree index on this heap to speed up searches on distinct values. This
+internal heap stores the distinct values and their LOV items in LOV pages,
+which can be retrieved through the block numbers and the offset numbers. In
+other words, the heap has "<number of attributes to be indexed> + 2"
+attributes (one for the block number, the other for the offset number). The
+btree index is built on this heap with the key as attributes to be indexed.
+
+The LOV item for NULL keys is the first LOV item of the first LOV page.
+
+We do not store TIDs in this bitmap index implementation. The reason is
+that TIDs take too much space. Instead, we convert them to a 64 bit number
+as follows:
+
+
+ ((uint64)ItemPointerGetBlockNumber(TID) * MaxNumHeapTuples)
+ + ((uint64)ItemPointerGetOffsetNumber(TID));
+
+where MaxNumHeapTuples represents the maximum number of tuples that
+can be stored on a heap page. This TID location is used as the index position
+of this bit in its bitmap vector.
+
+Each insertion will affect only one bitmap vector. When inserting a
+new tuple into a bitmap index, we search through the internal heap to
+obtain the block number and the offset number of the LOV page that
+contains the given value. From there, we obtain an exclusive lock on
+that LOV page, and try to insert this new bit into the right bitmap
+vector. The index position for this bit is calculated through the
+formula for the tid location above. There are the following three
+cases:
+
+(1) This bit will only affect the last two words. In this case, we
+ simply update the LOV item, which stores this information.
+(2) This bit will require writing words to the last bitmap page, and
+ the last bitmap page has enough space to store these words. In
+ this case, we obtain an exclusive lock on the last bitmap page,
+ and write those words to the page.
+(3) This bit will require writing words to the last bitmap page, and
+ the last bitmap page does not have enough space for these new words.
+ In this case, we create a new bitmap page, and insert these new
+ words to this new bitmap page. We also update the previous
+ bitmap page and the LOV item.
+
+There is a fourth case -- the TID location might be in the middle of a
+vector. We deal with that specifically in the next section.
+
+When building a bitmap index, we also maintain an in-memory buffer to
+store a bunch of tid locations for each distinct value before writing
+them to bitmap vectors in batches. There are two advantages of this
+approach:
+
+(1) The bitmap pages for a bitmap vector are likely to be allocated
+ sequentially.
+(2) This can avoid visiting different bitmap pages for each insert
+ in a sequence of inserts, which can produce a lot of IOs when
+ the cardinality of attributes is high.
+
+Handling tuples that are inserted in the middle of the heap
+-----------------------------------------------------------
+
+When a new tuple is inserted into the middle of the heap, a bit needs
+to be updated in the middle of a bitmap vector. This is called an
+in-place bit update. Since the bitmap vector is compressed, this
+update may require us to convert one compressed word to 2-3 new
+words. Replacing the old compressed word with these new words may
+cause the current bitmap page to overflow. In this case, we create a
+new bitmap page to store overflow words, and insert this page
+right after the current bitmap page.
+
+One limitation of this approach is that it may cause a lot of
+fragmentation in a bitmap vector when many tuples are inserted in the
+middle of the heap.
+
+TODO: Currently, we need to search a bitmap vector from the beginning
+to find the bit to be updated. One potential solution is to maintain a
+list of the first tid locations for all bitmap pages in a bitmap
+vector so that we can find the bitmap page that contains
+the bit to be updated without scanning from the beginning.
+
+Vacuum/Vacuum full
+------------------
+
+During VACUUM FULL, tuples that are re-organized in the heap are not
+inserted into the bitmap index. Instead, we REINDEX the bitmap index(s).
diff --git a/src/backend/access/bitmap/bitmap.c b/src/backend/access/bitmap/bitmap.c
new file mode 100644
index 0000000..c0dd2e2
--- /dev/null
+++ b/src/backend/access/bitmap/bitmap.c
@@ -0,0 +1,532 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmap.c
+ * Implementation of the Hybrid Run-Length (HRL) on-disk bitmap index.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmap.c
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/bitmap.h"
+#include "access/xact.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "nodes/tidbitmap.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "parser/parse_oper.h"
+#include "utils/memutils.h"
+#include "storage/bufmgr.h" /* for buffer manager functions */
+
+static void bmbuildCallback(Relation index, HeapTuple htup, Datum *attdata,
+ bool *nulls, bool tupleIsAlive, void *state);
+static void cleanup_pos(BMScanPosition pos);
+
+/*
+ * bmbuild() -- Build a new bitmap index over an existing heap.
+ *
+ * Arguments (fmgr): heap relation, index relation, IndexInfo.  Returns a
+ * palloc'd IndexBuildResult with the heap and index tuple counts.
+ */
+Datum
+bmbuild(PG_FUNCTION_ARGS)
+{
+    Relation    heap = (Relation) PG_GETARG_POINTER(0);
+    Relation    index = (Relation) PG_GETARG_POINTER(1);
+    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+    /* NOTE(review): nbtree tests XLogIsNeeded() here instead -- confirm. */
+    bool        use_wal = XLogArchivingActive() && RelationNeedsWAL(index);
+    BMBuildState bmstate;
+    IndexBuildResult *result;
+    double      reltuples;
+
+    /* A build must start from a completely empty index relation. */
+    if (RelationGetNumberOfBlocks(index) != 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_INDEX_CORRUPTED),
+                 errmsg("index \"%s\" already contains data",
+                        RelationGetRelationName(index))));
+
+    /* Set up the meta page and the first VMI, then the build state. */
+    _bitmap_init(index, use_wal);
+    _bitmap_init_buildstate(index, &bmstate, indexInfo);
+
+    /*
+     * Scan the heap; bmbuildCallback is invoked for every tuple and is
+     * handed 'bmstate' as its opaque state pointer.
+     */
+#ifdef DEBUG_BMI
+    elog(NOTICE, "[bmbuild] IndexBuildHeapScan PRE");
+#endif
+    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+                                   bmbuildCallback, (void *) &bmstate);
+#ifdef DEBUG_BMI
+    elog(NOTICE, "[bmbuild] IndexBuildHeapScan POST");
+#endif
+
+    /* Tear down the build state. */
+    _bitmap_cleanup_buildstate(index, &bmstate, indexInfo);
+
+    /*
+     * When the build was not WAL-logged, flush and fsync the LOV heap, the
+     * LOV index and the bitmap index itself.
+     */
+    if (!use_wal)
+    {
+        FlushRelationBuffers(bmstate.bm_lov_heap);
+        smgrimmedsync(bmstate.bm_lov_heap->rd_smgr, MAIN_FORKNUM);
+
+        FlushRelationBuffers(bmstate.bm_lov_index);
+        smgrimmedsync(bmstate.bm_lov_index->rd_smgr, MAIN_FORKNUM);
+
+        /* FlushRelationBuffers will have opened rd_smgr */
+        FlushRelationBuffers(index);
+        smgrimmedsync(index->rd_smgr, MAIN_FORKNUM);
+    }
+
+    /* Report the tuple counts back to the caller. */
+    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+    result->heap_tuples = reltuples;
+    result->index_tuples = bmstate.ituples;
+
+    PG_RETURN_POINTER(result);
+}
+
+/*
+ * bmbuildempty() -- build an empty bitmap index in the initialization fork
+ *
+ * Unlogged bitmap indexes are not implemented, so this always raises
+ * ERRCODE_FEATURE_NOT_SUPPORTED; the index argument is never examined.
+ */
+Datum
+bmbuildempty(PG_FUNCTION_ARGS)
+{
+    ereport(ERROR,
+            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+             errmsg("unlogged bitmap indexes are not supported")));
+    PG_RETURN_VOID();           /* not reached; keeps the compiler quiet */
+}
+
+/*
+ * bminsert() -- insert an index tuple into a bitmap index; the work is
+ * done by _bitmap_doinsert() and the result is always true.
+ */
+Datum
+bminsert(PG_FUNCTION_ARGS)
+{
+    Relation    index = (Relation) PG_GETARG_POINTER(0);
+    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
+    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
+    ItemPointer heapTid = (ItemPointer) PG_GETARG_POINTER(3);
+
+    _bitmap_doinsert(index, *heapTid, values, isnull);
+    PG_RETURN_BOOL(true);
+}
+
+/*
+ * bmgettuple() -- return the next tuple in a scan.
+ *
+ * Returns whatever _bitmap_first() or _bitmap_next() reports for the
+ * requested direction 'dir'.
+ */
+Datum
+bmgettuple(PG_FUNCTION_ARGS)
+{
+    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+    BMScanOpaque so = (BMScanOpaque) scan->opaque;
+    bool        started;
+
+    /*
+     * A scan counts as started only when a current position exists and
+     * is flagged valid; otherwise this call has to start it.
+     */
+    started = (so->bm_currPos && so->cur_pos_valid);
+    if (!started)
+        PG_RETURN_BOOL(_bitmap_first(scan, dir));
+    PG_RETURN_BOOL(_bitmap_next(scan, dir));
+}
+
+
+/*
+ * bmgetbitmap() -- gets all matching tuples, and adds them to a bitmap
+ *
+ * The whole result set is collected in one call by driving the scan
+ * forward from its start.
+ *
+ * Arguments (fmgr):
+ *  0: the index scan descriptor
+ *  1: the TIDBitmap that receives the matching heap TIDs
+ *
+ * Returns the number of TIDs added, as an int64.
+ */
+Datum
+bmgetbitmap(PG_FUNCTION_ARGS)
+{
+    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+    ItemPointer heapTid;
+    int64       nfound = 0;
+
+    /* Position on the first match; an empty scan adds nothing. */
+    if (!_bitmap_first(scan, ForwardScanDirection))
+        PG_RETURN_INT64(0);
+
+    /*
+     * After each successful _bitmap_first()/_bitmap_next() call the TID is
+     * read from scan->xs_ctup.t_self.  Add it to the bitmap (recheck flag
+     * false), count it, and advance until _bitmap_next() reports false.
+     */
+    do
+    {
+        heapTid = &scan->xs_ctup.t_self;
+        tbm_add_tuples(tbm, heapTid, 1, false);
+        nfound++;
+    } while (_bitmap_next(scan, ForwardScanDirection));
+
+    PG_RETURN_INT64(nfound);
+}
+
+
+/*
+ * bmbeginscan() -- start a scan on the bitmap index.
+ *
+ * Only the generic scan descriptor is built here; bmrescan() allocates
+ * scan->opaque when it finds it still NULL.
+ */
+Datum
+bmbeginscan(PG_FUNCTION_ARGS)
+{
+    Relation    index = (Relation) PG_GETARG_POINTER(0);
+    int         nkeys = PG_GETARG_INT32(1);
+    int         norderbys = PG_GETARG_INT32(2);
+
+    PG_RETURN_POINTER(RelationGetIndexScan(index, nkeys, norderbys));
+}
+
+/*
+ * bmrescan() -- restart a scan on the bitmap index: set up the opaque
+ * state if needed, reset both saved positions, install the scan keys.
+ */
+Datum
+bmrescan(PG_FUNCTION_ARGS)
+{
+    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);
+    BMScanOpaque so = (BMScanOpaque) scan->opaque;
+
+    /* so will be NULL if we were called via index_rescan() */
+    if (so == NULL)
+    {
+        so = (BMScanOpaque) palloc(sizeof(BMScanOpaqueData));
+        so->bm_currPos = NULL;
+        so->bm_markPos = NULL;
+        so->cur_pos_valid = false;
+        so->mark_pos_valid = false;
+        scan->opaque = so;
+    }
+
+    if (so->bm_currPos != NULL)
+    {
+        cleanup_pos(so->bm_currPos);
+        MemSet(so->bm_currPos, 0, sizeof(BMScanPositionData));
+        so->cur_pos_valid = false;
+    }
+    if (so->bm_markPos != NULL)
+    {
+        cleanup_pos(so->bm_markPos);
+        MemSet(so->bm_markPos, 0, sizeof(BMScanPositionData));
+        so->mark_pos_valid = false; /* the mark was wiped, so invalidate it */
+    }
+    /* reset the scan key */
+    if (scankey && scan->numberOfKeys > 0)
+        memmove(scan->keyData, scankey,
+                scan->numberOfKeys * sizeof(ScanKeyData));
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * bmendscan() -- close a scan.
+ *
+ * Releases what the current and marked positions hold via cleanup_pos()
+ * (the same helper bmrescan() uses), then frees the opaque scan state.
+ */
+Datum
+bmendscan(PG_FUNCTION_ARGS)
+{
+    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+    BMScanOpaque so = (BMScanOpaque) scan->opaque;
+
+    if (so == NULL)             /* bmrescan() never ran: nothing to free */
+        PG_RETURN_VOID();
+
+    /*
+     * cleanup_pos() skips a position whose nvec is 0 -- the state
+     * bmrescan() leaves behind after zeroing it -- so zeroed fields are
+     * never handed to _bitmap_cleanup_scanpos().
+     */
+    if (so->bm_currPos != NULL)
+    {
+        cleanup_pos(so->bm_currPos);
+        so->bm_currPos = NULL;
+    }
+
+    if (so->bm_markPos != NULL)
+    {
+        cleanup_pos(so->bm_markPos);
+        so->bm_markPos = NULL;
+    }
+
+    pfree(so);
+    scan->opaque = NULL;
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * bmmarkpos() -- save the current scan position in so->bm_markPos.
+ */
+Datum
+bmmarkpos(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ BMScanOpaque so = (BMScanOpaque) scan->opaque;
+ BMVector bmScanPos;
+ uint32 vectorNo;
+
+ /* forget the old mark: release the buffers it still holds */
+ if (so->mark_pos_valid)
+ {
+ /*
+ * release the buffers that have been stored for each
+ * related bitmap.
+ */
+ bmScanPos = so->bm_markPos->posvecs;
+
+ for (vectorNo=0; vectorNo < so->bm_markPos->nvec; vectorNo++)
+ {
+ if (BufferIsValid((bmScanPos[vectorNo]).bm_vmiBuffer))
+ {
+ ReleaseBuffer((bmScanPos[vectorNo]).bm_vmiBuffer);
+ (bmScanPos[vectorNo]).bm_vmiBuffer = InvalidBuffer;
+ }
+ }
+ so->mark_pos_valid = false;
+ }
+
+ if (so->cur_pos_valid)
+ {
+ uint32 size = sizeof(BMScanPositionData);
+ bool need_init = false;
+
+ /* first mark: allocate the position and one BMVectorData per vector */
+ if (so->bm_markPos == NULL)
+ {
+ so->bm_markPos = (BMScanPosition) palloc(size);
+ so->bm_markPos->posvecs =
+ (BMVector)palloc0(so->bm_currPos->nvec * sizeof(BMVectorData));
+ need_init = true;
+ }
+
+ bmScanPos = so->bm_currPos->posvecs;
+
+ for (vectorNo = 0; vectorNo < so->bm_currPos->nvec; vectorNo++)
+ {
+ BMVector p = &(so->bm_markPos->posvecs[vectorNo]);
+
+ if (BufferIsValid((bmScanPos[vectorNo]).bm_vmiBuffer))
+ IncrBufferRefCount((bmScanPos[vectorNo]).bm_vmiBuffer);
+
+ if (need_init) /* only right after the allocation above */
+ {
+ p->bm_batchWords =
+ (BMBatchWords *) palloc0(sizeof(BMBatchWords));
+ _bitmap_init_batchwords(p->bm_batchWords,
+ BM_NUM_OF_HRL_WORDS_PER_PAGE,
+ CurrentMemoryContext);
+ }
+ }
+
+ if (so->bm_currPos->nvec == 1)
+ {
+ so->bm_markPos->bm_batchWords =
+ so->bm_markPos->posvecs->bm_batchWords;
+ }
+ /* NOTE(review): shallow copies; they overwrite the pointers set above */
+ memcpy(so->bm_markPos->posvecs, bmScanPos,
+ so->bm_currPos->nvec *
+ sizeof(BMVectorData));
+ memcpy(so->bm_markPos, so->bm_currPos, size);
+
+ so->mark_pos_valid = true;
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * bmrestrpos() -- restore a scan to the last saved position.
+ *
+ * Unpins the buffers of the current position; then, if a mark is valid,
+ * pins the marked buffers and copies the mark over the current position.
+ */
+Datum
+bmrestrpos(PG_FUNCTION_ARGS)
+{
+    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+    BMScanOpaque so = (BMScanOpaque) scan->opaque;
+    BMVector    bmScanPos;
+    uint32      vectorNo;
+
+    if (so->cur_pos_valid)
+    {
+        /* Bound by the current position's own nvec: bm_markPos may be NULL. */
+        bmScanPos = so->bm_currPos->posvecs;
+        for (vectorNo = 0; vectorNo < so->bm_currPos->nvec; vectorNo++)
+        {
+            if (BufferIsValid(bmScanPos[vectorNo].bm_vmiBuffer))
+            {
+                ReleaseBuffer(bmScanPos[vectorNo].bm_vmiBuffer);
+                bmScanPos[vectorNo].bm_vmiBuffer = InvalidBuffer;
+            }
+        }
+        so->cur_pos_valid = false;
+    }
+
+    if (so->mark_pos_valid)
+    {
+        uint32      size = sizeof(BMScanPositionData);
+        uint32      nvec = so->bm_markPos->nvec;
+
+        /* No current position yet: allocate it the way bmmarkpos() does. */
+        if (so->bm_currPos == NULL)
+        {
+            so->bm_currPos = (BMScanPosition) palloc(size);
+            so->bm_currPos->posvecs =
+                (BMVector) palloc0(nvec * sizeof(BMVectorData));
+        }
+
+        /* Pin every buffer of the mark; the bound is the mark's nvec. */
+        bmScanPos = so->bm_markPos->posvecs;
+        for (vectorNo = 0; vectorNo < nvec; vectorNo++)
+        {
+            if (BufferIsValid(bmScanPos[vectorNo].bm_vmiBuffer))
+                IncrBufferRefCount(bmScanPos[vectorNo].bm_vmiBuffer);
+        }
+
+        /* NOTE(review): shallow copies; posvecs ends up shared -- confirm */
+        memcpy(so->bm_currPos->posvecs, bmScanPos, nvec * sizeof(BMVectorData));
+        memcpy(so->bm_currPos, so->bm_markPos, size);
+        so->cur_pos_valid = true;
+    }
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * bmbulkdelete() -- bulk delete index entries
+ *
+ * Runs _bitmap_vacuum() and then fills in the page and tuple counts.
+ * The caller's stats struct (argument 1) is reused when it is non-NULL,
+ * so what earlier calls stored in it is kept; a zeroed struct is
+ * allocated only when none was passed in.
+ */
+Datum
+bmbulkdelete(PG_FUNCTION_ARGS)
+{
+    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+    Relation    rel = info->index;
+    IndexBulkDeleteResult * volatile result =
+        (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+    IndexBulkDeleteCallback callback =
+        (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
+    void       *callback_state = (void *) PG_GETARG_POINTER(3);
+
+    /* allocate stats if first time through, else re-use existing struct */
+    if (result == NULL)
+        result = (IndexBulkDeleteResult *)
+            palloc0(sizeof(IndexBulkDeleteResult));
+
+    _bitmap_vacuum(info, result, callback, callback_state);
+
+    result->num_pages = RelationGetNumberOfBlocks(rel);
+    /* Since we re-build the index, set this to number of heap tuples. */
+    result->num_index_tuples = info->num_heap_tuples;
+    result->tuples_removed = 0;
+
+    PG_RETURN_POINTER(result);
+}
+
+/*
+ * bmvacuumcleanup() -- post-vacuum cleanup.
+ *
+ * Nothing is cleaned up; the statistics struct is (allocated if absent
+ * and) refreshed with the index's current size.
+ */
+Datum
+bmvacuumcleanup(PG_FUNCTION_ARGS)
+{
+    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+    IndexBulkDeleteResult *stats;
+
+    /* Reuse the caller's stats struct if given, else start from zeroes. */
+    stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+    if (stats == NULL)
+        stats = (IndexBulkDeleteResult *) palloc0(sizeof(*stats));
+
+    stats->num_pages = RelationGetNumberOfBlocks(info->index);
+    stats->pages_deleted = 0;
+    stats->pages_free = 0;
+    /* XXX: dodgy hack to shutup index_scan() and vacuum_index() */
+    stats->num_index_tuples = info->num_heap_tuples;
+
+    PG_RETURN_POINTER(stats);
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan: insert the tuple's TID and
+ * attribute values into the index being built and count it in ituples.
+ * tupleIsAlive is not consulted.
+ */
+static void
+bmbuildCallback(Relation index, HeapTuple htup, Datum *attdata,
+                bool *nulls, bool tupleIsAlive, void *state)
+{
+    BMBuildState *buildstate = (BMBuildState *) state;
+
+#ifdef DEBUG_BMI
+    elog(NOTICE, "[bmbuildCallback] BEGIN");
+#endif
+    _bitmap_buildinsert(index, htup->t_self, attdata, nulls, buildstate);
+    buildstate->ituples += 1;
+#ifdef DEBUG_BMI
+    elog(NOTICE, "[bmbuildCallback] END");
+#endif
+}
+
+/*
+ * cleanup_pos() -- release what a scan position holds.  A position with
+ * no vectors is left alone.  The batch words are released here only for
+ * nvec > 1, since _bitmap_cleanup_scanpos() handles the single-vector case.
+ */
+static void
+cleanup_pos(BMScanPosition pos)
+{
+    if (pos->nvec != 0)
+    {
+        if (pos->nvec > 1)
+            _bitmap_cleanup_batchwords(pos->bm_batchWords);
+        _bitmap_cleanup_scanpos(pos->posvecs, pos->nvec);
+    }
+}
diff --git a/src/backend/access/bitmap/bitmapattutil.c b/src/backend/access/bitmap/bitmapattutil.c
new file mode 100644
index 0000000..b6410d1
--- /dev/null
+++ b/src/backend/access/bitmap/bitmapattutil.c
@@ -0,0 +1,358 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmapattutil.c
+ * Defines the routines to maintain all distinct attribute values
+ * which are indexed in the on-disk bitmap index.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmapattutil.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/tupdesc.h"
+#include "access/bitmap.h"
+#include "access/nbtree.h"
+#include "access/xact.h"
+#include "nodes/execnodes.h"
+#include "nodes/primnodes.h"
+#include "nodes/makefuncs.h"
+#include "catalog/dependency.h"
+#include "catalog/heap.h"
+#include "catalog/index.h"
+#include "catalog/pg_type.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_namespace.h"
+#include "access/heapam.h"
+#include "optimizer/clauses.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/builtins.h"
+#include "commands/defrem.h"
+#include "commands/tablecmds.h"
+
+static TupleDesc _bitmap_create_lov_heapTupleDesc(Relation rel);
+
+/*
+ * _bitmap_create_lov_heapandindex() -- create a new heap relation and
+ * a btree index for the list of values (LOV) of the bitmap index 'rel'.
+ * The OIDs of both are returned through *lovHeapId and *lovIndexId.
+ */
+void
+_bitmap_create_lov_heapandindex(Relation rel, Oid *lovHeapId, Oid *lovIndexId)
+{
+ char lovHeapName[NAMEDATALEN];
+ char lovIndexName[NAMEDATALEN];
+ TupleDesc tupDesc;
+ IndexInfo *indexInfo;
+ ObjectAddress objAddr, referenced;
+ Oid *classObjectId;
+ Oid *collationObjectId;
+ Oid heapid;
+ Relation heapRel;
+ Oid indid;
+ int indattrs;
+ List *indexColNames;
+ int i;
+
+ /* derive the LOV heap and index names from the bitmap index's OID */
+ snprintf(lovHeapName, sizeof(lovHeapName),
+ "pg_bm_%u", RelationGetRelid(rel));
+ snprintf(lovIndexName, sizeof(lovIndexName),
+ "pg_bm_%u_index", RelationGetRelid(rel));
+
+ /*
+ * If this is happening during re-indexing, then such a heap should
+ * have existed already. Here, we delete this heap and its btree
+ * index first.
+ */
+ heapid = get_relname_relid(lovHeapName, PG_BITMAPINDEX_NAMESPACE);
+ if (OidIsValid(heapid))
+ {
+ ObjectAddress object;
+ indid = get_relname_relid(lovIndexName, PG_BITMAPINDEX_NAMESPACE);
+
+ Assert(OidIsValid(indid));
+
+ /*
+ * Remove the dependency between the LOV heap relation,
+ * the LOV index, and the parent bitmap index before
+ * we drop the lov heap and index.
+ */
+ deleteDependencyRecordsFor(RelationRelationId, heapid, false);
+ deleteDependencyRecordsFor(RelationRelationId, indid, false);
+ CommandCounterIncrement();
+
+ object.classId = RelationRelationId;
+ object.objectId = indid;
+ object.objectSubId = 0;
+ performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
+
+ object.objectId = heapid;
+ performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
+ }
+
+ /*
+ * create a new empty heap to store all attribute values with their
+ * corresponding block number and offset in LOV.
+ */
+ tupDesc = _bitmap_create_lov_heapTupleDesc(rel);
+
+ *lovHeapId = heap_create_with_catalog(lovHeapName, /* relname */
+ PG_BITMAPINDEX_NAMESPACE, /* relnamespace */
+ rel->rd_rel->reltablespace, /* reltablespace */
+ InvalidOid, /* relid */
+ InvalidOid, /* reltypeid */
+ InvalidOid, /* reloftypeid */
+ rel->rd_rel->relowner, /* ownerid */
+ tupDesc, /* tupdesc */
+ NIL, /* cooked_constraints */
+ RELKIND_RELATION, /* relkind */
+ RELPERSISTENCE_PERMANENT, /* relpersistence */
+ rel->rd_rel->relisshared, /* shared_relation */
+ RelationIsMapped(rel), /* mapped_relation */
+ false, /* oidislocal */
+ 0, /* oidinhcount */
+ ONCOMMIT_NOOP, /* oncommit */
+ (Datum)0, /* reloptions */
+ false, /* use_user_acl */
+ true, /* allow_system_table_mods */
+ true); /* is_internal */
+
+ /*
+ * We must bump the command counter to make the newly-created relation
+ * tuple visible for opening.
+ */
+ CommandCounterIncrement();
+
+ objAddr.classId = RelationRelationId;
+ objAddr.objectId = *lovHeapId;
+ objAddr.objectSubId = 0 ;
+
+ referenced.classId = RelationRelationId;
+ referenced.objectId = RelationGetRelid(rel);
+ referenced.objectSubId = 0;
+
+ recordDependencyOn(&objAddr, &referenced, DEPENDENCY_INTERNAL);
+
+ heapRel = RelationIdGetRelation(*lovHeapId); /* open relation */
+
+ /*
+ * create a btree index on the newly-created heap.
+ * The key includes all attributes to be indexed in this bitmap index.
+ */
+ indattrs = tupDesc->natts - 2; /* all but blockNumber and offsetNumber */
+ indexInfo = makeNode(IndexInfo);
+ indexInfo->ii_NumIndexAttrs = indattrs;
+ indexInfo->ii_Expressions = NIL;
+ indexInfo->ii_ExpressionsState = NIL;
+ indexInfo->ii_Predicate = make_ands_implicit(NULL);
+ indexInfo->ii_PredicateState = NIL;
+ indexInfo->ii_Unique = true;
+
+ indexColNames = NIL;
+ classObjectId = (Oid *) palloc(indattrs * sizeof(Oid));
+ collationObjectId = (Oid *) palloc(indattrs * sizeof(Oid));
+ for (i = 0; i < indattrs; i++)
+ {
+ indexInfo->ii_KeyAttrNumbers[i] = i + 1;
+
+ /*
+ * Append just a pointer to the name character data in the descriptor
+ * to the column name list. This is safe because it will be
+ * immediately copied by index_create.
+ */
+ indexColNames = lappend(indexColNames,
+ NameStr(tupDesc->attrs[i]->attname));
+
+ /* Use the default class respective to the attribute type. */
+ classObjectId[i] = GetDefaultOpClass(tupDesc->attrs[i]->atttypid,
+ BTREE_AM_OID);
+
+ /* Copy the collation objects from the index. */
+ collationObjectId[i] = tupDesc->attrs[i]->attcollation;
+ }
+
+ *lovIndexId = index_create(heapRel, /* heapRelation */
+ /* *lovHeapId, */ /* heapRelationId */ /* removed */
+ lovIndexName, /* indexRelationName */
+ InvalidOid, /* indexRelationId */
+ InvalidOid, /* relFileNode */
+ indexInfo, /* indexInfo */
+ indexColNames, /* indexColNames */
+ BTREE_AM_OID, /* accessMethodObjectId */
+ rel->rd_rel->reltablespace, /* tableSpaceId */
+ collationObjectId, /* collationObjectId */
+ classObjectId, /* classObjectId */
+ NULL, /* coloptions */
+ 0, /* reloptions */
+ false, /* isprimary */
+ false, /* isconstant */
+ false, /* deferrable */
+ false, /* initdeferred */
+ true, /* allow_system_table_mods */
+ true, /* skip_build */
+ false, /* concurrent */
+ true); /* is_internal */
+
+ list_free(indexColNames);
+ indexColNames = NIL;
+
+ objAddr.classId = RelationRelationId;
+ objAddr.objectId = *lovIndexId;
+ objAddr.objectSubId = 0 ;
+
+ recordDependencyOn(&objAddr, &referenced, DEPENDENCY_INTERNAL);
+
+ RelationClose(heapRel);
+}
+
+/*
+ * _bitmap_create_lov_heapTupleDesc() -- create the new heap tuple descriptor.
+ *
+ * The result has every attribute of index 'rel' (renumbered, with the
+ * NOT NULL and has-default flags cleared) followed by two int4 columns,
+ * "blockNumber" and "offsetNumber".
+ *
+ * NOTE(review): _bitmap_findvalue() reads "offsetNumber" with
+ * DatumGetInt16 although it is declared INT4OID here -- confirm.
+ */
+TupleDesc
+_bitmap_create_lov_heapTupleDesc(Relation rel)
+{
+    TupleDesc   indexDesc = RelationGetDescr(rel);
+    int         nkeys = indexDesc->natts;
+    TupleDesc   lovDesc = CreateTemplateTupleDesc(nkeys + 2, false);
+    int         i;
+
+    /* Copy the fixed part of each indexed attribute, then adjust it. */
+    for (i = 0; i < nkeys; i++)
+    {
+        memcpy(lovDesc->attrs[i], indexDesc->attrs[i],
+               ATTRIBUTE_FIXED_PART_SIZE);
+        lovDesc->attrs[i]->attnum = i + 1;
+        lovDesc->attrs[i]->attnotnull = false;
+        lovDesc->attrs[i]->atthasdef = false;
+    }
+
+    /* Two trailing columns: the block number, then the offset number. */
+    TupleDescInitEntry(lovDesc, (AttrNumber) (nkeys + 1),
+                       "blockNumber", INT4OID, -1, 0);
+    TupleDescInitEntry(lovDesc, (AttrNumber) (nkeys + 2),
+                       "offsetNumber", INT4OID, -1, 0);
+
+    return lovDesc;
+}
+
+/*
+ * _bitmap_open_lov_heapandindex() -- open the heap relation and the btree
+ * index for LOV, as recorded in the meta page, both with 'lockMode'.
+ * The opened relations are returned through *lovHeapP and *lovIndexP.
+ */
+void
+_bitmap_open_lov_heapandindex(BMMetaPage metapage,
+                              Relation *lovHeapP, Relation *lovIndexP,
+                              LOCKMODE lockMode)
+{
+    *lovHeapP = heap_open(metapage->bm_lov_heapId, lockMode);
+    *lovIndexP = index_open(metapage->bm_lov_indexId, lockMode);
+}
+
+/*
+ * _bitmap_insert_lov() -- insert a new data into the given heap and index.
+ *
+ * 'datum'/'nulls' describe a full LOV heap row; the heap insert skips WAL
+ * unless 'use_wal', and the index insert is skipped if 'skip_index_insert'.
+ */
+void
+_bitmap_insert_lov(Relation lovHeap, Relation lovIndex, Datum *datum,
+                   bool *nulls, bool use_wal, bool skip_index_insert)
+{
+    TupleDesc   heapDesc = RelationGetDescr(lovHeap);
+    int         options = use_wal ? 0 : HEAP_INSERT_SKIP_WAL;
+    HeapTuple   lovTuple;
+
+    /* insert this tuple into the heap */
+    lovTuple = heap_form_tuple(heapDesc, datum, nulls);
+    heap_insert(lovHeap, lovTuple, GetCurrentCommandId(true), options, NULL);
+
+    if (!skip_index_insert)
+    {
+        /* The index key is the heap row minus its last two columns. */
+        int         nkeys = heapDesc->natts - 2;
+        Datum      *keyValues = palloc0(nkeys * sizeof(Datum));
+        bool       *keyNulls = palloc0(nkeys * sizeof(bool));
+        bool        result;
+
+        memcpy(keyValues, datum, nkeys * sizeof(Datum));
+        memcpy(keyNulls, nulls, nkeys * sizeof(bool));
+        result = index_insert(lovIndex, keyValues, keyNulls,
+                              &(lovTuple->t_self), lovHeap, true);
+        pfree(keyValues);
+        pfree(keyNulls);
+        Assert(result);
+    }
+
+    heap_freetuple(lovTuple);
+}
+
+
+/*
+ * _bitmap_close_lov_heapandindex() -- close the LOV heap, then its index.
+ */
+void
+_bitmap_close_lov_heapandindex(Relation lovHeap, Relation lovIndex,
+                               LOCKMODE lockMode)
+{
+    heap_close(lovHeap, lockMode);
+    index_close(lovIndex, lockMode);
+}
+
+/*
+ * _bitmap_findvalue() -- fetch the next LOV row from an index scan.
+ *
+ * The next tuple is pulled from 'scanDesc' in forward direction; the
+ * 'scanKey' argument is not consulted here (presumably the caller has
+ * already attached it to the scan -- confirm).  Returns false when the
+ * scan yields no tuple.  Otherwise returns true and stores in *vmiid
+ * the block number and offset number kept in the two LOV heap columns
+ * that follow the key columns; they identify the LOV item that points
+ * to the bitmap vector for this value.
+ *
+ * NOTE(review): the offset column is declared INT4OID by
+ * _bitmap_create_lov_heapTupleDesc() but is read with DatumGetInt16.
+ */
+bool
+_bitmap_findvalue(Relation lovHeap, Relation lovIndex,
+                  ScanKey scanKey, IndexScanDesc scanDesc,
+                  BMVMIID *vmiid)
+{
+    int         nkeys = RelationGetDescr(lovIndex)->natts;
+    TupleDesc   heapDesc;
+    HeapTuple   lovTuple;
+    Datum       value;
+    bool        isNull;
+
+    lovTuple = index_getnext(scanDesc, ForwardScanDirection);
+    if (lovTuple == NULL)
+        return false;
+
+    heapDesc = RelationGetDescr(lovHeap);
+
+    /* The block number sits right after the key columns ... */
+    value = heap_getattr(lovTuple, nkeys + 1, heapDesc, &isNull);
+    Assert(!isNull);
+    vmiid->block = DatumGetInt32(value);
+
+    /* ... and the offset number comes next. */
+    value = heap_getattr(lovTuple, nkeys + 2, heapDesc, &isNull);
+    Assert(!isNull);
+    vmiid->offset = DatumGetInt16(value);
+
+    return true;
+}
diff --git a/src/backend/access/bitmap/bitmapinsert.c b/src/backend/access/bitmap/bitmapinsert.c
new file mode 100644
index 0000000..2223367
--- /dev/null
+++ b/src/backend/access/bitmap/bitmapinsert.c
@@ -0,0 +1,2653 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmapinsert.c
+ * Tuple insertion in the on-disk bitmap index.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmapinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/genam.h"
+#include "access/tupdesc.h"
+#include "access/heapam.h"
+#include "access/bitmap.h"
+#include "parser/parse_oper.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "storage/bufmgr.h" /* for buffer manager functions */
+#include "utils/tqual.h" /* for SnapshotAny */
+#include "utils/rel.h" /* for RelationGetDescr */
+#include "utils/lsyscache.h" /* for get_opcode */
+#include "catalog/pg_collation.h"
+
+#ifdef HAVE_LONG_INT_64
+#define UINT64X_FORMAT "%lx"
+#elif defined(HAVE_LONG_LONG_INT_64)
+#define UINT64X_FORMAT "%llx"
+#endif
+
+/*
+ * The following structure, along with BMTIDBuffer, is used to buffer
+ * words for tids during index creation -- see bmbuild().
+ */
+
+/*
+ * BMTIDVMIBuffer represents those bitmap vectors whose VMI would be
+ * stored on the specified vmi_block. Each element of bufs stores the TIDs
+ * for one distinct vector (see above). The array index we are up to tells
+ * us the offset number of that vector's VMI on the vmi_block.
+ */
+
+typedef struct BMTIDVMIBuffer
+{
+ BlockNumber vmi_block; /* block on which these VMIs would be stored */
+ BMTIDBuffer *bufs[BM_MAX_VMI_PER_PAGE]; /* one TID buffer per VMI slot */
+} BMTIDVMIBuffer;
+
+static Buffer get_lastbitmappagebuf(Relation rel, BMVectorMetaItem vmi);
+static void create_loventry(Relation rel, Buffer metabuf, uint64 tidnum,
+ TupleDesc tupDesc, Datum *attdata, bool *nulls,
+ Relation lovHeap, Relation lovIndex,
+ BMVMIID *vmiid, bool use_wal,
+ bool skip_index_insert);
+static void build_inserttuple(Relation index, uint64 tidnum,
+ ItemPointerData ht_ctid,
+ Datum *attdata, bool *nulls, BMBuildState *state);
+
+static void inserttuple(Relation rel, Buffer metabuf,
+ uint64 tidnum, ItemPointerData ht_ctid,
+ TupleDesc tupDesc, Datum* attdata,
+ bool *nulls, Relation lovHeap,
+ Relation lovIndex, ScanKey scanKey,
+ IndexScanDesc scanDesc, bool use_wal);
+static void updatesetbit(Relation rel,
+ Buffer lovBuffer, OffsetNumber lovOffset,
+ uint64 tidnum, bool use_wal);
+static void updatesetbit_inword(BM_WORD word, uint64 updateBitLoc,
+ uint64 firstTid, BMTIDBuffer *buf);
+static void updatesetbit_inpage(Relation rel, uint64 tidnum,
+ Buffer lovBuffer, OffsetNumber lovOffset,
+ Buffer bitmapBuffer, uint64 firstTidNumber,
+ bool use_wal);
+static void insertsetbit(Relation rel, BlockNumber vmiBlock,
+ OffsetNumber vmiOffset, uint64 tidnum, bool use_wal);
+static uint64 getnumbits(BM_WORD *contentWords,
+ BM_WORD *headerWords, uint32 nwords);
+static void findbitmappage(Relation rel, BMVectorMetaItem vmi, uint64 tidnum,
+ Buffer *bitmapBufferP, uint64 *firstTidNumberP);
+static void shift_header_bits(BM_WORD *words, uint32 numOfBits,
+ uint32 maxNumOfWords, uint32 startLoc,
+ uint32 numOfShiftingBits);
+static void rshift_header_bits(BM_WORD *words, uint64 nwords,
+ uint32 bits);
+static void lshift_header_bits(BM_WORD *words, uint64 nwords,
+ uint32 bits);
+static void insert_newwords(BMTIDBuffer* words, uint32 insertPos,
+ BMTIDBuffer* new_words, BMTIDBuffer* words_left);
+static int16 mergewords(BMTIDBuffer* buf, bool lastWordFill);
+static void buf_make_space(Relation rel,
+ BMTidBuildBuf *tidLocsBuffer, bool use_wal);
+#ifdef DEBUG_BITMAP
+static void verify_bitmappages(Relation rel, BMLOVItem lovitem);
+#endif
+static int16 buf_add_tid_with_fill(Relation rel, BMTIDBuffer *buf,
+ BlockNumber vmi_block, OffsetNumber off,
+ uint64 tidnum, bool use_wal);
+static int16 buf_add_tid_with_fill_immediate(Relation rel, BMTIDBuffer *buf,
+ BlockNumber vmi_block,
+ OffsetNumber off,
+ uint64 tidnum, bool use_wal);
+static uint16 buf_extend(BMTIDBuffer *buf);
+static uint16 buf_ensure_head_space(Relation rel, BMTIDBuffer *buf,
+ BlockNumber lov_block, OffsetNumber off,
+ bool use_wal);
+static uint16 buf_free_mem(Relation rel, BMTIDBuffer *buf,
+ BlockNumber lov_block, OffsetNumber off,
+ bool use_wal, bool flush_hot_buffer);
+
+#define BUF_INIT_WORDS 8 /* as good a point as any */
+
+/* Debug helper functions */
+void _debug_view_1(BMTidBuildBuf *x, const char *msg) ;
+void _debug_view_2(BMTIDBuffer *x, const char *msg) ;
+
+
+/*
+ * get_lastbitmappagebuf() -- return the buffer for the last bitmap page
+ * of the vector described by 'vmi' (the page at bm_bitmap_tail).
+ *
+ * InvalidBuffer is returned when bm_bitmap_head is InvalidBlockNumber.
+ * Otherwise the page is fetched with BM_WRITE, so the returned buffer
+ * will hold an exclusive lock.
+ */
+Buffer
+get_lastbitmappagebuf(Relation rel, BMVectorMetaItem vmi)
+{
+    if (vmi->bm_bitmap_head == InvalidBlockNumber)
+        return InvalidBuffer;
+
+    return _bitmap_getbuf(rel, vmi->bm_bitmap_tail, BM_WRITE);
+}
+
+/*
+ * getnumbits() -- return the number of bits represented by the first
+ * 'nwords' bitmap words (a fill word stands for FILL_LENGTH whole words).
+ */
+uint64
+getnumbits(BM_WORD *contentWords, BM_WORD *headerWords, uint32 nwords)
+{
+    uint64      total = 0;
+    uint32      wordNo;
+
+    for (wordNo = 0; wordNo < nwords; wordNo++)
+    {
+        if (!IS_FILL_WORD(headerWords, wordNo))
+            total += BM_WORD_SIZE;
+        else
+            total += FILL_LENGTH(contentWords[wordNo]) * BM_WORD_SIZE;
+    }
+
+    return total;
+}
+
+/*
+ * updatesetbit() -- update a set bit in a bitmap.
+ *
+ * This function finds the bit in a given bitmap vector whose bit location is
+ * equal to tidnum, and changes this bit to 1.
+ *
+ * If this bit is already 1, then we are done. Otherwise, there are
+ * two possibilities:
+ * (1) This bit appears in a literal word. In this case, we simply change
+ * it to 1.
+ * (2) This bit appears in a fill word with bit 0. In this case, this word
+ * may generate two or three words after changing the corresponding bit
+ * to 1, depending on the position of this bit.
+ *
+ * Case (2) will make the corresponding bitmap page to grow. The words after
+ * this affected word in this bitmap page are shifted right to accommodate
+ * the newly generated words. If this bitmap page does not have enough space
+ * to hold all these words, the last few words will be shifted out of this
+ * page. In this case, the next bitmap page is checked to see if there are
+ * enough space for these extra words. If so, these extra words are inserted
+ * into the next page. Otherwise, we create a new bitmap page to hold
+ * these extra words.
+ */
+void
+updatesetbit(Relation rel, Buffer vmiBuffer, OffsetNumber vmiOffset,
+ uint64 tidnum, bool use_wal)
+{
+ Page vmiPage;
+ BMVectorMetaItem vmi;
+
+ uint64 tidLocation;
+ uint16 insertingPos;
+
+ uint64 firstTidNumber = 1;
+ Buffer bitmapBuffer = InvalidBuffer;
+
+ vmiPage = BufferGetPage(vmiBuffer);
+ vmi = (BMVectorMetaItem)
+ PageGetItem(vmiPage, PageGetItemId(vmiPage, vmiOffset));
+
+ /* Calculate the tid location in the last bitmap page. */
+ tidLocation = vmi->bm_last_tid_location;
+ if (BM_LAST_COMPWORD_IS_FILL(vmi))
+ tidLocation -= (FILL_LENGTH(vmi->bm_last_compword) *
+ BM_WORD_SIZE);
+ else
+ tidLocation -= BM_WORD_SIZE;
+
+ /*
+ * If tidnum is in either bm_last_compword or bm_last_word,
+ * and this does not generate any new words, we simply
+ * need to update the VMI.
+ */
+ if ((tidnum > vmi->bm_last_tid_location) ||
+ ((tidnum > tidLocation) &&
+ ((vmi->vmi_words_header == BM_VMI_WORDS_NO_FILL) ||
+ (FILL_LENGTH(vmi->bm_last_compword) == 1))))
+ {
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(vmiBuffer);
+
+ if (tidnum > vmi->bm_last_tid_location) /* bm_last_word */
+ {
+ insertingPos = (tidnum - 1) % BM_WORD_SIZE;
+ vmi->bm_last_word |= (((BM_WORD) 1) << insertingPos);
+ }
+ else
+ {
+ if (FILL_LENGTH(vmi->bm_last_compword) == 1)
+ vmi->bm_last_compword = 0;
+
+ insertingPos = (tidnum - 1) % BM_WORD_SIZE;
+ vmi->bm_last_compword |= (((BM_WORD) 1) << insertingPos);
+ vmi->vmi_words_header = BM_VMI_WORDS_NO_FILL;
+ }
+
+ if (use_wal)
+ _bitmap_log_bitmap_lastwords(rel, vmiBuffer, vmiOffset, vmi);
+
+ END_CRIT_SECTION();
+
+ return;
+ }
+
+ /*
+ * Here, if tidnum is still in bm_last_compword, it is a fill zero word
+ * with fill length greater than 1, and this update generates new words.
+ * NOTE(review): the test below uses BM_LASTWORD_IS_FILL, not the
+ * BM_LAST_COMPWORD_IS_FILL used further up -- confirm which is meant.
+ */
+ if ((tidnum > tidLocation) && BM_LASTWORD_IS_FILL(vmi))
+ {
+ /*
+ * We know that bm_last_compwords will be split into two
+ * or three words, depending on the splitting position.
+ */
+ BMTIDBuffer buf;
+
+ MemSet(&buf, 0, sizeof(buf));
+ buf_extend(&buf);
+
+ updatesetbit_inword(vmi->bm_last_compword,
+ tidnum - tidLocation - 1,
+ tidLocation + 1, &buf);
+ _bitmap_write_new_bitmapwords(rel, vmiBuffer, vmiOffset,
+ &buf, use_wal);
+
+ _bitmap_free_tidbuf(&buf);
+
+ return;
+ }
+
+ /*
+ * Now, tidnum is in the middle of the bitmap vector.
+ * We try to find the bitmap page that contains this bit,
+ * and update the bit.
+ */
+ /* find the page that contains this bit. */
+ findbitmappage(rel, vmi, tidnum, &bitmapBuffer, &firstTidNumber);
+
+ /* trade in the read lock for a write lock */
+ LockBuffer(bitmapBuffer, BUFFER_LOCK_UNLOCK);
+ LockBuffer(bitmapBuffer, BM_WRITE);
+
+ updatesetbit_inpage(rel, tidnum, vmiBuffer, vmiOffset,
+ bitmapBuffer, firstTidNumber, use_wal);
+
+ _bitmap_relbuf(bitmapBuffer);
+}
+
+/*
+ * updatesetbit_inword() -- set one bit inside a fill word, splitting it.
+ *
+ * 'word' is a fill word (assumed to be a zero fill) and 'updateBitLoc' is
+ * the 0-based position of the bit to set, relative to the start of the
+ * word. At most three words are appended to 'buf' at buf->curword: an
+ * optional leading zero fill, the literal word holding the new bit, and an
+ * optional trailing zero fill. 'firstTid' is the tid of the first bit in
+ * 'word'; last_tids[] receives the last tid covered by each new word.
+ *
+ * Header bits use the same word/bit addressing as mergewords(), so 'buf'
+ * does not need to be empty and BM_WORD may be wider than int.
+ */
+void
+updatesetbit_inword(BM_WORD word, uint64 updateBitLoc,
+					uint64 firstTid, BMTIDBuffer *buf)
+{
+	uint64		numBits = FILL_LENGTH(word) * BM_WORD_SIZE;
+	uint64		leadWords = updateBitLoc / BM_WORD_SIZE;
+	uint64		usedNumBits = 0;
+
+	Assert(updateBitLoc < BM_WORD_SIZE*FILL_LENGTH(word));
+
+	/* leading zero fill covering the whole words before the target bit */
+	if (leadWords > 0)
+	{
+		usedNumBits = leadWords * BM_WORD_SIZE;
+		firstTid += usedNumBits;
+		buf->cwords[buf->curword] = BM_MAKE_FILL_WORD(0, leadWords);
+		buf->last_tids[buf->curword] = firstTid - 1;
+		buf->hwords[buf->curword / BM_WORD_SIZE] |= ((BM_WORD) 1) <<
+			(BM_WORD_SIZE - buf->curword % BM_WORD_SIZE - 1);
+		buf->curword++;
+		buf_extend(buf);
+	}
+
+	/* the literal word that carries the newly set bit */
+	firstTid += BM_WORD_SIZE;
+	buf->cwords[buf->curword] = ((BM_WORD) 1) << (updateBitLoc - usedNumBits);
+	buf->last_tids[buf->curword] = firstTid - 1;
+	buf->curword++;
+	buf_extend(buf);
+	usedNumBits += BM_WORD_SIZE;
+
+	/* trailing zero fill for whatever remains of the original fill word */
+	if (numBits > usedNumBits)
+	{
+		uint64		tailBits = numBits - usedNumBits;
+
+		Assert(tailBits % BM_WORD_SIZE == 0);
+
+		firstTid += tailBits;
+		buf->cwords[buf->curword] =
+			BM_MAKE_FILL_WORD(0, tailBits / BM_WORD_SIZE);
+		buf->last_tids[buf->curword] = firstTid - 1;
+		buf->hwords[buf->curword / BM_WORD_SIZE] |= ((BM_WORD) 1) <<
+			(BM_WORD_SIZE - buf->curword % BM_WORD_SIZE - 1);
+		buf->curword++;
+		buf_extend(buf);
+	}
+}
+
+/*
+ * rshift_header_bits() -- shift the 'nwords' words in 'words' right by
+ * 'bits' bits, in place, carrying bits from each word into the next one.
+ *
+ * 'bits' must be smaller than BM_WORD_SIZE; zero is a no-op. The 'bits'
+ * right-most bits of the last word are dropped.
+ */
+void
+rshift_header_bits(BM_WORD* words, uint64 nwords,
+				   uint32 bits)
+{
+	BM_WORD		carry = 0;
+	uint64		word_no;
+
+	Assert(bits < BM_WORD_SIZE);
+	if (bits == 0)
+		return;			/* avoid shifting by the full word width below */
+
+	for (word_no = 0; word_no < nwords; word_no++)
+	{
+		BM_WORD		next_carry = words[word_no] << (BM_WORD_SIZE - bits);
+		words[word_no] = (words[word_no] >> bits) | carry;
+		carry = next_carry;
+	}
+}
+
+/*
+ * lshift_header_bits() -- shift the 'nwords' words in 'words' left by
+ * 'bits' bits, in place, carrying bits from each word into the previous one.
+ *
+ * 'bits' must be smaller than BM_WORD_SIZE; zero is a no-op. The 'bits'
+ * left-most bits of the first word are dropped.
+ */
+void
+lshift_header_bits(BM_WORD* words, uint64 nwords,
+				   uint32 bits)
+{
+	uint64		word_no;
+
+	Assert(bits < BM_WORD_SIZE);
+	if (bits == 0)
+		return;			/* avoid shifting by the full word width below */
+	for (word_no = 0; word_no < nwords; word_no++)
+	{
+		BM_WORD		carry = words[word_no] >> (BM_WORD_SIZE - bits);
+		words[word_no] = ((BM_WORD) words[word_no]) << bits;
+		if (word_no != 0)
+			words[word_no - 1] |= carry;
+	}
+}
+
+/*
+ * shift_header_bits() -- right-shift bits after 'startLoc' for
+ * 'numOfShiftingBits' bits.
+ *
+ * The bits live in 'words', an array of BM_WORD_SIZE-bit words in which bit
+ * 0 is the most significant bit of words[0]; 'numOfBits' of them are in use
+ * and the array holds 'maxNumOfWords' words. The shift is done in place,
+ * moving the bits at and after 'startLoc' (0-based) towards the end; bits
+ * pushed past the end of the array are discarded.
+ */
+void
+shift_header_bits(BM_WORD* words, uint32 numOfBits,
+				  uint32 maxNumOfWords, uint32 startLoc,
+				  uint32 numOfShiftingBits)
+{
+	uint32		startWordNo = startLoc/BM_WORD_SIZE;
+	uint32		endWordNo = (numOfBits-1)/BM_WORD_SIZE;
+	uint32		startBit = startLoc % BM_WORD_SIZE;
+	uint32		wordNo;
+	uint32		numOfFinalShiftingBits;
+	BM_WORD		keepMask;
+	BM_WORD		tmpWord;
+
+	Assert(startLoc <= numOfBits);
+	Assert(((numOfBits-1) / BM_WORD_SIZE) < maxNumOfWords);
+
+	for (wordNo = endWordNo; wordNo > startWordNo; wordNo--)
+	{
+		/*
+		 * obtain the last 'numOfShiftingBits' bits in the words[wordNo],
+		 * and store them in the high-end of a word.
+		 */
+		tmpWord = (((BM_WORD)words[wordNo]) <<
+				   (BM_WORD_SIZE - numOfShiftingBits));
+
+		/* right-shift the original word 'numOfShiftingBits' bits. */
+		words[wordNo] = (((BM_WORD)words[wordNo])>>numOfShiftingBits);
+
+		/* OR those shifted bits into the next word in the array. */
+		if (wordNo < maxNumOfWords-1)
+			words[wordNo + 1] |= tmpWord;
+	}
+
+	/*
+	 * Split the start word with a mask instead of shifting by
+	 * BM_WORD_SIZE - startBit, which is a full-width (undefined) shift when
+	 * startLoc is word-aligned.
+	 */
+	keepMask = (startBit == 0) ? 0 :
+		(BM_WORD) (((BM_WORD) ~((BM_WORD) 0)) << (BM_WORD_SIZE - startBit));
+	tmpWord = words[startWordNo] & (BM_WORD) ~keepMask;	/* bits after startLoc */
+	words[startWordNo] &= keepMask;						/* bits before startLoc */
+
+	numOfFinalShiftingBits = numOfShiftingBits;
+	if (BM_WORD_SIZE - startBit < numOfShiftingBits)
+		numOfFinalShiftingBits = BM_WORD_SIZE - startBit;
+
+	words[startWordNo] |= (tmpWord>>numOfFinalShiftingBits);
+
+	if (startWordNo < maxNumOfWords-1)
+	{
+		tmpWord =
+			((BM_WORD) (tmpWord << (BM_WORD_SIZE - numOfFinalShiftingBits))) >>
+			(numOfShiftingBits - numOfFinalShiftingBits);
+		words[startWordNo+1] |= tmpWord;
+	}
+}
+
+/*
+ * insert_newwords() -- insert a buffer of new words into a given buffer of
+ * words at a specified position.
+ *
+ * The new words will be inserted into the positions starting from
+ * 'insertPos'(>=0). The original words from 'insertPos' will be shifted
+ * to the right. If 'words' does not have enough space to hold all words,
+ * the last (words->curword + new_words->curword - words->num_cwords) words
+ * are stored in 'words_left', for which the caller must have allocated
+ * enough space.
+ *
+ * All three buffers are specified as BMTIDBuffer objects, in which the following
+ * fields are used:
+ * curword -- the number of content words in this buffer.
+ * num_cwords -- the maximum number of content words that are allowed.
+ * hwords -- the header words
+ * cwords -- the content words
+ *
+ * This function assumes that the number of new words is not greater than BM_WORD_SIZE.
+ */
+void
+insert_newwords(BMTIDBuffer* words, uint32 insertPos,
+ BMTIDBuffer* new_words, BMTIDBuffer* words_left)
+{
+ int32 wordNo;
+ uint16 bitLoc;
+
+ Assert(new_words->curword <= BM_WORD_SIZE);
+ Assert(insertPos <= words->num_cwords);
+
+ words_left->curword = 0;
+
+ /* if there are no words in the original buffer, we simply copy the new words. */
+ if (words->curword == 0)
+ {
+ memcpy(words->cwords, new_words->cwords,
+ new_words->curword * sizeof(BM_WORD));
+ memcpy(words->hwords, new_words->hwords,
+ BM_CALC_H_WORDS(new_words->curword) * sizeof(BM_WORD));
+ words->curword = new_words->curword;
+
+ return;
+ }
+
+ /*
+ * if insertPos is pointing to the position after the maximum position
+ * in this buffer, we simply copy the new words to words_left.
+ */
+ if (insertPos == words->num_cwords)
+ {
+ memcpy(words_left->cwords, new_words->cwords,
+ new_words->curword * sizeof(BM_WORD));
+ memcpy(words_left->hwords, new_words->hwords,
+ BM_CALC_H_WORDS(new_words->curword) * sizeof(BM_WORD));
+ words_left->curword = new_words->curword;
+
+ return;
+ }
+
+ Assert(words->curword > 0);
+
+ /* Calculate how many words are left over after this insert. */
+ if (words->curword + new_words->curword > words->num_cwords)
+ words_left->curword =
+ words->curword + new_words->curword - words->num_cwords;
+ MemSet(words_left->hwords, 0, BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+
+ /*
+ * Walk from the last word in the array back to 'insertPos'.
+ * If the word no + new_words->curword is not below words->num_cwords,
+ * we store these words in words_left.
+ */
+ for (wordNo = words->curword-1;
+ wordNo >= 0 && wordNo >= insertPos;
+ wordNo--)
+ {
+ if (wordNo + new_words->curword >= words->num_cwords)
+ {
+ words_left->cwords[wordNo+new_words->curword-words->num_cwords] =
+ words->cwords[wordNo];
+ if (IS_FILL_WORD(words->hwords, wordNo))
+ {
+ uint32 o = (int)wordNo/BM_WORD_SIZE;
+ uint32 n = wordNo + new_words->curword - words->num_cwords;
+
+ words_left->hwords[0] |= WORDNO_GET_HEADER_BIT(n); /* NOTE(review): always hwords[0]; presumably the overflow fits one header word -- confirm */
+ words->hwords[o] &= ~(WORDNO_GET_HEADER_BIT(wordNo)); /* the word has left 'words' */
+ }
+ }
+ else
+ words->cwords[wordNo + new_words->curword] = words->cwords[wordNo];
+ }
+
+ /* insert new words; those that do not fit go to words_left */
+ for (wordNo=0; wordNo<new_words->curword; wordNo++)
+ {
+ if (insertPos+wordNo>= words->num_cwords)
+ {
+ uint32 n = insertPos + wordNo - words->num_cwords;
+
+ words_left->cwords[n] = new_words->cwords[wordNo];
+ if (IS_FILL_WORD(new_words->hwords, wordNo))
+ words_left->hwords[0] |= WORDNO_GET_HEADER_BIT(n);
+ }
+ else
+ words->cwords[insertPos+wordNo] = new_words->cwords[wordNo];
+ }
+
+ /* right-shift the bits in the header words */
+ shift_header_bits(words->hwords, words->curword,
+ BM_NUM_OF_HEADER_WORDS, insertPos,
+ new_words->curword);
+
+ /* set the header bits of the new words that stayed in 'words' */
+ for (bitLoc = insertPos;
+ bitLoc < insertPos + new_words->curword && bitLoc < words->num_cwords;
+ bitLoc++)
+ {
+ if (IS_FILL_WORD(new_words->hwords, bitLoc-insertPos))
+ {
+ uint32 off = (uint32) (bitLoc / BM_WORD_SIZE);
+
+ words->hwords[off] |= WORDNO_GET_HEADER_BIT(bitLoc);
+ }
+ }
+
+ words->curword += (new_words->curword - words_left->curword); /* net growth of 'words' */
+}
+
+/*
+ * updatesetbit_inpage() -- update the given bit to 1 in a given
+ * bitmap page.
+ *
+ * 'bitmapBuffer' holds the bit for 'tidnum' and is expected to be
+ * write-locked by the caller; 'firstTidNumber' is the tid of the first bit
+ * stored in that page. 'vmiBuffer'/'vmiOffset' identify the VMI, whose
+ * bm_bitmap_tail is updated when a new last page is added. WAL records are
+ * written only if 'use_wal' is set.
+ *
+ * Splitting a fill word may overflow the page; the excess words go to the
+ * next bitmap page if it has room, otherwise to a newly created page.
+ */
+static void
+updatesetbit_inpage(Relation rel, uint64 tidnum,
+					Buffer vmiBuffer, OffsetNumber vmiOffset,
+					Buffer bitmapBuffer, uint64 firstTidNumber,
+					bool use_wal)
+{
+	Page		bitmapPage;
+	BMPageOpaque bitmapOpaque;
+	BMBitmapVectorPage bitmap;
+	Buffer		nextBuffer;
+	Page		nextPage;
+	BMPageOpaque nextOpaque;
+	BMBitmapVectorPage nextBitmap;
+
+	uint64		bitNo = 0;
+	uint32		wordNo;
+	uint32		free_words;
+	BM_WORD		word = 0;
+	bool		found = false;
+
+	BMTIDBuffer words;
+	BMTIDBuffer new_words;
+	BMTIDBuffer words_left;
+
+	bool		new_page;
+	bool		new_lastpage;
+	int			word_no;
+
+	bitmapPage = BufferGetPage(bitmapBuffer);
+	bitmapOpaque = (BMPageOpaque) PageGetSpecialPointer(bitmapPage);
+
+	bitmap = (BMBitmapVectorPage) PageGetContents(bitmapPage);
+
+	/* Find the word that contains the bit of tidnum. */
+	for (wordNo = 0; wordNo < bitmapOpaque->bm_hrl_words_used; wordNo++)
+	{
+		word = bitmap->cwords[wordNo];
+		if (IS_FILL_WORD(bitmap->hwords, wordNo))
+			bitNo += FILL_LENGTH(word) * BM_WORD_SIZE;
+		else
+			bitNo += BM_WORD_SIZE;
+
+		if (firstTidNumber + bitNo - 1 >= tidnum)
+		{
+			found = true;
+			break;		/* find the word */
+		}
+	}
+
+	if (!found)
+		elog(ERROR, "bitmap word uninitialized");
+
+	Assert(wordNo < bitmapOpaque->bm_hrl_words_used);
+
+	/*
+	 * If the word containing the updating bit is a literal word,
+	 * we simply update the word, and return.
+	 */
+	if (!IS_FILL_WORD(bitmap->hwords, wordNo))
+	{
+		uint16		insertingPos = (tidnum - 1) % BM_WORD_SIZE;
+
+		START_CRIT_SECTION();
+
+		MarkBufferDirty(bitmapBuffer);
+
+		bitmap->cwords[wordNo] |= (((BM_WORD)1)<<insertingPos);
+
+		if (use_wal)
+			_bitmap_log_updateword(rel, bitmapBuffer, wordNo);
+
+		END_CRIT_SECTION();
+
+		return;
+	}
+
+	/* A fill word of ones already has this bit set: simply return. */
+	if (GET_FILL_BIT(word) == 1)
+		return;
+
+	/* From here on, firstTidNumber is the first tid covered by 'word'. */
+	firstTidNumber = firstTidNumber + bitNo -
+		FILL_LENGTH(word) * BM_WORD_SIZE;
+
+	Assert(tidnum >= firstTidNumber);
+
+	MemSet(&new_words, 0, sizeof(new_words));
+	buf_extend(&new_words);
+	updatesetbit_inword(word, tidnum - firstTidNumber, firstTidNumber,
+						&new_words);
+
+	/* Make sure that there are at most 3 new words. */
+	Assert(new_words.curword <= 3);
+
+	if (new_words.curword == 1)
+	{
+		uint32		off = wordNo/BM_WORD_SIZE;
+
+		START_CRIT_SECTION();
+		MarkBufferDirty(bitmapBuffer);
+
+		bitmap->cwords[wordNo] = new_words.cwords[0];
+		bitmap->hwords[off] &= ~WORDNO_GET_HEADER_BIT(wordNo);
+
+		if (use_wal)
+			_bitmap_log_updateword(rel, bitmapBuffer, wordNo);
+
+		END_CRIT_SECTION();
+		_bitmap_free_tidbuf(&new_words);
+		return;
+	}
+
+	/*
+	 * Check if this page has enough space for all new words. If so,
+	 * replace this word with new words. Otherwise,
+	 * we first check if the next page has enough space for all new words.
+	 * If so, insert new words to the next page, otherwise,
+	 * create a new page.
+	 */
+	free_words = BM_NUM_OF_HRL_WORDS_PER_PAGE -
+		bitmapOpaque->bm_hrl_words_used;
+
+	new_page = false;
+	new_lastpage = false;
+	nextBuffer = InvalidBuffer;
+
+	if (free_words < new_words.curword - 1)
+	{
+		if (bitmapOpaque->bm_bitmap_next != InvalidBlockNumber)
+		{
+			nextBuffer = _bitmap_getbuf(rel, bitmapOpaque->bm_bitmap_next,
+										BM_WRITE);
+			nextPage = BufferGetPage(nextBuffer);
+			nextOpaque = (BMPageOpaque)PageGetSpecialPointer(nextPage);
+			free_words = BM_NUM_OF_HRL_WORDS_PER_PAGE -
+				nextOpaque->bm_hrl_words_used;
+		}
+		else
+		{
+			new_lastpage = true;
+		}
+	}
+
+	if (free_words < new_words.curword - 1)
+	{
+		if (BufferIsValid(nextBuffer))
+			_bitmap_relbuf(nextBuffer);
+
+		nextBuffer = _bitmap_getbuf(rel, P_NEW, BM_WRITE);
+		_bitmap_init_bitmappage(nextBuffer);
+		new_page = true;
+		free_words = BM_NUM_OF_HRL_WORDS_PER_PAGE;
+	}
+
+	/* Allocate the overflow buffer now: no allocation in a critical section. */
+	MemSet(&words_left, 0, sizeof(words_left));
+	buf_extend(&words_left);
+
+	START_CRIT_SECTION();
+
+	MarkBufferDirty(bitmapBuffer);
+	if (BufferIsValid(nextBuffer))
+		MarkBufferDirty(nextBuffer);
+
+	if (new_lastpage)
+	{
+		Page		vmiPage;
+		BMVectorMetaItem vmi;
+
+		MarkBufferDirty(vmiBuffer);
+
+		vmiPage = BufferGetPage(vmiBuffer);
+		vmi = (BMVectorMetaItem) PageGetItem(vmiPage,
+								PageGetItemId(vmiPage, vmiOffset));
+		vmi->bm_bitmap_tail = BufferGetBlockNumber(nextBuffer);
+	}
+
+	bitmap->cwords[wordNo] = new_words.cwords[0];
+	if (tidnum - firstTidNumber + 1 <= BM_WORD_SIZE)
+	{
+		uint32		off = wordNo/BM_WORD_SIZE;
+
+		bitmap->hwords[off] &= ~WORDNO_GET_HEADER_BIT(wordNo);
+	}
+
+	/* ignore the first word in new_words.cwords. */
+	new_words.hwords[0] = ((BM_WORD)new_words.hwords[0]) << 1;
+	for (word_no = 0; word_no < new_words.curword - 1; word_no++)
+		new_words.cwords[word_no] = new_words.cwords[word_no+1];
+	new_words.curword--;
+
+	/* Create the buffer for the original words */
+	MemSet(&words, 0, sizeof(words));
+	words.cwords = bitmap->cwords;
+	memcpy(words.hwords, bitmap->hwords,
+		   BM_CALC_H_WORDS(bitmapOpaque->bm_hrl_words_used) * sizeof(BM_WORD));
+	words.num_cwords = BM_NUM_OF_HRL_WORDS_PER_PAGE;
+	words.curword = bitmapOpaque->bm_hrl_words_used;
+
+	insert_newwords(&words, wordNo + 1, &new_words, &words_left);
+
+	/*
+	 * We have to copy header words back to the page, and set the correct
+	 * number of words in the page.
+	 */
+	bitmapOpaque->bm_hrl_words_used = words.curword;
+	memcpy(bitmap->hwords, words.hwords,
+		   BM_CALC_H_WORDS(bitmapOpaque->bm_hrl_words_used) * sizeof(BM_WORD));
+
+	if (new_page)
+	{
+		nextPage = BufferGetPage(nextBuffer);
+		nextOpaque = (BMPageOpaque)PageGetSpecialPointer(nextPage);
+		nextBitmap = (BMBitmapVectorPage)PageGetContents(nextPage);
+
+		nextOpaque->bm_last_tid_location = bitmapOpaque->bm_last_tid_location;
+		nextOpaque->bm_bitmap_next = bitmapOpaque->bm_bitmap_next;
+		bitmapOpaque->bm_bitmap_next = BufferGetBlockNumber(nextBuffer);
+	}
+
+	bitmapOpaque->bm_last_tid_location -=
+		getnumbits(words_left.cwords, words_left.hwords, words_left.curword);
+
+	if (words_left.curword > 0)
+	{
+		nextPage = BufferGetPage(nextBuffer);
+		nextOpaque = (BMPageOpaque)PageGetSpecialPointer(nextPage);
+		nextBitmap = (BMBitmapVectorPage)PageGetContents(nextPage);
+
+		/* Create the buffer for the words already on the next page */
+		MemSet(&words, 0, sizeof(words));
+		words.cwords = nextBitmap->cwords;
+		memcpy(words.hwords, nextBitmap->hwords,
+			   BM_CALC_H_WORDS(nextOpaque->bm_hrl_words_used) *
+			   sizeof(BM_WORD));
+		words.num_cwords = BM_NUM_OF_HRL_WORDS_PER_PAGE;
+		words.curword = nextOpaque->bm_hrl_words_used;
+
+		/* new_words keeps its arrays; it is reused as the overflow output. */
+
+		insert_newwords(&words, 0, &words_left, &new_words);
+
+		/*
+		 * We have to copy header words back to the page, and set the correct
+		 * number of words in the page.
+		 */
+		nextOpaque->bm_hrl_words_used = words.curword;
+		memcpy(nextBitmap->hwords, words.hwords,
+			   BM_CALC_H_WORDS(nextOpaque->bm_hrl_words_used) *
+			   sizeof(BM_WORD));
+
+		Assert(new_words.curword == 0);
+	}
+
+	if (use_wal)
+		_bitmap_log_updatewords(rel, vmiBuffer, vmiOffset,
+								bitmapBuffer, nextBuffer, new_lastpage);
+
+	END_CRIT_SECTION();
+
+	if (BufferIsValid(nextBuffer))
+		_bitmap_relbuf(nextBuffer);
+
+	_bitmap_free_tidbuf(&new_words);
+	_bitmap_free_tidbuf(&words_left);
+}
+
+/*
+ * findbitmappage() -- walk the bitmap page chain of a VMI and stop at the
+ * page whose bm_last_tid_location covers 'tidnum'.
+ *
+ * On return, *bitmapBufferP is that page, still read-locked, and
+ * *firstTidNumberP is the first tid location stored in it. Errors out if
+ * the chain ends first.
+ *
+ * The tid location must not be in bm_last_compword or bm_last_word of the VMI.
+ */
+void
+findbitmappage(Relation rel, BMVectorMetaItem vmi, uint64 tidnum,
+			   Buffer *bitmapBufferP, uint64 *firstTidNumberP)
+{
+	BlockNumber blkno;
+	BlockNumber following;
+
+	*firstTidNumberP = 1;
+
+	for (blkno = vmi->bm_bitmap_head; BlockNumberIsValid(blkno); blkno = following)
+	{
+		Page		page;
+		BMPageOpaque opaque;
+
+		*bitmapBufferP = _bitmap_getbuf(rel, blkno, BM_READ);
+		page = BufferGetPage(*bitmapBufferP);
+		opaque = (BMPageOpaque) PageGetSpecialPointer(page);
+
+		/* Hit: hand the still-locked buffer back to the caller. */
+		if (opaque->bm_last_tid_location >= tidnum)
+			return;
+
+		/* Miss: the next page starts right after this page's last tid. */
+		(*firstTidNumberP) = opaque->bm_last_tid_location + 1;
+		following = opaque->bm_bitmap_next;
+
+		_bitmap_relbuf(*bitmapBufferP);
+	}
+
+	/* Ran off the end of the chain; this should not happen. */
+	elog(ERROR, "cannot find the bitmap page containing tid=%08x:%04x",
+		 BM_INT_GET_BLOCKNO(tidnum), BM_INT_GET_OFFSET(tidnum));
+}
+
+#ifdef DEBUG_BITMAP
+/*
+ * verify_bitmappages() -- debugging aid: check that every bitmap page of
+ * 'vmi' records, in bm_last_tid_location, the running total of bits so far.
+ */
+static void
+verify_bitmappages(Relation rel, BMVectorMetaItem vmi)
+{
+	BlockNumber nextBlockNo = vmi->bm_bitmap_head;
+	uint64		tidnum = 0;		/* bits counted over all pages visited */
+
+	while (BlockNumberIsValid(nextBlockNo))
+	{
+		Page		bitmapPage;
+		BMPageOpaque bitmapOpaque;
+		Buffer		bitmapBuffer;
+		uint32		wordNo;
+		BMBitmapVectorPage bitmap;
+
+		bitmapBuffer = _bitmap_getbuf(rel, nextBlockNo, BM_READ);
+		bitmapPage = BufferGetPage(bitmapBuffer);
+		bitmapOpaque = (BMPageOpaque)
+			PageGetSpecialPointer(bitmapPage);
+		bitmap = (BMBitmapVectorPage) PageGetContents(bitmapPage);
+
+		for (wordNo = 0; wordNo < bitmapOpaque->bm_hrl_words_used; wordNo++)
+		{
+			BM_WORD		word = bitmap->cwords[wordNo];
+
+			if (IS_FILL_WORD(bitmap->hwords, wordNo))
+				tidnum += FILL_LENGTH(word) * BM_WORD_SIZE;
+			else
+				tidnum += BM_WORD_SIZE;
+		}
+
+		if (bitmapOpaque->bm_last_tid_location != tidnum)
+			elog(ERROR, "bm_last_tid_location=" UINT64_FORMAT ", tidnum=" UINT64_FORMAT,
+				 (uint64) bitmapOpaque->bm_last_tid_location, tidnum);
+
+		nextBlockNo = bitmapOpaque->bm_bitmap_next;
+
+		_bitmap_relbuf(bitmapBuffer);
+	}
+}
+#endif /* DEBUG_BITMAP */
+
+/*
+ * mergewords() -- merge last two bitmap words based on the HRL compression
+ * scheme. If these two words can not be merged, the last complete
+ * word will be appended into the word array in the buffer.
+ *
+ * 'lastWordFill' tells whether buf->last_word is a fill word. Returns the
+ * value reported by buf_extend() when a word is appended, otherwise 0.
+ */
+int16
+mergewords(BMTIDBuffer *buf, bool lastWordFill)
+{
+ int16 bytes_used = 0;
+
+ /* buf->last_tid rounded down to a multiple of BM_WORD_SIZE */
+ uint64 last_tid = buf->last_tid - (buf->last_tid - 1) % BM_WORD_SIZE - 1;
+
+#ifdef DEBUG_BMI
+ _debug_view_2(buf, "[mergewords] BEGIN");
+ elog(NOTICE, "lastWordFill = %d, last_tid = 0x" UINT64X_FORMAT,
+ lastWordFill, last_tid);
+#endif
+
+ /*
+ * If both words are LITERAL_ALL_ONE, then it is the very
+ * first invocation for this VMI, so we skip mergewords.
+ */
+ if (buf->last_compword == LITERAL_ALL_ONE
+ && buf->last_word == LITERAL_ALL_ONE)
+ return bytes_used;
+
+ /*
+ * If last_compword is LITERAL_ALL_ONE, it is not set yet.
+ * We move last_word to it.
+ */
+ if (buf->last_compword == LITERAL_ALL_ONE)
+ {
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[mergewords] CP1"
+ "\n\tbuf->last_compword == LITERAL_ALL_ONE");
+#endif
+ buf->last_compword = buf->last_word;
+ buf->is_last_compword_fill = lastWordFill;
+ if (lastWordFill)
+ last_tid = FILL_LENGTH(buf->last_word) * BM_WORD_SIZE; /* bits covered by the fill */
+ else
+ last_tid = BM_WORD_SIZE; /* one literal word */
+
+ buf->last_word = 0;
+ buf->last_tid = last_tid;
+
+#ifdef DEBUG_BMI
+ _debug_view_2(buf, "[mergewords] END 1");
+#endif
+ return bytes_used;
+ }
+ /*
+ * If both words are fill words, and have the same fill bit, we increment
+ * the fill length of the last complete word by the fill length stored in
+ * the last word.
+ */
+ if (buf->is_last_compword_fill && lastWordFill &&
+ (GET_FILL_BIT(buf->last_compword) ==
+ GET_FILL_BIT(buf->last_word)))
+ {
+ BM_WORD lengthMerged;
+
+ if (FILL_LENGTH(buf->last_compword) +
+ FILL_LENGTH(buf->last_word) <
+ MAX_FILL_LENGTH)
+ {
+ last_tid += FILL_LENGTH(buf->last_word)*BM_WORD_SIZE;
+ buf->last_compword += FILL_LENGTH(buf->last_word);
+ buf->last_word = LITERAL_ALL_ZERO;
+ buf->last_tid = last_tid;
+
+#ifdef DEBUG_BMI
+ _debug_view_2(buf, "[mergewords] END 2");
+#endif
+ return bytes_used;
+ }
+
+ lengthMerged =
+ MAX_FILL_LENGTH - FILL_LENGTH(buf->last_compword); /* room left in last_compword */
+ buf->last_word -= lengthMerged; /* the remainder stays in last_word */
+ last_tid += lengthMerged * BM_WORD_SIZE;
+ buf->last_compword += lengthMerged;
+ }
+
+ /*
+ * Here, these two words can not be merged together. We move the last
+ * complete word to the array, and make the last word the new last
+ * complete word.
+ */
+ /*
+ * When there is not enough space in the array of new words, we
+ * re-allocate a bigger space.
+ */
+ bytes_used += buf_extend(buf);
+
+ buf->cwords[buf->curword] = buf->last_compword;
+ buf->last_tids[buf->curword] = last_tid;
+
+ if (buf->is_last_compword_fill) /* flag the appended word as a fill word in the header */
+ buf->hwords[buf->curword/BM_WORD_SIZE] |=
+ ((BM_WORD)1) << (BM_WORD_SIZE -
+ buf->curword % BM_WORD_SIZE - 1);
+
+ buf->curword++;
+
+ buf->last_compword = buf->last_word;
+ buf->is_last_compword_fill = lastWordFill;
+ if (buf->is_last_compword_fill)
+ last_tid += FILL_LENGTH(buf->last_compword) * BM_WORD_SIZE;
+ else
+ last_tid += BM_WORD_SIZE;
+
+ buf->last_word = 0;
+ buf->last_tid = last_tid;
+
+#ifdef DEBUG_BMI
+ _debug_view_2(buf, "[mergewords] END 3");
+#endif
+ return bytes_used;
+}
+
+/*
+ * _bitmap_write_new_bitmapwords() -- write a given buffer of new bitmap words
+ * into the end of bitmap page(s).
+ *
+ * Words buf->start_wordno .. buf->curword-1 are appended; new pages are
+ * allocated and chained when the last bitmap page runs out of space.
+ *
+ * We consider a write to one bitmap page as one atomic-action WAL record. The
+ * WAL record for the write to the last bitmap page also includes updates on
+ * the VMI. Writes to the non-last bitmap page are not self-consistent. We
+ * need to do some fix-up during WAL logic replay.
+ */
+void
+_bitmap_write_new_bitmapwords(Relation rel, Buffer vmiBuffer,
+ OffsetNumber vmiOffset, BMTIDBuffer *buf,
+ bool use_wal)
+{
+ Page vmiPage;
+ BMVectorMetaItem vmi;
+
+ Buffer bitmapBuffer;
+ Page bitmapPage;
+ BMPageOpaque bitmapPageOpaque;
+
+ uint64 numFreeWords;
+ uint64 words_written = 0;
+ bool isFirst = false;
+
+ vmiPage = BufferGetPage(vmiBuffer);
+ vmi = (BMVectorMetaItem) PageGetItem(vmiPage,
+ PageGetItemId(vmiPage, vmiOffset));
+
+ bitmapBuffer = get_lastbitmappagebuf(rel, vmi);
+
+ if (BufferIsValid(bitmapBuffer))
+ {
+ bitmapPage = BufferGetPage(bitmapBuffer);
+ bitmapPageOpaque =
+ (BMPageOpaque)PageGetSpecialPointer(bitmapPage);
+
+ numFreeWords = BM_NUM_OF_HRL_WORDS_PER_PAGE -
+ bitmapPageOpaque->bm_hrl_words_used;
+ }
+ else /* no last bitmap page was returned: start with a fresh one */
+ {
+ bitmapBuffer = _bitmap_getbuf(rel, P_NEW, BM_WRITE);
+ _bitmap_init_bitmappage(bitmapBuffer);
+
+ numFreeWords = BM_NUM_OF_HRL_WORDS_PER_PAGE;
+ }
+
+ while (numFreeWords < buf->curword - buf->start_wordno) /* more words pending than fit in this page */
+ {
+ Buffer newBuffer;
+
+ bitmapPage = BufferGetPage(bitmapBuffer);
+ bitmapPageOpaque =
+ (BMPageOpaque)PageGetSpecialPointer(bitmapPage);
+
+ newBuffer = _bitmap_getbuf(rel, P_NEW, BM_WRITE);
+ _bitmap_init_bitmappage(newBuffer);
+
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(bitmapBuffer);
+
+ if (numFreeWords > 0)
+ {
+ words_written =
+ _bitmap_write_bitmapwords(bitmapBuffer, buf);
+ }
+
+ bitmapPageOpaque->bm_bitmap_next = BufferGetBlockNumber(newBuffer); /* chain the new page */
+
+ if (vmi->bm_bitmap_head == InvalidBlockNumber)
+ {
+ isFirst = true;
+ MarkBufferDirty(vmiBuffer);
+ vmi->bm_bitmap_head = BufferGetBlockNumber(bitmapBuffer);
+ vmi->bm_bitmap_tail = vmi->bm_bitmap_head;
+ }
+
+ if (use_wal)
+ _bitmap_log_bitmapwords(rel, bitmapBuffer, vmiBuffer, vmiOffset,
+ buf, words_written, buf->last_tid,
+ BufferGetBlockNumber(newBuffer),
+ false, isFirst);
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[_bitmap_write_new_bitmapwords] CP1 (+=) : "
+ "buf->start_wordno = %d , "
+ "words_written = " UINT64_FORMAT,
+ buf->start_wordno, words_written);
+#endif
+ buf->start_wordno += words_written;
+
+ END_CRIT_SECTION();
+
+ _bitmap_relbuf(bitmapBuffer);
+
+ bitmapBuffer = newBuffer; /* continue on the page just created */
+ numFreeWords = BM_NUM_OF_HRL_WORDS_PER_PAGE;
+ }
+
+ /*
+ * Write remaining bitmap words to the last bitmap page and the
+ * VMI page.
+ */
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(vmiBuffer);
+ MarkBufferDirty(bitmapBuffer);
+
+ if (buf->curword - buf->start_wordno > 0)
+ words_written = _bitmap_write_bitmapwords(bitmapBuffer, buf);
+ else
+ words_written = 0;
+
+ vmi->bm_last_compword = buf->last_compword; /* the VMI's trailing words are taken from 'buf' */
+ vmi->bm_last_word = buf->last_word;
+ vmi->vmi_words_header = (buf->is_last_compword_fill) ?
+ BM_LAST_COMPWORD_BIT : BM_VMI_WORDS_NO_FILL;
+ vmi->bm_last_setbit = buf->last_tid;
+ vmi->bm_last_tid_location = buf->last_tid - buf->last_tid % BM_WORD_SIZE; /* rounded down to a word boundary */
+ vmi->bm_bitmap_tail = BufferGetBlockNumber(bitmapBuffer);
+ if (vmi->bm_bitmap_head == InvalidBlockNumber)
+ {
+ isFirst = true;
+ vmi->bm_bitmap_head = vmi->bm_bitmap_tail;
+ }
+
+ if (use_wal)
+ {
+ _bitmap_log_bitmapwords(rel, bitmapBuffer, vmiBuffer,
+ vmiOffset, buf, words_written,
+ buf->last_tid, InvalidBlockNumber, true,
+ isFirst);
+ }
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[_bitmap_write_new_bitmapwords] CP2 (+=) : "
+ "buf->start_wordno = %d, words_written = " UINT64_FORMAT
+ "\n\tvmi->bm_last_setbit = " UINT64_FORMAT
+ "\n\tvmi->bm_last_tid_location = " UINT64_FORMAT,
+ buf->start_wordno, words_written,
+ vmi->bm_last_setbit,
+ vmi->bm_last_tid_location);
+#endif
+ buf->start_wordno += words_written;
+
+ Assert(buf->start_wordno == buf->curword);
+
+ END_CRIT_SECTION();
+
+ /* release bitmap buffer */
+ _bitmap_relbuf(bitmapBuffer);
+}
+
+
+/*
+ * _bitmap_write_bitmapwords() -- Write an array of bitmap words into a given
+ * bitmap page, and return how many words have been written in this call.
+ *
+ * Words are taken from 'buf' starting at buf->start_wordno; as many as fit
+ * in the page are appended. buf->start_wordno itself is not advanced here.
+ *
+ * We have the write lock on the given bitmap page.
+ */
+uint64
+_bitmap_write_bitmapwords(Buffer bitmapBuffer, BMTIDBuffer *buf)
+{
+ uint64 startWordNo;
+ Page bitmapPage;
+ BMPageOpaque bitmapPageOpaque;
+ BMBitmapVectorPage bitmap;
+ uint64 cwords;
+ uint64 words_written;
+ uint64 start_hword_no, end_hword_no;
+ uint64 final_start_hword_no, final_end_hword_no;
+ BM_WORD *hwords;
+ uint64 num_hwords;
+ uint32 start_hword_bit, end_hword_bit, final_start_hword_bit;
+
+ startWordNo = buf->start_wordno;
+
+ bitmapPage = BufferGetPage(bitmapBuffer);
+ bitmapPageOpaque = (BMPageOpaque) PageGetSpecialPointer(bitmapPage);
+
+ cwords = bitmapPageOpaque->bm_hrl_words_used; /* content words already on the page */
+
+ words_written = buf->curword - startWordNo;
+ if (words_written > BM_NUM_OF_HRL_WORDS_PER_PAGE - cwords)
+ words_written = BM_NUM_OF_HRL_WORDS_PER_PAGE - cwords; /* clamp to the free space */
+
+ Assert (words_written > 0);
+
+ /* Copy the content words */
+ bitmap = (BMBitmapVectorPage) PageGetContents(bitmapPage);
+ memcpy(bitmap->cwords + cwords,
+ buf->cwords + startWordNo,
+ words_written * sizeof(BM_WORD));
+
+ /*
+ * Shift the header words taken from 'buf' to match the bit positions
+ * in the header words in this page, and then copy them.
+ */
+ start_hword_no = startWordNo/BM_WORD_SIZE;
+ end_hword_no = (startWordNo + words_written - 1) / BM_WORD_SIZE;
+ num_hwords = end_hword_no - start_hword_no + 1;
+
+ hwords = (BM_WORD*)
+ palloc0((num_hwords + 1) * sizeof(BM_WORD)); /* NOTE(review): callers in this file invoke this inside a critical section -- confirm allocating here is acceptable */
+
+ memcpy(hwords, buf->hwords + start_hword_no,
+ num_hwords * sizeof(BM_WORD));
+
+ /* clean up the first and last header words */
+ start_hword_bit = startWordNo % BM_WORD_SIZE;
+ end_hword_bit = (startWordNo + words_written - 1) % BM_WORD_SIZE;
+
+ hwords[0] = ((BM_WORD)(hwords[0] << start_hword_bit)) >>
+ start_hword_bit; /* clear the bits before the first word written */
+ hwords[num_hwords - 1] =
+ (hwords[num_hwords - 1] >> (BM_WORD_SIZE - end_hword_bit - 1)) <<
+ (BM_WORD_SIZE - end_hword_bit - 1); /* clear the bits after the last word written */
+
+ final_start_hword_bit = cwords % BM_WORD_SIZE; /* header bit position of the first new word on the page */
+
+ if (final_start_hword_bit > start_hword_bit)
+ {
+ /* right-shift 'final_start_hword_bit - start_hword_bit' */
+ rshift_header_bits(hwords, num_hwords + 1,
+ final_start_hword_bit - start_hword_bit);
+ }
+ else if (final_start_hword_bit < start_hword_bit)
+ {
+ /* left-shift 'start_hword_bit - final_start_hword_bit' */
+ lshift_header_bits(hwords, num_hwords,
+ start_hword_bit - final_start_hword_bit);
+ }
+
+ /* copy the header bits */
+ final_start_hword_no = cwords / BM_WORD_SIZE;
+ final_end_hword_no = (cwords + words_written - 1) / BM_WORD_SIZE;
+
+ bitmap->hwords[final_start_hword_no] |= hwords[0]; /* first header word may already hold bits: merge */
+ memcpy(bitmap->hwords + (final_start_hword_no + 1),
+ hwords + 1,
+ (final_end_hword_no - final_start_hword_no) *
+ sizeof(BM_WORD));
+
+ bitmapPageOpaque->bm_hrl_words_used += words_written;
+ bitmapPageOpaque->bm_last_tid_location =
+ buf->last_tids[startWordNo + words_written-1]; /* last tid of the last word written */
+
+ pfree(hwords);
+
+ return words_written;
+}
+
+/*
+ * create_loventry() -- create a new entry in the list of values.
+ *
+ * Each LOV entry is associated with one distinct value for attributes to be
+ * indexed. The function creates a new entry in the list of values (embodied
+ * by the lovHeap and the lovIndex).
+ *
+ * The VMI ID (block number and offset number) of the new LOV entry is
+ * stored in *vmiid; nothing is returned.
+ *
+ * The caller should have an exclusive lock on metabuf.
+ */
+static void
+create_loventry(Relation rel, Buffer metabuf, uint64 tidnum,
+ TupleDesc tupDesc, Datum *attdata, bool *nulls,
+ Relation lovHeap, Relation lovIndex, BMVMIID *vmiid,
+ bool use_wal, bool skip_index_insert)
+{
+ const int numOfAttrs = tupDesc->natts;
+ Page page;
+ BMMetaPage metapage;
+
+ Buffer vmiBuffer;
+ Page vmiPage;
+
+ BMVectorMetaItem vmi;
+ const OffsetNumber itemSize = sizeof(BMVectorMetaItemData);
+ bool is_new_vmi_blkno = false;
+
+ Datum *lovDatum;
+ bool *lovNulls;
+
+ vmi = _bitmap_formitem(tidnum);
+
+ /* Get the last VMI page. Meta page should be locked. */
+ page = BufferGetPage(metabuf);
+ metapage = (BMMetaPage) PageGetContents(page);
+
+ /* Get the last VMI buffer and page */
+ vmiBuffer = _bitmap_getbuf(rel, metapage->bm_last_vmi_page, BM_WRITE);
+ vmiPage = BufferGetPage(vmiBuffer);
+
+ /*
+ * If there is not enough space in the last VMI page for a new item,
+ * create a new VMI page; the metapage is updated further below.
+ */
+ if (itemSize > PageGetFreeSpace(vmiPage))
+ {
+ Buffer newVmiBuffer;
+
+ /* create a new VMI page */
+ newVmiBuffer = _bitmap_getbuf(rel, P_NEW, BM_WRITE);
+ _bitmap_init_vmipage(newVmiBuffer);
+
+#if 0
+ START_CRIT_SECTION();
+ if (use_wal)
+ _bitmap_log_newpage(rel, XLOG_BITMAP_INSERT_NEWVMIPAGE,
+ newVmiBuffer);
+ END_CRIT_SECTION();
+#endif
+
+ _bitmap_relbuf(vmiBuffer);
+
+ vmiBuffer = newVmiBuffer;
+ vmiPage = BufferGetPage(vmiBuffer);
+
+ is_new_vmi_blkno = true;
+ }
+
+ vmiid->offset = OffsetNumberNext(PageGetMaxOffsetNumber(vmiPage));
+ vmiid->block = BufferGetBlockNumber(vmiBuffer);
+
+ /* Build the LOV row before the critical section: palloc may ERROR */
+ lovDatum = palloc0((numOfAttrs + 2) * sizeof(Datum));
+ lovNulls = palloc0((numOfAttrs + 2) * sizeof(bool));
+ memcpy(lovDatum, attdata, numOfAttrs * sizeof(Datum));
+ memcpy(lovNulls, nulls, numOfAttrs * sizeof(bool));
+ lovDatum[numOfAttrs] = Int32GetDatum(vmiid->block);
+ lovDatum[numOfAttrs + 1] = Int16GetDatum(vmiid->offset);
+ lovNulls[numOfAttrs] = false;
+ lovNulls[numOfAttrs + 1] = false;
+
+ /* Insert the row in the LOV heap and btree; also kept out of the section */
+ _bitmap_insert_lov(lovHeap, lovIndex, lovDatum, lovNulls, use_wal,
+ skip_index_insert);
+
+ START_CRIT_SECTION();
+
+ if (is_new_vmi_blkno)
+ {
+ MarkBufferDirty(metabuf);
+ metapage->bm_last_vmi_page = BufferGetBlockNumber(vmiBuffer);
+ }
+
+ MarkBufferDirty(vmiBuffer);
+
+ if (PageAddItem(vmiPage, (Item) vmi, itemSize, vmiid->offset,
+ false, false) == InvalidOffsetNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to add vector meta item to \"%s\"",
+ RelationGetRelationName(rel))));
+
+ /* Log the insertion */
+ if (use_wal)
+ _bitmap_log_vmi(rel, vmiBuffer, vmiid->offset, vmi, metabuf,
+ is_new_vmi_blkno);
+
+ END_CRIT_SECTION();
+
+ _bitmap_relbuf(vmiBuffer);
+
+ pfree(vmi);
+ pfree(lovDatum);
+ pfree(lovNulls);
+}
+
+/*
+ * When building an index we try to buffer calls to write tids to disk,
+ * as writing each tid immediately would result in lots of I/Os.
+ */
+static void
+buf_add_tid(Relation rel, uint64 tidnum, BMBuildState *state,
+ BlockNumber vmi_block, OffsetNumber off)
+{
+ BMTIDBuffer *buf;
+ BMTIDVMIBuffer *vmi_buf = NULL;
+ BMTidBuildBuf *tids = state->bm_tidLocsBuffer;
+
+#ifdef DEBUG_BMI
+ _debug_view_1(tids, "CP1");
+#endif
+ /* If we surpass maintenance_work_mem, free some space from the buffer */
+ if (tids->byte_size >= maintenance_work_mem * 1024L)
+ buf_make_space(rel, tids, state->use_wal);
+
+ /*
+ * tids is lazily initialized. If we do not have a current VMI block
+ * buffer, initialize one.
+ */
+ if (!BlockNumberIsValid(tids->max_vmi_block) ||
+ tids->max_vmi_block < vmi_block)
+ {
+ /*
+ * XXX: We're currently not including the size of this data structure
+ * in our byte_size count... should we?
+ */
+ vmi_buf = palloc(sizeof(BMTIDVMIBuffer));
+ vmi_buf->vmi_block = vmi_block;
+ MemSet(vmi_buf->bufs, 0, BM_MAX_VMI_PER_PAGE * sizeof(BMTIDBuffer *));
+ tids->max_vmi_block = vmi_block;
+
+ /*
+ * Add the new VMI buffer to the list head. It seems reasonable that
+ * future calls to this function will want this vmi_block rather than
+ * older vmi_blocks.
+ */
+ tids->vmi_blocks = lcons(vmi_buf, tids->vmi_blocks);
+ }
+ else
+ {
+ ListCell *cell;
+
+ foreach(cell, tids->vmi_blocks)
+ {
+ BMTIDVMIBuffer *tmp = lfirst(cell);
+ if (tmp->vmi_block == vmi_block)
+ {
+ vmi_buf = tmp;
+ break;
+ }
+ }
+ }
+
+ Assert(vmi_buf);
+ Assert(off - 1 < BM_MAX_VMI_PER_PAGE);
+
+ if (vmi_buf->bufs[off - 1])
+ {
+ buf = vmi_buf->bufs[off - 1];
+
+ buf_add_tid_with_fill(rel, buf, vmi_block, off,
+ tidnum, state->use_wal);
+ }
+ else
+ {
+ /* no pre-existing buffer found, create a new one */
+ Buffer vmibuf;
+ Page page;
+ BMVectorMetaItem vmi;
+ uint16 bytes_added;
+
+ buf = (BMTIDBuffer *) palloc0(sizeof(BMTIDBuffer));
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_add_tid] create new buf - CP1"
+ "\n\tlast_tid = 0x" UINT64X_FORMAT
+ "\n\tlast_compword = %u"
+ "\n\tlast_word = %d",
+ buf->last_tid,
+ buf->last_compword,
+ buf->last_word);
+#endif
+
+ vmibuf = _bitmap_getbuf(rel, vmi_block, BM_READ);
+ page = BufferGetPage(vmibuf);
+ vmi = (BMVectorMetaItem) PageGetItem(page, PageGetItemId(page, off));
+
+ buf->last_tid = vmi->bm_last_setbit;
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_add_tid] create new buf - CP1.1"
+ "\n\tlast_tid = 0x" UINT64X_FORMAT
+ "\n\tlast_compword = %u"
+ "\n\tlast_word = %d",
+ buf->last_tid,
+ buf->last_compword,
+ buf->last_word);
+#endif
+ buf->last_compword = vmi->bm_last_compword;
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_add_tid] create new buf - CP1.2"
+ "\n\tlast_tid = 0x" UINT64X_FORMAT
+ "\n\tlast_compword = %u"
+ "\n\tlast_word = %d",
+ buf->last_tid,
+ buf->last_compword,
+ buf->last_word);
+#endif
+ buf->last_word = vmi->bm_last_word;
+ buf->is_last_compword_fill = BM_LAST_COMPWORD_IS_FILL(vmi);
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_add_tid] create new buf - CP2"
+ "\n\tlast_tid = 0x" UINT64X_FORMAT
+ "\n\tlast_compword = %u"
+ "\n\tlast_word = %d",
+ buf->last_tid,
+ buf->last_compword,
+ buf->last_word);
+#endif
+
+ _bitmap_relbuf(vmibuf); /* we don't care about locking */
+
+ MemSet(buf->hwords, 0, BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+
+ /* initialisation of HOT buffer data */
+ buf->hot_buffer_count = 0;
+ buf->hot_buffer_last_tid = 0;
+ buf->hot_buffer_block=InvalidBlockNumber;
+ MemSet(buf->hot_buffer, 0, BM_SIZEOF_HOT_BUFFER * sizeof(BM_WORD));
+
+ bytes_added = buf_extend(buf);
+
+ buf->curword = 0;
+ buf->start_wordno = 0;
+
+ buf_add_tid_with_fill(rel, buf, vmi_block, off, tidnum,
+ state->use_wal);
+
+ vmi_buf->bufs[off - 1] = buf;
+ tids->byte_size += bytes_added;
+ }
+}
+
+/*
+ * hot_buffer_flush() -- compress the contents of the HOT buffer into the
+ * buffer words and write out data blocks when complete
+ */
+static int16
+hot_buffer_flush(Relation rel, BMTIDBuffer *buf, BlockNumber vmi_block,
+ OffsetNumber off, bool use_wal, bool merge_words)
+{
+ int i;
+ int16 bytes_used = 0;
+
+ for (i = 0; i < BM_SIZEOF_HOT_BUFFER; i++)
+ {
+ uint64 word_last_tid =
+ buf->hot_buffer_start_tid + (i + 1) * BM_WORD_SIZE - 1;
+
+ bytes_used -=
+ buf_ensure_head_space(rel, buf, vmi_block, off, use_wal);
+
+ if (word_last_tid <= buf->hot_buffer_last_tid) {
+ switch (buf->hot_buffer[i])
+ {
+ case LITERAL_ALL_ONE:
+ buf->last_word = BM_MAKE_FILL_WORD(1, 1);
+ buf->last_tid = word_last_tid;
+ bytes_used += mergewords(buf, true);
+ break;
+ case LITERAL_ALL_ZERO:
+ buf->last_word = BM_MAKE_FILL_WORD(0, 1);
+ buf->last_tid = word_last_tid;
+ bytes_used += mergewords(buf, true);
+ break;
+ default:
+ buf->last_word = buf->hot_buffer[i];
+ buf->last_tid = word_last_tid;
+ bytes_used += mergewords(buf, false);
+ }
+
+ /*
+ * If the HOT buffer ends exactly with the end of this word, reset
+ * last_word and end the loop.
+ */
+ if (word_last_tid == buf->hot_buffer_last_tid)
+ {
+ buf->last_word = 0;
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * The last HOT buffer word is incomplete. Copy it to last word,
+ * adapt last_tid counter and then we are done with this HOT
+ * buffer.
+ */
+ buf->last_word = buf->hot_buffer[i];
+ buf->last_tid = buf->hot_buffer_last_tid;
+ break;
+ }
+ }
+
+ /* reset the buffer */
+ MemSet(buf->hot_buffer, 0, BM_SIZEOF_HOT_BUFFER * sizeof(BM_WORD));
+ buf->hot_buffer_count = 0;
+ buf->hot_buffer_last_tid = 0;
+
+ return bytes_used;
+}
+
+/*
+ * buf_add_fill() -- Append the given number of zeros to the end of the
+ * buffered vector.
+ */
+static int16
+buf_add_fill(Relation rel, BMTIDBuffer *buf, BlockNumber vmi_block,
+ OffsetNumber off, uint64 zeros, bool use_wal)
+{
+ int16 bytes_used = 0;
+
+ if (zeros > 0)
+ {
+ uint64 zerosNeeded;
+ uint64 numOfTotalFillWords;
+
+ /*
+ * Calculate how many bits are needed to fill up the existing last
+ * bitmap word.
+ */
+ if (buf->last_tid == 0)
+ zerosNeeded = BM_WORD_SIZE;
+ else
+ zerosNeeded =
+ BM_WORD_SIZE - ((buf->last_tid - 1) % BM_WORD_SIZE) - 1;
+
+ buf->last_tid += Min(zeros, zerosNeeded);
+
+ if (zerosNeeded > 0 && zeros >= zerosNeeded)
+ {
+ /*
+ * The last bitmap word is complete now. We merge it with the last
+ * bitmap complete word.
+ */
+ bytes_used -=
+ buf_ensure_head_space(rel, buf, vmi_block, off, use_wal);
+
+ bytes_used += mergewords(buf, false);
+ zeros -= zerosNeeded;
+ }
+
+ /*
+ * If the remaining zeros are more than BM_WORD_SIZE, we construct the
+ * last bitmap word to be a fill word, and merge it with the last
+ * complete bitmap word.
+ */
+ numOfTotalFillWords = zeros / BM_WORD_SIZE;
+
+ while (numOfTotalFillWords > 0)
+ {
+ BM_WORD numOfFillWords;
+
+ if (numOfTotalFillWords >= MAX_FILL_LENGTH)
+ numOfFillWords = MAX_FILL_LENGTH;
+ else
+ numOfFillWords = numOfTotalFillWords;
+
+ buf->last_word = BM_MAKE_FILL_WORD(0, numOfFillWords);
+
+ bytes_used -=
+ buf_ensure_head_space(rel, buf, vmi_block, off, use_wal);
+ bytes_used += mergewords(buf, true);
+
+ numOfTotalFillWords -= numOfFillWords;
+ zeros -= numOfFillWords * BM_WORD_SIZE;
+ }
+
+ buf->last_tid += zeros;
+ }
+
+ Assert((zeros >= 0) && (zeros < BM_WORD_SIZE));
+
+ return bytes_used;
+}
+
+/*
+ * buf_add_tid_with_fill() -- Worker for buf_add_tid().
+ *
+ * Return how many bytes are used. Since we move words to disk when
+ * there is no space left for new header words, this returning number
+ * can be negative.
+ */
+static int16
+buf_add_tid_with_fill(Relation rel, BMTIDBuffer *buf, BlockNumber vmi_block,
+ OffsetNumber off, uint64 tidnum, bool use_wal)
+{
+ int16 bytes_used = 0;
+ BlockNumber blockno = BM_INT_GET_BLOCKNO(tidnum);
+ int hot_buffer_bit_offset;
+
+ /* Block changed? Flush the old HOT buffer; TID math is widened to uint64 */
+ if (blockno != buf->hot_buffer_block)
+ {
+ if (buf->hot_buffer_block != InvalidBlockNumber)
+ {
+ buf->hot_buffer_last_tid =
+ ((uint64) buf->hot_buffer_block + 1) * BM_MAX_HTUP_PER_PAGE;
+ hot_buffer_flush(rel, buf, vmi_block, off, use_wal, true);
+ }
+
+ /*
+ * If there is a gap between the current and the previous block, fill
+ * it with zeros. As we assume that the last word is only merged if it
+ * is complete, we must shorten the fill a bit in case it is not.
+ */
+ if (blockno > 0)
+ {
+ if (buf->hot_buffer_block == InvalidBlockNumber)
+ {
+ uint64 gap_length = (uint64) blockno * BM_MAX_HTUP_PER_PAGE;
+ uint64 last_word_length = gap_length % BM_WORD_SIZE;
+ uint64 fill_length = gap_length - last_word_length;
+
+ bytes_used += buf_add_fill(rel, buf, vmi_block, off,
+ fill_length, use_wal);
+
+ buf->last_word = 0;
+ buf->last_tid = gap_length;
+ }
+ else if (blockno > (buf->hot_buffer_block + 1))
+ {
+ uint64 gap_blocks = blockno - (buf->hot_buffer_block + 1);
+ uint64 gap_length = gap_blocks * BM_MAX_HTUP_PER_PAGE;
+ uint64 new_last_tid = buf->last_tid + gap_length;
+ uint64 last_word_length = new_last_tid % BM_WORD_SIZE;
+ uint64 fill_length = gap_length - last_word_length;
+
+ bytes_used += buf_add_fill(rel, buf, vmi_block, off,
+ fill_length, use_wal);
+
+ buf->last_word = 0;
+ buf->last_tid = new_last_tid;
+ }
+ }
+
+ /*
+ * If the new block is a consecutive one, the old and the new HOT
+ * buffer may overlap because of word alignment. Therefore we need to
+ * copy the last_word into the first word of the new HOT buffer.
+ */
+ if ((buf->hot_buffer_block != InvalidBlockNumber) &&
+ (blockno == (buf->hot_buffer_block + 1)) &&
+ (buf->last_tid % BM_WORD_SIZE != 0))
+ {
+ buf->hot_buffer[0] = buf->last_word;
+ }
+
+ buf->hot_buffer_block = blockno;
+ buf->hot_buffer_start_tid =
+ ((uint64) blockno * BM_MAX_HTUP_PER_PAGE) + 1 -
+ (((uint64) blockno * BM_MAX_HTUP_PER_PAGE) % BM_WORD_SIZE);
+ }
+
+ /* set the bit for tidnum, relative to the word-aligned HOT buffer start */
+ hot_buffer_bit_offset = tidnum - buf->hot_buffer_start_tid;
+ buf->hot_buffer[hot_buffer_bit_offset / BM_WORD_SIZE] |=
+ ((BM_WORD) 1) << (hot_buffer_bit_offset % BM_WORD_SIZE);
+ buf->hot_buffer_count++;
+ if (buf->hot_buffer_last_tid < tidnum)
+ buf->hot_buffer_last_tid = tidnum;
+
+ return bytes_used;
+}
+
+/*
+ * buf_add_tid_with_fill_immediate() -- Add a bit to the given TIDBuffer
+ *
+ * This version does not use the HOT buffer and is faster for one shot inserts.
+ *
+ * Return how many bytes are used. Since we move words to disk when
+ * there is no space left for new header words, this returning number
+ * can be negative.
+ */
+static int16
+buf_add_tid_with_fill_immediate(Relation rel, BMTIDBuffer *buf,
+ BlockNumber vmi_block, OffsetNumber off,
+ uint64 tidnum, bool use_wal)
+{
+ uint64 zeros;
+ uint16 inserting_pos;
+ int16 bytes_used = 0;
+
+ /*
+ * Compute how many zeros between this set bit and the last inserted
+ * set bit.
+ *
+ * If this is the first time to insert a set bit, then
+ * we assume that the last set bit is
+ * (tidnum/BM_WORD_SIZE)*BM_WORD_SIZE, because we have
+ * already inserted the first 'tidnum - tidnum % BM_WORD_SIZE'
+ * zeros during creating a new lov item.
+ */
+ if (buf->last_tid == 0)
+ zeros = tidnum % BM_WORD_SIZE;
+ else
+ zeros = tidnum - buf->last_tid - 1;
+
+ bytes_used += buf_add_fill(rel, buf, vmi_block, off, zeros, use_wal);
+
+ inserting_pos = (tidnum-1)%BM_WORD_SIZE;
+ buf->last_word |= (((BM_WORD)1) << inserting_pos);
+ buf->last_tid = tidnum;
+
+ if (tidnum % BM_WORD_SIZE == 0)
+ {
+ bool lastWordFill = false;
+
+ if (buf->last_word == LITERAL_ALL_ZERO)
+ {
+ buf->last_word = BM_MAKE_FILL_WORD(0, 1);
+ lastWordFill = true;
+ }
+
+ else if (buf->last_word == LITERAL_ALL_ONE)
+ {
+ buf->last_word = BM_MAKE_FILL_WORD(1, 1);
+ lastWordFill = true;
+ }
+
+ bytes_used -=
+ buf_ensure_head_space(rel, buf, vmi_block, off, use_wal);
+ bytes_used += mergewords(buf, lastWordFill);
+ }
+
+ return bytes_used;
+}
+
+/*
+ * buf_ensure_head_space() -- If there is no space in the header words,
+ * move words in the given buffer to disk and free the existing space,
+ * and then allocate new space for future new words.
+ *
+ * The number of bytes freed are returned.
+ */
+static uint16
+buf_ensure_head_space(Relation rel, BMTIDBuffer *buf,
+ BlockNumber vmi_block, OffsetNumber off, bool use_wal)
+{
+ uint16 bytes_freed = 0;
+
+ if (buf->curword >= (BM_NUM_OF_HEADER_WORDS * BM_WORD_SIZE))
+ {
+#ifdef DEBUG_BMI
+ _debug_view_2(buf, "[buf_ensure_head_space] freeing bytes");
+#endif
+ bytes_freed = buf_free_mem(rel, buf, vmi_block, off, use_wal, false);
+ bytes_freed -= buf_extend(buf);
+ }
+
+ return bytes_freed;
+}
+
+/*
+ * buf_extend() -- Enlarge the memory allocated to a buffer (doubling it).
+ * Return how many bytes are added to the buffer, 0 if it is large enough.
+ */
+static uint16
+buf_extend(BMTIDBuffer *buf)
+{
+ uint16 bytes;
+ uint16 size;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_extend] BEGIN");
+#endif
+ if (buf->num_cwords > 0 && buf->curword < buf->num_cwords - 1)
+ return 0; /* already large enough */
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_extend] not large enough");
+#endif
+
+ if (buf->num_cwords == 0)
+ {
+ size = BUF_INIT_WORDS;
+ buf->cwords = (BM_WORD *) palloc0(BUF_INIT_WORDS * sizeof(BM_WORD));
+ buf->last_tids = (uint64 *)palloc0(BUF_INIT_WORDS * sizeof(uint64));
+ bytes = BUF_INIT_WORDS * (sizeof(BM_WORD) + sizeof(uint64));
+ }
+ else
+ {
+ size = buf->num_cwords;
+ buf->cwords = repalloc(buf->cwords, 2 * size * sizeof(BM_WORD));
+ MemSet(buf->cwords + size, 0, size * sizeof(BM_WORD));
+ buf->last_tids = repalloc(buf->last_tids, 2 * size * sizeof(uint64));
+ MemSet(buf->last_tids + size, 0, size * sizeof(uint64));
+ bytes = size * (sizeof(BM_WORD) + sizeof(uint64)); /* growth only */
+ }
+ buf->num_cwords += size;
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_extend] END , bytes ==> %u", bytes);
+#endif
+ return bytes;
+}
+
+/*
+ * Spill some HRL compressed tids to disk
+ */
+
+static uint16
+buf_free_mem(Relation rel, BMTIDBuffer *buf, BlockNumber vmi_block,
+ OffsetNumber off, bool use_wal,
+ bool flush_hot_buffer)
+{
+ Buffer vmibuf;
+ uint16 bytes_freed=0;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_free_mem] BEGIN");
+#endif
+
+ /* flush hot_buffer to BMTIDBuffer */
+ if (flush_hot_buffer)
+ bytes_freed += hot_buffer_flush(rel, buf, vmi_block, off, use_wal,
+ true);
+
+ /* already done */
+ if (buf->num_cwords == 0)
+ return 0;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_free_mem] buf->num_cwords != 0");
+#endif
+
+ vmibuf = _bitmap_getbuf(rel, vmi_block, BM_WRITE);
+
+ _bitmap_write_new_bitmapwords(rel, vmibuf, off, buf, use_wal);
+
+ _bitmap_relbuf(vmibuf);
+
+#ifdef DEBUG_BMI
+ _debug_view_2(buf, "[buf_free_mem] END");
+#endif
+ bytes_freed += _bitmap_free_tidbuf(buf);
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[buf_free_mem] END , bytes_freed ==> %u", bytes_freed);
+#endif
+ return bytes_freed;
+}
+
+/*
+ * Spill buffers to disk; stop once under the maintenance_work_mem limit.
+ */
+static void
+buf_make_space(Relation rel, BMTidBuildBuf *locbuf, bool use_wal)
+{
+ ListCell *cell;
+
+ elog(DEBUG1, "making space");
+
+ /*
+ * Now, we could just pull the head of VMI_blocks but there'd be no
+ * guarantee that we'd free up enough space.
+ */
+ foreach(cell, locbuf->vmi_blocks)
+ {
+ int i;
+ BMTIDVMIBuffer *vmi_buf = (BMTIDVMIBuffer *) lfirst(cell);
+ BlockNumber vmi_block = vmi_buf->vmi_block;
+
+ for (i = 0; i < BM_MAX_VMI_PER_PAGE; i++)
+ {
+ BMTIDBuffer *buf = (BMTIDBuffer *) vmi_buf->bufs[i];
+ OffsetNumber off;
+
+ /* return if we've freed enough space */
+ if (locbuf->byte_size < (maintenance_work_mem * 1024L))
+ return;
+ if (!buf || buf->num_cwords == 0)
+ continue;
+
+ off = i + 1;
+#ifdef DEBUG_BMI
+ elog(NOTICE, "invoking buf_free_mem from buf_make_space");
+#endif
+ locbuf->byte_size -= buf_free_mem(rel, buf, vmi_block, off,
+ use_wal, false);
+ }
+ if (locbuf->byte_size < (maintenance_work_mem * 1024L))
+ return;
+ }
+}
+
+/*
+ * _bitmap_free_tidbuf() -- release the space.
+ */
+uint16
+_bitmap_free_tidbuf(BMTIDBuffer* buf)
+{
+ uint16 bytes_freed = 0;
+
+ if (buf->last_tids)
+ pfree(buf->last_tids);
+ if (buf->cwords)
+ pfree(buf->cwords);
+
+ bytes_freed = buf->num_cwords * sizeof(BM_WORD) +
+ buf->num_cwords * sizeof(uint64);
+
+ buf->num_cwords = 0;
+ buf->curword = 0;
+ buf->start_wordno = 0;
+ /* Paranoia */
+ MemSet(buf->hwords, 0, sizeof(BM_WORD) * BM_NUM_OF_HEADER_WORDS);
+
+ return bytes_freed;
+}
+
+/*
+ * insertsetbit() -- insert a given set bit into the bitmap vector
+ * identified by (vmiBlock, vmiOffset).
+ *
+ * The VMI page is pinned and write-locked here and released before return.
+ */
+static void
+insertsetbit(Relation rel, BlockNumber vmiBlock, OffsetNumber vmiOffset,
+ uint64 tidnum, bool use_wal)
+{
+ Buffer vmiBuffer = _bitmap_getbuf(rel, vmiBlock, BM_WRITE);
+ Page vmiPage = BufferGetPage(vmiBuffer);
+ BMVectorMetaItem vmi = (BMVectorMetaItem) PageGetItem(vmiPage,
+ PageGetItemId(vmiPage, vmiOffset));
+ BMTIDBuffer buf;
+
+ MemSet(&buf, 0, sizeof(buf));
+ buf_extend(&buf);
+ buf.last_compword = vmi->bm_last_compword;
+ buf.last_word = vmi->bm_last_word;
+ buf.is_last_compword_fill = BM_LAST_COMPWORD_IS_FILL(vmi);
+ buf.start_wordno = 0;
+ buf.last_tid = vmi->bm_last_setbit;
+ if (buf.cwords)
+ {
+ MemSet(buf.cwords, 0,
+ buf.num_cwords * sizeof(BM_WORD));
+ }
+ MemSet(buf.hwords, 0,
+ BM_CALC_H_WORDS(buf.num_cwords) * sizeof(BM_WORD));
+ if (buf.last_tids)
+ MemSet(buf.last_tids, 0,
+ buf.num_cwords * sizeof(uint64));
+ buf.curword = 0;
+
+ /*
+ * Usually, tidnum is greater than vmi->bm_last_setbit. However, if
+ * this is not the case, this should be called while doing 'vacuum full' or
+ * doing insertion after 'vacuum'. In this case, we try to update this bit
+ * in the corresponding bitmap vector.
+ */
+ if (tidnum <= vmi->bm_last_setbit)
+ {
+ /*
+ * Scan through the bitmap vector, and update the bit in tidnum.
+ */
+ updatesetbit(rel, vmiBuffer, vmiOffset, tidnum, use_wal);
+ }
+ else
+ {
+ /*
+ * To insert this new set bit, we also need to add all zeros between
+ * this set bit and last set bit. We construct all new words here.
+ *
+ * Use the immediate version, which bypasses the HOT buffer, because
+ * insertsetbit uses the buffer only for one shot; flushing the HOT
+ * buffer after buf_add_tid_with_fill would be superfluous overhead.
+ * It expects the VMI block number (not the Buffer) as third argument.
+ */
+ buf_add_tid_with_fill_immediate(rel, &buf, vmiBlock, vmiOffset,
+ tidnum, use_wal);
+
+ /*
+ * If there are only updates to the last bitmap complete word and last
+ * bitmap word, we simply need to update the VMI buffer.
+ */
+ if (buf.num_cwords == 0)
+ {
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(vmiBuffer);
+
+ vmi->bm_last_compword = buf.last_compword;
+ vmi->bm_last_word = buf.last_word;
+ vmi->vmi_words_header = BM_LAST_COMPWORD_IS_FILL(vmi);
+ vmi->bm_last_setbit = tidnum;
+ vmi->bm_last_tid_location = tidnum - tidnum % BM_WORD_SIZE;
+
+ if (use_wal)
+ _bitmap_log_bitmap_lastwords(rel, vmiBuffer, vmiOffset,
+ vmi);
+
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ /*
+ * Write bitmap words to bitmap pages. When there is not enough
+ * space for all these bitmap words, new bitmap pages are created.
+ */
+ _bitmap_write_new_bitmapwords(rel, vmiBuffer, vmiOffset, &buf,
+ use_wal);
+ }
+ }
+
+ _bitmap_relbuf(vmiBuffer);
+ _bitmap_free_tidbuf(&buf);
+}
+
+/*
+ * _bitmap_write_alltids() -- write all tids in the given buffer into disk.
+ */
+void
+_bitmap_write_alltids(Relation rel, BMTidBuildBuf *tids, bool use_wal)
+{
+ ListCell *cell;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[_bitmap_write_alltids] BEGIN");
+#endif
+ foreach(cell, tids->vmi_blocks)
+ {
+ int i;
+ BMTIDVMIBuffer *vmi_buf = (BMTIDVMIBuffer *) lfirst(cell);
+ BlockNumber vmi_block = vmi_buf->vmi_block;
+
+ for (i = 0; i < BM_MAX_VMI_PER_PAGE; i++)
+ {
+ BMTIDBuffer *buf = (BMTIDBuffer *) vmi_buf->bufs[i];
+ OffsetNumber off;
+
+ if (!buf || buf->num_cwords == 0)
+ continue;
+
+ off = i + 1;
+
+ buf_free_mem(rel, buf, vmi_block, off, use_wal, true);
+ pfree(buf);
+
+ vmi_buf->bufs[i] = NULL;
+ }
+ }
+ list_free_deep(tids->vmi_blocks);
+ tids->vmi_blocks = NIL;
+ tids->byte_size = 0;
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[_bitmap_write_alltids] END");
+#endif
+}
+
+/*
+ * build_inserttuple() -- insert a new tuple into the bitmap index
+ * during the bitmap index construction.
+ *
+ * Each new tuple has an assigned number -- tidnum, called a
+ * tid location, which represents the bit location for this tuple in
+ * a bitmap vector. To speed up the construction, this function does not
+ * write this tid location into its bitmap vector immediately. We maintain
+ * a buffer -- BMTidBuildBuf to keep an array of tid locations
+ * for each distinct attribute value.
+ *
+ * If this insertion causes the buffer to overflow, we write tid locations
+ * for enough distinct values to disk to accommodate this new tuple.
+ */
+static void
+build_inserttuple(Relation index, uint64 tidnum,
+ ItemPointerData ht_ctid,
+ Datum *attdata, bool *nulls, BMBuildState *state)
+{
+ /* Tuple descriptor alias */
+ TupleDesc tupDesc = state->bm_tupDesc;
+
+ /* metapage buffer */
+ Buffer metabuf = _bitmap_getbuf(index, BM_METAPAGE, BM_WRITE);
+
+ BMVMIID vmiid;
+
+ /* temporary attribute counter */
+ int attno;
+ /* all attributes are NULL */
+ bool allNulls = true;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[build_inserttuple] BEGIN"
+ "\n\t- tidnum = " UINT64_FORMAT
+ "\n\t- ht_ctid = %08x:%04x"
+ "\n\t- attdata = %p"
+ "\n\t- nulls = %p",
+ tidnum,
+ ItemPointerGetBlockNumber(&ht_ctid),
+ ItemPointerGetOffsetNumber(&ht_ctid),
+ attdata,
+ nulls);
+#endif
+
+ /* Check if all attributes have value of NULL. */
+ for (attno = 0; attno < state->bm_tupDesc->natts; ++attno)
+ {
+ if (!nulls[attno])
+ {
+ allNulls = false;
+ break;
+ }
+ }
+
+ if (allNulls)
+ {
+ /*
+ * Initialise VMI block and offset to point to the special NULL value
+ * of the Bitmap vector
+ */
+ _bitmap_get_null_vmiid(index, &vmiid);
+ }
+ else
+ {
+ /*
+ * Not NULL value for the current tuple (at least one of the attribute
+ * has a not NULL value)
+ */
+ bool found;
+
+ /* See if the attributes allow hashing */
+ if (state->vmi_hash)
+ {
+ /* look up the hash to see if we can find the VMI ID that way */
+ Datum *entry = (Datum *)
+ hash_search(state->vmi_hash, (void *) attdata, HASH_ENTER,
+ &found);
+
+ if (!found)
+ {
+ /* Copy the key values in case someone modifies them */
+ for (attno = 0; attno < tupDesc->natts; attno++)
+ {
+ Form_pg_attribute at = tupDesc->attrs[attno];
+
+ entry[attno] = datumCopy(entry[attno], at->attbyval,
+ at->attlen);
+ }
+
+ /*
+ * If the inserting tuple has a new value, then we create a new
+ * LOV entry.
+ */
+ create_loventry(index, metabuf, 0, tupDesc, attdata,
+ nulls, state->bm_lov_heap, state->bm_lov_index,
+ &vmiid, state->use_wal, true);
+
+ /*
+ * Updates the information in the LOV heap entry about the block
+ * and the offset
+ */
+ *((BMVMIID *) &(entry[tupDesc->natts])) = vmiid;
+ }
+ else
+ {
+ /* Get block and offset or the encoding of the LOV item */
+ vmiid = *((BMVMIID *) &(entry[tupDesc->natts]));
+ }
+ }
+ else
+ {
+ /*
+ * Search the btree to find the right bitmap vector to append
+ * this bit. Here, we reset the scan key and call index_rescan.
+ */
+ for (attno = 0; attno < tupDesc->natts; attno++)
+ {
+ ScanKey theScanKey = (ScanKey)
+ (((char *) state->bm_lov_scanKeys) +
+ attno * sizeof(ScanKeyData));
+ if (nulls[attno])
+ {
+ theScanKey->sk_flags = SK_ISNULL;
+ theScanKey->sk_argument = attdata[attno];
+ }
+ else
+ {
+ theScanKey->sk_flags = 0;
+ theScanKey->sk_argument = attdata[attno];
+ }
+ }
+
+ index_rescan(state->bm_lov_scanDesc, state->bm_lov_scanKeys,
+ tupDesc->natts, NULL, 0);
+
+ found = _bitmap_findvalue(state->bm_lov_heap, state->bm_lov_index,
+ state->bm_lov_scanKeys,
+ state->bm_lov_scanDesc,
+ &vmiid);
+
+ if (!found)
+ {
+ /*
+ * If the inserting tuple has a new value, then we create a new
+ * LOV entry.
+ */
+ create_loventry(index, metabuf, 0, tupDesc, attdata,
+ nulls, state->bm_lov_heap, state->bm_lov_index,
+ &vmiid, state->use_wal, false);
+ }
+ }
+ }
+
+ buf_add_tid(index, tidnum, state, vmiid.block, vmiid.offset);
+ _bitmap_wrtbuf(metabuf);
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[build_inserttuple] END");
+#endif
+}
+
+/*
+ * inserttuple() -- insert a new tuple into the bitmap index.
+ *
+ * This function finds the corresponding bitmap vector(s) associated with the
+ * given attribute value, and inserts a set bit into this bitmap vector(s).
+ * Each distinct attribute value is stored as a VMI, which is stored in a list
+ * of VMI pages.
+ *
+ * If there is no VMI associated with the given attribute value, a new VMI is
+ * created and appended into the last VMI page.
+ *
+ * To support the high-cardinality case for attributes to be indexed, we also
+ * maintain an auxiliary heap and a btree structure for all the distinct
+ * attribute values so that the search for the corresponding bitmap vector can
+ * be done faster. The heap contains all attributes to be indexed and 2 more
+ * attributes -- the block number and the offset number that locate the
+ * corresponding VMI. The b-tree index is on this new heap and the key
+ * contains all attributes to be indexed.
+ */
+static void
+inserttuple(Relation rel, Buffer metabuf, uint64 tidnum,
+ ItemPointerData ht_ctid, TupleDesc tupDesc, Datum *attdata,
+ bool *nulls, Relation lovHeap, Relation lovIndex, ScanKey scanKey,
+ IndexScanDesc scanDesc, bool use_wal)
+{
+ BMVMIID vmiid;
+ bool allNulls = true;
+ int attno;
+
+ /* Check if the values of given attributes are all NULL. */
+ for (attno = 0; attno < tupDesc->natts; attno++)
+ {
+ if (!nulls[attno])
+ {
+ allNulls = false;
+ break;
+ }
+ }
+
+ if (allNulls)
+ {
+ /*
+ * NULL values have a fixed position/encoding
+ */
+ _bitmap_get_null_vmiid(rel, &vmiid);
+ }
+ else
+ {
+ /*
+ * XXX: We lock the meta page to guard against a race condition
+ * whereby a concurrent writer is inserting the same key as us and they
+ * call create_loventry() between us calling _bitmap_findvalue() and
+ * create_loventry().
+ *
+ * The problem is, locking the metapage is pretty heavy handed
+ * because the read routines need a read lock on it. There are a
+ * few other things we could do instead: use a BM insert lock or
+ * wrap the code below in a PG_TRY and try and catch the unique
+ * constraint violation from the btree code.
+ */
+ LockBuffer(metabuf, BM_WRITE);
+
+ if (!_bitmap_findvalue(lovHeap, lovIndex, scanKey, scanDesc, &vmiid))
+ {
+ /*
+ * Search through the LOV heap and index to find the entry which
+ * has the same value as the inserting tuple. If such an entry is
+ * not found, then we create a new entry, and insert it into the
+ * lov heap and index.
+ */
+ create_loventry(rel, metabuf, tidnum, tupDesc, attdata, nulls,
+ lovHeap, lovIndex, &vmiid, use_wal, false);
+ }
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ /*
+ * Here, we have found the ID of the VMI. Append a set bit to the
+ * respective vector.
+ */
+ insertsetbit(rel, vmiid.block, vmiid.offset, tidnum, use_wal);
+}
+
+/*
+ * _bitmap_buildinsert() -- insert an index tuple during index creation.
+ */
+
+void
+_bitmap_buildinsert(Relation index, ItemPointerData ht_ctid, Datum *attdata,
+ bool *nulls, BMBuildState *state)
+{
+ uint64 tidOffset; /* Tuple ID offset */
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[_bitmap_buildinsert] BEGIN");
+#endif
+
+ Assert(ItemPointerGetOffsetNumber(&ht_ctid) <= BM_MAX_HTUP_PER_PAGE);
+
+ tidOffset = BM_IPTR_TO_INT(&ht_ctid);
+
+ /* insert a new bit into the corresponding bitmap */
+ build_inserttuple(index, tidOffset, ht_ctid, attdata, nulls, state);
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "[_bitmap_buildinsert] END");
+#endif
+}
+
+/*
+ * _bitmap_doinsert() -- insert an index tuple for a given tuple.
+ */
+void
+_bitmap_doinsert(Relation rel, ItemPointerData ht_ctid, Datum *attdata,
+ bool *nulls)
+{
+ uint64 tidOffset;
+ TupleDesc tupDesc;
+ Buffer metabuf;
+ BMMetaPage metapage;
+ Relation lovHeap, lovIndex;
+ ScanKey scanKeys;
+ IndexScanDesc scanDesc;
+ int attno;
+
+ tupDesc = RelationGetDescr(rel);
+ if (tupDesc->natts <= 0)
+ return ;
+
+ Assert(ItemPointerGetOffsetNumber(&ht_ctid) <= BM_MAX_HTUP_PER_PAGE);
+ tidOffset = BM_IPTR_TO_INT(&ht_ctid);
+
+ /* insert a new bit into the corresponding bitmap using the HRL scheme */
+ metabuf = _bitmap_getbuf(rel, BM_METAPAGE, BM_READ);
+ metapage = (BMMetaPage)PageGetContents(BufferGetPage(metabuf));
+ _bitmap_open_lov_heapandindex(metapage, &lovHeap, &lovIndex,
+ RowExclusiveLock);
+
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+ scanKeys = (ScanKey) palloc0(tupDesc->natts * sizeof(ScanKeyData));
+
+ for (attno = 0; attno < tupDesc->natts; attno++)
+ {
+ RegProcedure opfuncid;
+ ScanKey scanKey;
+ Oid eq_opr; /* equality operator */
+
+ /* Get the equality operator OID */
+ get_sort_group_operators(tupDesc->attrs[attno]->atttypid, false, true,
+ false, NULL, &eq_opr, NULL, NULL);
+ opfuncid = get_opcode(eq_opr);
+ scanKey = &scanKeys[attno];
+
+ ScanKeyEntryInitialize(scanKey, SK_ISNULL, attno + 1,
+ BTEqualStrategyNumber, InvalidOid,
+ DEFAULT_COLLATION_OID, opfuncid, 0);
+
+ if (nulls[attno])
+ {
+ scanKey->sk_flags = SK_ISNULL;
+ scanKey->sk_argument = attdata[attno];
+ }
+ else
+ {
+ scanKey->sk_flags = 0;
+ scanKey->sk_argument = attdata[attno];
+ }
+ }
+
+ scanDesc = index_beginscan(lovHeap, lovIndex, SnapshotAny,
+ tupDesc->natts, 0);
+
+ index_rescan(scanDesc, scanKeys, tupDesc->natts, NULL, 0);
+
+ /* insert this new tuple into the bitmap index. */
+ inserttuple(rel, metabuf, tidOffset, ht_ctid, tupDesc, attdata, nulls,
+ lovHeap, lovIndex, scanKeys, scanDesc, true);
+
+ index_endscan(scanDesc);
+ _bitmap_close_lov_heapandindex(lovHeap, lovIndex, RowExclusiveLock);
+
+ ReleaseBuffer(metabuf);
+ pfree(scanKeys);
+}
+
+/*
+ * Debug helper functions
+ */
+
+void _debug_view_1(BMTidBuildBuf *x, const char *msg)
+{
+ ListCell* c;
+ int i=0;
+ elog(NOTICE, "[_debug_view_BMTidBuildBuf] %s"
+ "\n\tbyte_size = %u"
+ "\n\tmax_vmi_block = %u"
+ "\n\t\tvmi_blocks:length = %d",
+ msg,
+ x->byte_size,
+ x->max_vmi_block,
+ list_length(x->vmi_blocks));
+ foreach(c, x->vmi_blocks) {
+ i++;
+ elog(NOTICE, "cell %d = %p", i, lfirst(c));
+ }
+}
+
+void _debug_view_2(BMTIDBuffer *x, const char *msg)
+{
+ int i;
+ elog(NOTICE, "[_debug_view_BMTIDBuffer] %s"
+ "\n\tlast_compword = %04x"
+ "\n\tlast_word = %04x"
+ "\n\tis_last_compword_fill = %d"
+ "\n\tstart_tid = %08x:%04x"
+ "\n\tlast_tid = %08x:%04x"
+ "\n\tcurword = %d"
+ "\n\tnum_cwords = %d"
+ "\n\tstart_wordno = %d"
+ "\n\thwords = [ %04x %04x %04x %04x ... ]"
+ "\n\tcwords = [ %04x %04x %04x %04x ... ]"
+ "\n\thot_buffer_block = %08lx"
+ "\n\thot_buffer_count = %d"
+ "\n\thot_buffer_start_tid = %08x:%04x"
+ "\n\thot_buffer_last_tid = %08x:%04x",
+ msg,
+ x->last_compword,
+ x->last_word,
+ x->is_last_compword_fill,
+ BM_INT_GET_BLOCKNO(x->start_tid), BM_INT_GET_OFFSET(x->start_tid),
+ BM_INT_GET_BLOCKNO(x->last_tid), BM_INT_GET_OFFSET(x->last_tid),
+ x->curword,
+ x->num_cwords,
+ x->start_wordno,
+ x->hwords[0], x->hwords[1], x->hwords[2], x->hwords[3],
+ x->cwords[0], x->cwords[1], x->cwords[2], x->cwords[3],
+ (unsigned long)x->hot_buffer_block,
+ x->hot_buffer_count,
+ BM_INT_GET_BLOCKNO(x->hot_buffer_start_tid),
+ BM_INT_GET_OFFSET(x->hot_buffer_start_tid),
+ BM_INT_GET_BLOCKNO(x->hot_buffer_last_tid),
+ BM_INT_GET_OFFSET(x->hot_buffer_last_tid));
+ Assert(BUF_INIT_WORDS==8);
+ for (i = 0; i < x->num_cwords; i += 8)
+ {
+ elog(NOTICE, "last_tids[%03x-%03x] = %08x:%04x %08x:%04x %08x:%04x "
+ "%08x:%04x %08x:%04x %08x:%04x %08x:%04x %08x:%04x",
+ i, i+7,
+ BM_INT_GET_BLOCKNO((x->last_tids)[i]), BM_INT_GET_OFFSET((x->last_tids)[i]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+1]), BM_INT_GET_OFFSET((x->last_tids)[i+1]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+2]), BM_INT_GET_OFFSET((x->last_tids)[i+2]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+3]), BM_INT_GET_OFFSET((x->last_tids)[i+3]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+4]), BM_INT_GET_OFFSET((x->last_tids)[i+4]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+5]), BM_INT_GET_OFFSET((x->last_tids)[i+5]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+6]), BM_INT_GET_OFFSET((x->last_tids)[i+6]),
+ BM_INT_GET_BLOCKNO((x->last_tids)[i+7]), BM_INT_GET_OFFSET((x->last_tids)[i+7]));
+ }
+}
diff --git a/src/backend/access/bitmap/bitmappages.c b/src/backend/access/bitmap/bitmappages.c
new file mode 100644
index 0000000..c1aad57
--- /dev/null
+++ b/src/backend/access/bitmap/bitmappages.c
@@ -0,0 +1,562 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmappages.c
+ * Bitmap index page management code for the bitmap index.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmappages.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/genam.h"
+#include "access/tupdesc.h"
+#include "access/bitmap.h"
+#include "parser/parse_oper.h"
+#include "storage/lmgr.h"
+#include "utils/memutils.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+#include "storage/bufmgr.h" /* for buffer manager functions */
+#include "utils/tqual.h" /* for SnapshotAny */
+#include "catalog/index.h"
+#include "catalog/pg_collation.h"
+
+/*
+ * Helper state for hashing and matching build data. At this stage, the
+ * hash API doesn't know about complex keys like those used during index
+ * creation (the key is an array of key attributes). c.f. execGrouping.c.
+ */
+typedef struct BMBuildHashData
+{
+ int natts; /* number of index attributes in a key */
+ FmgrInfo *hash_funcs; /* per-attribute hash functions */
+ FmgrInfo *eq_funcs; /* per-attribute equality functions */
+ MemoryContext tmpcxt; /* reset before each key comparison */
+ MemoryContext hash_cxt; /* context given to the build hash table */
+} BMBuildHashData;
+
+static BMBuildHashData *cur_bmbuild = NULL;
+
+static void _bitmap_build_lovindex(BMBuildState *bmstate,
+ IndexInfo *indexInfo);
+static uint32 build_hash_key(const void *key, Size keysize);
+static int build_match_key(const void *key1, const void *key2, Size keysize);
+
+/*
+ * _bitmap_getbuf() -- read block 'blkno' of 'rel' into a buffer and lock
+ * it in mode 'access', unless 'access' is BM_NOLOCK.
+ */
+Buffer
+_bitmap_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ buf = ReadBuffer(rel, blkno);
+ if (access != BM_NOLOCK)
+ LockBuffer(buf, access);
+
+ return buf;
+}
+
+/*
+ * _bitmap_wrtbuf() -- mark a buffer dirty and let go of it.
+ *
+ * The lock and the pin held on the buffer are both released.
+ */
+void
+_bitmap_wrtbuf(Buffer buf)
+{
+ MarkBufferDirty(buf);
+ UnlockReleaseBuffer(buf);
+}
+
+/*
+ * _bitmap_wrtnorelbuf() -- mark a buffer dirty without releasing it; the
+ * caller keeps whatever lock and pin it already holds.
+ */
+void
+_bitmap_wrtnorelbuf(Buffer buf)
+{
+ MarkBufferDirty(buf);
+}
+
+/*
+ * _bitmap_relbuf() -- unlock and release the buffer without dirtying it.
+ */
+void
+_bitmap_relbuf(Buffer buf)
+{
+ UnlockReleaseBuffer(buf);
+}
+
+/*
+ * _bitmap_init_vmipage -- initialize a VMI page if (and only if) it is new.
+ */
+void
+_bitmap_init_vmipage(Buffer buf)
+{
+ Page page;
+
+ page = (Page) BufferGetPage(buf);
+
+ if (PageIsNew(page))
+ PageInit(page, BufferGetPageSize(buf), 0); /* no special space */
+}
+
+/*
+ * _bitmap_init_bitmappage() -- initialize a new page to store the bitmap.
+ */
+void
+_bitmap_init_bitmappage(Buffer buf)
+{
+
+ Page page; /* temporary variable for identifying the buffer page */
+ BMPageOpaque opaque; /* opaque data in the page's special space */
+
+ /* Get the buffer's page */
+ page = (Page) BufferGetPage(buf);
+
+ /* If the buffer page is new, we initialise the special space of the page
+ * with the "opaque" structure of the index page. The second argument of
+ * PageInit is the size of the page, the third the size of the special area */
+ if (PageIsNew(page))
+ PageInit(page, BufferGetPageSize(buf), sizeof(BMPageOpaqueData));
+
+ /* Reset all the values (even if the page is not new) */
+ opaque = (BMPageOpaque) PageGetSpecialPointer(page);
+ opaque->bm_hrl_words_used = 0;
+ opaque->bm_bitmap_next = InvalidBlockNumber;
+ opaque->bm_last_tid_location = 0;
+ opaque->bm_page_id = BM_PAGE_ID;
+
+}
+
+/*
+ * _bitmap_init_buildstate() -- initialize the build state before building
+ * a bitmap index (LOV lookups use a hash table, or a btree as fallback).
+ */
+void
+_bitmap_init_buildstate(Relation index, BMBuildState *bmstate,
+						IndexInfo *indexInfo)
+{
+	/* BitMap Index Meta Page (first page of the index) */
+	BMMetaPage	mp;
+
+	/* Buffer and page management */
+	Page		page;			/* temporary page variable */
+	Buffer		metabuf;		/* META information buffer */
+
+	int			attno;
+
+	/*
+	 * Initialise the BMBuildState structure which will hold information
+	 * about the state for the index build process
+	 */
+	bmstate->bm_tupDesc = RelationGetDescr(index); /* index tuples description */
+	bmstate->ituples = 0;
+	/* allocate the index build buffer and ... */
+	bmstate->bm_tidLocsBuffer = (BMTidBuildBuf *)
+		palloc(sizeof(BMTidBuildBuf));
+	bmstate->bm_tidLocsBuffer->byte_size = 0; /* ... initialises it */
+	bmstate->bm_tidLocsBuffer->vmi_blocks = NIL;
+	bmstate->bm_tidLocsBuffer->max_vmi_block = InvalidBlockNumber;
+
+	/* Get the meta page */
+	metabuf = _bitmap_getbuf(index, BM_METAPAGE, BM_READ);
+	page = BufferGetPage(metabuf);
+	mp = (BMMetaPage) PageGetContents(page);
+
+	/* Open the heap and the index in row exclusive mode */
+	_bitmap_open_lov_heapandindex(mp, &(bmstate->bm_lov_heap),
+								  &(bmstate->bm_lov_index),
+								  RowExclusiveLock);
+
+	/* release the buffer */
+	_bitmap_relbuf(metabuf);
+
+	/*
+	 * Initialise the static variable cur_bmbuild with the helper functions for hashing
+	 * and matching build data. One per index attribute.
+	 */
+	cur_bmbuild = (BMBuildHashData *) palloc(sizeof(BMBuildHashData));
+	cur_bmbuild->hash_funcs = (FmgrInfo *)
+		palloc(sizeof(FmgrInfo) * bmstate->bm_tupDesc->natts);
+	cur_bmbuild->eq_funcs = (FmgrInfo *)
+		palloc(sizeof(FmgrInfo) * bmstate->bm_tupDesc->natts);
+
+	/* Iterate through the index attributes and initialise the helper functions */
+	for (attno = 0; attno < bmstate->bm_tupDesc->natts; ++attno)
+	{
+		Oid			typid = bmstate->bm_tupDesc->attrs[attno]->atttypid;
+		Oid			eq_opr;		/* equality operator */
+		Oid			eq_function;	/* equality operator function */
+		Oid			left_hash_function;		/* left hash function */
+		Oid			right_hash_function;	/* right hash function */
+
+		/* Get the equality operator OID */
+		get_sort_group_operators(typid, false, true, false,
+								 NULL, &eq_opr, NULL, NULL);
+
+		/* Get the eq and hash operator functions */
+		eq_function = get_opcode(eq_opr);
+		if (!get_op_hash_functions(eq_opr, &left_hash_function,
+								   &right_hash_function))
+		{
+			pfree(cur_bmbuild);
+			cur_bmbuild = NULL;
+			break;
+		}
+
+		fmgr_info(eq_function, &cur_bmbuild->eq_funcs[attno]);
+		fmgr_info(right_hash_function, &cur_bmbuild->hash_funcs[attno]);
+	}
+
+	/* We found the hash functions for every attribute of the index */
+	if (cur_bmbuild)
+	{
+		/* Hash management */
+		HASHCTL		hash_ctl;
+		int			hash_flags;
+
+		/* Allocate the temporary memory context */
+		cur_bmbuild->natts = bmstate->bm_tupDesc->natts;
+		cur_bmbuild->tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
+													"Bitmap build temp space",
+													ALLOCSET_DEFAULT_MINSIZE,
+													ALLOCSET_DEFAULT_INITSIZE,
+													ALLOCSET_DEFAULT_MAXSIZE);
+
+		/* Setup the hash table and map it into the build state variable */
+		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(Datum) * cur_bmbuild->natts;
+		hash_ctl.entrysize = hash_ctl.keysize + sizeof(BMVMIID) + 200;
+		hash_ctl.hash = build_hash_key;
+		hash_ctl.match = build_match_key;
+		hash_ctl.hcxt = AllocSetContextCreate(CurrentMemoryContext,
+											  "Bitmap build hash table",
+											  ALLOCSET_DEFAULT_MINSIZE,
+											  ALLOCSET_DEFAULT_INITSIZE,
+											  ALLOCSET_DEFAULT_MAXSIZE);
+		cur_bmbuild->hash_cxt = hash_ctl.hcxt;
+
+		hash_flags = HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT;
+
+		/* Create the hash table */
+		bmstate->vmi_hash = hash_create("Bitmap index build lov item hash",
+										100, &hash_ctl, hash_flags);
+	}
+	else
+	{
+		/*
+		 * Contingency plan: no hash functions can be used and we have to
+		 * search through the btree
+		 */
+		bmstate->vmi_hash = NULL;
+
+		/* so build the LOV index now, so it can be filled with every tuple */
+		_bitmap_build_lovindex(bmstate, indexInfo);
+
+		bmstate->bm_lov_scanKeys =
+			(ScanKey) palloc0(bmstate->bm_tupDesc->natts * sizeof(ScanKeyData));
+
+		for (attno = 0; attno < bmstate->bm_tupDesc->natts; ++attno)
+		{
+			RegProcedure opfuncid;
+			Oid			eq_opr;		/* equality operator */
+			/* Get the equality operator's function */
+			Oid			atttypid =
+				bmstate->bm_tupDesc->attrs[attno]->atttypid;
+
+			get_sort_group_operators(atttypid, false, true, false,
+									 NULL, &eq_opr, NULL, NULL);
+			opfuncid = get_opcode(eq_opr);
+
+			/* Initialise the scan key using a btree */
+			ScanKeyEntryInitialize(&(bmstate->bm_lov_scanKeys[attno]), SK_ISNULL,
+								   attno + 1, BTEqualStrategyNumber, InvalidOid,
+								   DEFAULT_COLLATION_OID, opfuncid, 0);
+		}
+
+		bmstate->bm_lov_scanDesc = index_beginscan(bmstate->bm_lov_heap,
+												   bmstate->bm_lov_index,
+												   SnapshotAny,
+												   bmstate->bm_tupDesc->natts,
+												   0);
+	}
+
+	/*
+	 * Log index creation in WAL iff WAL is needed for archiving or
+	 * streaming replication (XLogIsNeeded) AND the index is WAL-logged.
+	 * XXX: since building an index writes pages to shared buffers, skipping
+	 * WAL is not really safe yet. We will address this shortly.
+	 */
+	bmstate->use_wal = XLogIsNeeded() && RelationNeedsWAL(index);
+
+	/* initialize HOT prebuffer data */
+#ifdef DEBUG_BMI
+	elog(NOTICE, "-[_bitmap_init_buildstate]--------- CP 0");
+#endif
+	bmstate->hot_prebuffer_block = InvalidBlockNumber;
+#if 0
+	MemSet(bmstate->hot_prebuffer_tdn, 0, BM_MAX_HTUP_PER_PAGE * sizeof(uint64));
+#else
+	{ /* mysteriously, MemSet segfaults... :( */
+		int			i;
+		for (i = 0; i < BM_MAX_HTUP_PER_PAGE; i++) {
+			bmstate->hot_prebuffer_tdn[i] = (uint64) 0;
+#ifdef DEBUG_BMI
+			elog(NOTICE, "[_bitmap_init_buildstate]: i == %d", i);
+#endif
+		}
+	}
+#endif
+	bmstate->hot_prebuffer_count = 0;
+#ifdef DEBUG_BMI
+	elog(NOTICE, "-[_bitmap_init_buildstate]--------- CP 99");
+#endif
+}
+
+/*
+ * _bitmap_cleanup_buildstate() -- clean up the build state after
+ * inserting all rows in the heap into the bitmap index.
+ */
+void
+_bitmap_cleanup_buildstate(Relation index, BMBuildState *bmstate,
+ IndexInfo *indexInfo)
+{
+ /* write out remaining tids in bmstate->bm_tidLocsBuffer */
+ BMTidBuildBuf *tidLocsBuffer = bmstate->bm_tidLocsBuffer;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "-----[_bitmap_cleanup_buildstate]----- BEGIN");
+#endif
+#ifdef FIX_GC_3
+ build_inserttuple_flush(index, bmstate);
+#endif
+#ifdef DEBUG_BMI
+ elog(NOTICE, "-----[_bitmap_cleanup_buildstate]----- CP1");
+#endif
+
+ _bitmap_write_alltids(index, tidLocsBuffer, bmstate->use_wal);
+
+ pfree(bmstate->bm_tidLocsBuffer);
+
+ if (cur_bmbuild) /* the hash-table path was used for this build */
+ {
+ MemoryContextDelete(cur_bmbuild->tmpcxt);
+ MemoryContextDelete(cur_bmbuild->hash_cxt);
+ pfree(cur_bmbuild->hash_funcs);
+ pfree(cur_bmbuild->eq_funcs);
+ pfree(cur_bmbuild);
+ cur_bmbuild = NULL;
+
+ /* now fire the deferred index build for the list of values */
+ _bitmap_build_lovindex(bmstate, indexInfo);
+ }
+ else
+ {
+ /*
+ * We might have built an index on a non-hashable data type, in which
+ * case we will have searched the btree manually. Free associated
+ * memory.
+ */
+ index_endscan(bmstate->bm_lov_scanDesc);
+ pfree(bmstate->bm_lov_scanKeys);
+ }
+
+ _bitmap_close_lov_heapandindex(bmstate->bm_lov_heap, bmstate->bm_lov_index,
+ RowExclusiveLock);
+#ifdef DEBUG_BMI
+ elog(NOTICE, "-----[_bitmap_cleanup_buildstate]----- END");
+#endif
+}
+
+/*
+ * _bitmap_init() -- initialize the bitmap index.
+ *
+ * Create the meta page, a new heap which stores the distinct values for
+ * the attributes to be indexed, a btree index on this new heap for searching
+ * those distinct values, and the first VMI page.
+ */
+void
+_bitmap_init(Relation index, bool use_wal)
+{
+	/* BitMap Index Meta Page (first page of the index) */
+	BMMetaPage	metapage;
+	/* First item in the VMI page (set to be NULL) */
+	BMVectorMetaItem vmi;
+
+	/* Buffer and page management */
+	Page		page;			/* temporary page variable */
+	Buffer		metabuf;		/* META information buffer */
+	Buffer		vmibuf;			/* VMI buffer */
+	OffsetNumber vmiOffset;		/* First VMI page offset */
+	OffsetNumber o;				/* temporary offset */
+
+	/* Sanity check (the index MUST be empty) */
+	if (RelationGetNumberOfBlocks(index) != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("cannot initialize non-empty bitmap index \"%s\"",
+						RelationGetRelationName(index))));
+
+	/*
+	 * The first step is to create the META page for the BitMap index, which
+	 * contains some meta-data information about the BM index. The META page
+	 * MUST ALWAYS be the first page (or page 0) and it is identified by the
+	 * macro BM_METAPAGE
+	 */
+
+	/* get a new buffer for the index (META buffer) */
+	metabuf = _bitmap_getbuf(index, P_NEW, BM_WRITE);
+	/* set the page associated with the META buffer */
+	page = BufferGetPage(metabuf);
+	/* check that the page is new */
+	Assert(PageIsNew(page));
+
+	/* Initialise the page by setting its opaque fields (am duty) */
+	_bitmap_init_bitmappage(metabuf);
+
+	/* Get the content of the page (first ItemPointer - see bufpage.h) */
+	metapage = (BMMetaPage) PageGetContents(page);
+
+	/*
+	 * Everything that can fail with an ordinary ERROR (catalog work,
+	 * extending the relation, allocating memory) is done before the
+	 * critical section: inside it, any ERROR is promoted to PANIC.
+	 */
+
+	/* Initialise the META page elements (heap and index) */
+	_bitmap_create_lov_heapandindex(index, &(metapage->bm_lov_heapId),
+									&(metapage->bm_lov_indexId));
+
+	/* get a new buffer for the first VMI page */
+	vmibuf = _bitmap_getbuf(index, P_NEW, BM_WRITE);
+	_bitmap_init_vmipage(vmibuf);
+
+	/* Form the first item, which supports the NULL value */
+	vmi = _bitmap_formitem(0);
+
+	START_CRIT_SECTION();
+
+	/* mark the META buffer contents as dirty */
+	MarkBufferDirty(metabuf);
+
+	/* Log the metapage in case of archiving */
+	if (use_wal)
+		_bitmap_log_metapage(index, page);
+
+	/*
+	 * The second step is to create the first VMI. The very first value is the
+	 * NULL value.
+	 */
+
+	/* mark the VMI buffer contents as dirty */
+	MarkBufferDirty(vmibuf);
+
+	/* Get the page for the first VMI item */
+	page = BufferGetPage(vmibuf);
+	vmiOffset = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+	/*
+	 * XXX: perhaps this could be a special page, with more efficient storage
+	 * after all, we have fixed size data
+	 */
+	o = PageAddItem(page, (Item) vmi, sizeof(BMVectorMetaItemData),
+					vmiOffset, false, false);
+
+	/* we are in a critical section, so this ERROR is promoted to PANIC */
+	if (o == InvalidOffsetNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("failed to add vector meta item to \"%s\"",
+						RelationGetRelationName(index))));
+
+	/* Set the last page for the VMI */
+	metapage->bm_last_vmi_page = BufferGetBlockNumber(vmibuf);
+
+	/* Log that a new VMI has been added to a VMI page */
+	if (use_wal)
+		_bitmap_log_vmi(index, vmibuf, vmiOffset, vmi, metabuf, true);
+
+	END_CRIT_SECTION();
+
+	/* Mark both buffers dirty again and release their locks and pins */
+	_bitmap_wrtbuf(vmibuf);
+	_bitmap_wrtbuf(metabuf);
+
+	pfree(vmi); /* free the item from memory */
+}
+
+/*
+ * _bitmap_build_lovindex() -- index the tuples of the LOV for the first time
+ *
+ * For performance reasons we defer indexing of the LOV tuples when building a
+ * fresh bitmap index when possible. This function lets the caller start that
+ * indexing separately from the creation of the index and insertion of the
+ * tuples. It simply runs index_build() on the LOV heap and LOV index.
+ */
+static void
+_bitmap_build_lovindex(BMBuildState *bmstate, IndexInfo *indexInfo)
+{
+ index_build(bmstate->bm_lov_heap, bmstate->bm_lov_index, indexInfo,
+ false, false);
+}
+
+/*
+ * Hash a build key: an array of cur_bmbuild->natts Datums, one per index
+ * attribute. 'keysize' is unused; the attribute count comes from cur_bmbuild.
+ */
+static uint32
+build_hash_key(const void *key, Size keysize)
+{
+	const Datum *attrs = (const Datum *) key;
+	uint32		result = 0;
+	int			attno = 0;
+
+	while (attno < cur_bmbuild->natts)
+	{
+		uint32		atthash = DatumGetUInt32(FunctionCall1(&cur_bmbuild->hash_funcs[attno],
+														   attrs[attno]));
+		/* rotate the running hash left by one bit, then mix in this one */
+		result = ((result << 1) | (result >> 31)) ^ atthash;
+		attno++;
+	}
+	return result;
+}
+
+/*
+ * Compare two build keys attribute by attribute: return 0 if all attributes
+ * are equal, 1 as soon as one pair differs. The equality functions may
+ * leak, so they run in tmpcxt, which is reset at the start of every call.
+ */
+static int
+build_match_key(const void *key1, const void *key2, Size keysize)
+{
+	const Datum *lhs = (const Datum *) key1;
+	const Datum *rhs = (const Datum *) key2;
+	MemoryContext caller_cxt;
+	int			differ = 0;
+	int			attno;
+
+	MemoryContextReset(cur_bmbuild->tmpcxt);
+	caller_cxt = MemoryContextSwitchTo(cur_bmbuild->tmpcxt);
+
+	for (attno = 0; attno < cur_bmbuild->natts && !differ; attno++)
+	{
+		Datum		equal;
+
+		equal = FunctionCall2(&cur_bmbuild->eq_funcs[attno],
+							  lhs[attno], rhs[attno]);
+		if (!DatumGetBool(equal))
+			differ = 1;			/* they aren't equal */
+	}
+	MemoryContextSwitchTo(caller_cxt);
+	return differ;
+}
diff --git a/src/backend/access/bitmap/bitmapsearch.c b/src/backend/access/bitmap/bitmapsearch.c
new file mode 100644
index 0000000..e964ca36
--- /dev/null
+++ b/src/backend/access/bitmap/bitmapsearch.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmapsearch.c
+ * Search routines for on-disk bitmap index access method.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmapsearch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/tupdesc.h"
+#include "access/bitmap.h"
+#include "storage/lmgr.h"
+#include "parser/parse_oper.h"
+#include "utils/lsyscache.h"
+#include "storage/bufmgr.h" /* for buffer manager functions */
+#include "utils/tqual.h" /* for SnapshotAny */
+
+static void next_batch_words(IndexScanDesc scan);
+static void read_words(Relation rel, Buffer vmiBuffer, OffsetNumber vmiOffset,
+ BlockNumber *nextBlockNoP, BM_WORD *headerWords,
+ BM_WORD *words, uint32 *numOfWordsP,
+ bool *readLastWords);
+/*
+ * _bitmap_first() -- find the bitmap vectors for a scan, then its first tuple.
+ */
+bool
+_bitmap_first(IndexScanDesc scan, ScanDirection dir)
+{
+ BMScanOpaque so;
+ BMScanPosition scanpos;
+
+ _bitmap_findbitmaps(scan, dir);
+ so = (BMScanOpaque) scan->opaque;
+ scanpos = (BMScanPosition) so->bm_currPos;
+ if (scanpos->done) /* a NULL scan key, or no matching vector */
+ return false;
+
+ return _bitmap_next(scan, dir);
+}
+
+/*
+ * _bitmap_next() -- set xs_ctup.t_self to the next matching TID, if any.
+ */
+bool
+_bitmap_next(IndexScanDesc scan, ScanDirection dir)
+{
+ BMScanOpaque so;
+ BMScanPosition scanPos;
+ uint64 nextTid;
+
+ so = (BMScanOpaque) scan->opaque;
+ scanPos = so->bm_currPos;
+
+ if (scanPos->done)
+ return false;
+
+ for (;;)
+ {
+ /*
+ * If there are no more words left from the previous scan, we
+ * try to compute the next batch of words.
+ */
+ if (scanPos->bm_batchWords->nwords == 0 &&
+ scanPos->bm_result.nextTidLoc >= scanPos->bm_result.numOfTids)
+ {
+ _bitmap_reset_batchwords(scanPos->bm_batchWords);
+ scanPos->bm_batchWords->firstTid = scanPos->bm_result.nextTid;
+
+ next_batch_words(scan);
+
+ _bitmap_begin_iterate(scanPos->bm_batchWords, &(scanPos->bm_result));
+ }
+
+ /* If we can not find more words, then this scan is over. */
+ if (scanPos->bm_batchWords->nwords == 0 &&
+ scanPos->bm_result.nextTidLoc >= scanPos->bm_result.numOfTids)
+ return false;
+
+ nextTid = _bitmap_findnexttid(scanPos->bm_batchWords,
+ &(scanPos->bm_result));
+ if (nextTid == 0) /* presumably "no TID found yet"; go round again */
+ continue;
+ else
+ break;
+ }
+
+ Assert((nextTid % BM_MAX_HTUP_PER_PAGE) + 1 > 0); /* NOTE(review): always true for unsigned nextTid */
+
+ ItemPointerSet(&(scan->xs_ctup.t_self), BM_INT_GET_BLOCKNO(nextTid),
+ BM_INT_GET_OFFSET(nextTid));
+ so->cur_pos_valid = true;
+
+ return true;
+}
+
+/*
+ * _bitmap_firstbatchwords() -- find the bitmap vectors for a given scan,
+ * then fetch their first batch of bitmap words.
+ */
+bool
+_bitmap_firstbatchwords(IndexScanDesc scan,
+ ScanDirection dir)
+{
+ _bitmap_findbitmaps(scan, dir);
+
+ return _bitmap_nextbatchwords(scan, dir);
+}
+
+/*
+ * _bitmap_nextbatchwords() -- find the next batch of bitmap words
+ * in a bitmap vector for a given scan.
+ */
+bool
+_bitmap_nextbatchwords(IndexScanDesc scan,
+ ScanDirection dir)
+{
+ BMScanOpaque so;
+
+ so = (BMScanOpaque) scan->opaque;
+
+ /* check if this scan is over */
+ if (so->bm_currPos->done)
+ return false;
+
+ /*
+ * If there are some leftover words from the previous scan, simply
+ * return them.
+ */
+ if (so->bm_currPos->bm_batchWords->nwords > 0)
+ return true;
+
+ next_batch_words(scan); /* may set bm_currPos->done */
+
+ return true;
+}
+
+/*
+ * next_batch_words() -- compute the next batch of bitmap words
+ * from a given scan position; sets 'done' when no vector has words left.
+ */
+static void
+next_batch_words(IndexScanDesc scan)
+{
+ BMScanPosition scanPos;
+ BMVector bmScanPos;
+ int i;
+ BMBatchWords **batches;
+ int numBatches;
+
+ scanPos = ((BMScanOpaque) scan->opaque)->bm_currPos;
+ bmScanPos = scanPos->posvecs;
+
+ batches = (BMBatchWords **)
+ palloc0(scanPos->nvec * sizeof(BMBatchWords *));
+
+ numBatches = 0;
+ /*
+ * Obtains the next batch of words for each bitmap vector.
+ * Ignores those bitmap vectors that contain no new words.
+ */
+ for (i = 0; i < scanPos->nvec; i++)
+ {
+ BMBatchWords *batchWords;
+ batchWords = bmScanPos[i].bm_batchWords;
+
+ /*
+ * If there are no words left from previous scan, read the next
+ * batch of words.
+ */
+ if (bmScanPos[i].bm_batchWords->nwords == 0 &&
+ !(bmScanPos[i].bm_readLastWords))
+ {
+
+ _bitmap_reset_batchwords(batchWords);
+ read_words(scan->indexRelation,
+ bmScanPos[i].bm_vmiBuffer,
+ bmScanPos[i].bm_vmiOffset,
+ &(bmScanPos[i].bm_nextBlockNo),
+ batchWords->hwords,
+ batchWords->cwords,
+ &(batchWords->nwords),
+ &(bmScanPos[i].bm_readLastWords));
+ }
+
+ if (bmScanPos[i].bm_batchWords->nwords > 0)
+ {
+ batches[numBatches] = batchWords;
+ numBatches++;
+ }
+ }
+
+ /*
+ * We handle the case where only one bitmap vector contributes to
+ * the scan separately from the other cases. This is because
+ * bmScanPos->bm_batchWords and scanPos->bm_batchWords
+ * are the same.
+ */
+ if (scanPos->nvec == 1)
+ {
+ if (bmScanPos->bm_batchWords->nwords == 0)
+ scanPos->done = true;
+ pfree(batches);
+ scanPos->bm_batchWords = scanPos->posvecs->bm_batchWords;
+
+ return;
+ }
+
+ /*
+ * At least two bitmap vectors contribute to this scan, so we
+ * OR these bitmap vectors together.
+ */
+ if (numBatches == 0)
+ {
+ scanPos->done = true;
+ pfree(batches);
+ return;
+ }
+
+ _bitmap_union(batches, numBatches, scanPos->bm_batchWords);
+ pfree(batches);
+}
+
+/*
+ * read_words() -- read one block's worth of bitmap words, either from
+ * a bitmap page or from the vector meta item (VMI).
+ *
+ * If *nextBlockNoP is an invalid block number, the last one or two words
+ * are taken from the VMI. Otherwise, words are read from that block.
+ */
+static void
+read_words(Relation rel, Buffer vmiBuffer, OffsetNumber vmiOffset,
+ BlockNumber *nextBlockNoP, BM_WORD *headerWords,
+ BM_WORD *words, uint32 *numOfWordsP, bool *readLastWords)
+{
+ if (BlockNumberIsValid(*nextBlockNoP))
+ {
+ Buffer bitmapBuffer = _bitmap_getbuf(rel, *nextBlockNoP, BM_READ);
+
+ Page bitmapPage;
+ BMBitmapVectorPage bitmap;
+ BMPageOpaque bo;
+
+ bitmapPage = BufferGetPage(bitmapBuffer);
+
+ bitmap = (BMBitmapVectorPage) PageGetContents(bitmapPage);
+ bo = (BMPageOpaque)PageGetSpecialPointer(bitmapPage);
+
+ *numOfWordsP = bo->bm_hrl_words_used;
+ memcpy(headerWords, bitmap->hwords,
+ BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+ memcpy(words, bitmap->cwords, sizeof(BM_WORD) * *numOfWordsP);
+
+ *nextBlockNoP = bo->bm_bitmap_next; /* where the caller continues next time */
+
+ _bitmap_relbuf(bitmapBuffer);
+
+ *readLastWords = false;
+
+ /*
+ * If this is the last bitmap page and the total number of words
+ * in this page is less than or equal to
+ * BM_NUM_OF_HRL_WORDS_PER_PAGE - 2, we read the last two words
+ * and append them into 'headerWords' and 'words'.
+ */
+
+ if ((!BlockNumberIsValid(*nextBlockNoP)) &&
+ (*numOfWordsP <= BM_NUM_OF_HRL_WORDS_PER_PAGE - 2))
+ {
+ BM_WORD cwords[2];
+ BM_WORD hword;
+ BM_WORD tmp;
+ uint32 nwords;
+ int offs;
+
+ read_words(rel, vmiBuffer, vmiOffset, nextBlockNoP, &hword,
+ cwords, &nwords, readLastWords); /* takes the VMI branch below */
+
+ Assert(nwords > 0 && nwords <= 2);
+
+ memcpy(words + *numOfWordsP, cwords, nwords * sizeof(BM_WORD));
+
+ offs = *numOfWordsP / BM_WORD_SIZE;
+ tmp = hword >> *numOfWordsP % BM_WORD_SIZE; /* i.e. hword >> (n % BM_WORD_SIZE) */
+ headerWords[offs] |= tmp;
+
+ if (*numOfWordsP % BM_WORD_SIZE == BM_WORD_SIZE - 1) /* second header bit spills into the next header word */
+ {
+ offs = (*numOfWordsP + 1)/BM_WORD_SIZE;
+ headerWords[offs] |= hword << 1;
+ }
+ *numOfWordsP += nwords;
+ }
+ }
+ else
+ {
+ BMVectorMetaItem vmi;
+ Page vmiPage;
+
+ LockBuffer(vmiBuffer, BM_READ);
+
+ vmiPage = BufferGetPage(vmiBuffer);
+ vmi = (BMVectorMetaItem)
+ PageGetItem(vmiPage, PageGetItemId(vmiPage, vmiOffset));
+
+ if (vmi->bm_last_compword != LITERAL_ALL_ONE)
+ {
+ *numOfWordsP = 2;
+ headerWords[0] = (((BM_WORD) vmi->vmi_words_header) <<
+ (BM_WORD_SIZE-2));
+ words[0] = vmi->bm_last_compword;
+ words[1] = vmi->bm_last_word;
+ }
+ else
+ {
+ *numOfWordsP = 1;
+ headerWords[0] = (((BM_WORD) vmi->vmi_words_header) <<
+ (BM_WORD_SIZE-1));
+ words[0] = vmi->bm_last_word;
+ }
+
+ LockBuffer(vmiBuffer, BUFFER_LOCK_UNLOCK); /* unlock only; no release here */
+ *readLastWords = true;
+ }
+}
+
+/*
+ * _bitmap_findbitmaps() -- find the bitmap vectors that satisfy the
+ * index predicate and set up so->bm_currPos to scan them.
+ */
+void
+_bitmap_findbitmaps(IndexScanDesc scan, ScanDirection dir)
+{
+	BMScanOpaque so;
+	BMScanPosition scanPos;
+	Buffer		metabuf;
+	BMMetaPage	metapage;
+	BlockNumber vmiBlock;
+	OffsetNumber vmiOffset;
+	int			vectorNo, keyNo;
+
+	so = (BMScanOpaque) scan->opaque;
+
+	/* allocate space and initialize values for so->bm_currPos */
+	if (so->bm_currPos == NULL)
+		so->bm_currPos = (BMScanPosition) palloc0(sizeof(BMScanPositionData));
+
+	scanPos = so->bm_currPos;
+	scanPos->nvec = 0;
+	scanPos->done = false;
+	MemSet(&scanPos->bm_result, 0, sizeof(BMIterateResult));
+
+	for (keyNo = 0; keyNo < scan->numberOfKeys; keyNo++)
+	{
+		if (scan->keyData[keyNo].sk_flags & SK_ISNULL)
+		{
+			scanPos->done = true;
+			return;
+		}
+	}
+
+	metabuf = _bitmap_getbuf(scan->indexRelation, BM_METAPAGE, BM_READ);
+	metapage = (BMMetaPage) PageGetContents(BufferGetPage(metabuf));
+
+	/*
+	 * If the values for these keys are all NULL, the bitmap vector
+	 * is accessed through the first VMI. (Currently disabled: if (0).)
+	 */
+	if (0)
+	{
+		vmiBlock = BM_VMI_STARTPAGE;
+		vmiOffset = 1;
+
+		scanPos->posvecs = (BMVector) palloc0(sizeof(BMVectorData));
+
+		_bitmap_initscanpos(scan, scanPos->posvecs, vmiBlock, vmiOffset);
+		scanPos->nvec = 1;
+	}
+	else
+	{
+		Relation	lovHeap, lovIndex;
+		ScanKey		scanKeys;
+		IndexScanDesc scanDesc;
+		BMVMIID		vmiid;
+		List	   *vmiids = NIL;
+		ListCell   *cell;
+
+		/*
+		 * NOTE(review): this used to say the metapage is not locked, but it
+		 * was fetched with BM_READ above and is held until _bitmap_relbuf()
+		 * below. Either way, if these values change underneath us there's
+		 * something much more fundamentally wrong (until VACUUM support).
+		 */
+		_bitmap_open_lov_heapandindex(metapage, &lovHeap, &lovIndex,
+									  AccessShareLock);
+
+		scanKeys = palloc0(scan->numberOfKeys * sizeof(ScanKeyData));
+		for (keyNo = 0; keyNo < scan->numberOfKeys; keyNo++)
+		{
+			ScanKey		scanKey = &scanKeys[keyNo];
+
+#ifdef DEBUG_BMI
+			elog(NOTICE, "initialize scanKey for attno %d",
+				 scan->keyData[keyNo].sk_attno);
+#endif
+			ScanKeyEntryInitialize(scanKey,
+								   scan->keyData[keyNo].sk_flags,
+								   scan->keyData[keyNo].sk_attno,
+								   scan->keyData[keyNo].sk_strategy,
+								   scan->keyData[keyNo].sk_subtype,
+								   scan->keyData[keyNo].sk_collation,
+								   scan->keyData[keyNo].sk_func.fn_oid,
+								   scan->keyData[keyNo].sk_argument);
+		}
+
+		/* XXX: is SnapshotAny really the right choice? */
+		scanDesc = index_beginscan(lovHeap, lovIndex, SnapshotAny,
+								   scan->numberOfKeys, 0);
+		index_rescan(scanDesc, scanKeys, scan->numberOfKeys, NULL, 0);
+
+		/*
+		 * finds all VMI IDs for this scan through lovHeap and lovIndex.
+		 */
+		while (_bitmap_findvalue(lovHeap, lovIndex, scanKeys, scanDesc,
+								 &vmiid))
+		{
+			/*
+			 * We find the VMI ID of one item. Append it into the list.
+			 */
+			BMVMIID    *idCopy = (BMVMIID *) palloc0(sizeof(BMVMIID));
+
+			*idCopy = vmiid;
+			vmiids = lappend(vmiids, idCopy);
+
+			scanPos->nvec++;
+		}
+
+		scanPos->posvecs =
+			(BMVector) palloc0(sizeof(BMVectorData) * scanPos->nvec);
+		vectorNo = 0;
+		foreach(cell, vmiids)
+		{
+			BMVMIID    *_vmiid = (BMVMIID *) lfirst(cell);
+			BMVector	bmScanPos = &(scanPos->posvecs[vectorNo]);
+
+			_bitmap_initscanpos(scan, bmScanPos, _vmiid->block,
+								_vmiid->offset);
+
+			vectorNo++;
+		}
+
+		list_free_deep(vmiids);
+
+		index_endscan(scanDesc);
+		_bitmap_close_lov_heapandindex(lovHeap, lovIndex, AccessShareLock);
+		pfree(scanKeys);
+	}
+
+	_bitmap_relbuf(metabuf);
+
+	if (scanPos->nvec == 0)
+	{
+		scanPos->done = true;
+		return;
+	}
+
+	/*
+	 * If there is only one related bitmap vector, we have
+	 * the scan position's batch words structure point directly to
+	 * the vector's batch words. Otherwise allocate a separate one.
+	 */
+	if (scanPos->nvec == 1)
+		scanPos->bm_batchWords = scanPos->posvecs->bm_batchWords;
+	else
+	{
+		scanPos->bm_batchWords = (BMBatchWords *) palloc0(sizeof(BMBatchWords));
+		_bitmap_init_batchwords(scanPos->bm_batchWords,
+								BM_NUM_OF_HRL_WORDS_PER_PAGE,
+								CurrentMemoryContext);
+	}
+}
+
+/*
+ * _bitmap_initscanpos() -- initialize the BMVector scan state for the
+ * bitmap vector whose VMI is at (vmiBlock, vmiOffset).
+ */
+void
+_bitmap_initscanpos(IndexScanDesc scan, BMVector bmScanPos,
+ BlockNumber vmiBlock, OffsetNumber vmiOffset)
+{
+ Page vmiPage;
+ BMVectorMetaItem vmi;
+
+ bmScanPos->bm_vmiOffset = vmiOffset;
+ bmScanPos->bm_vmiBuffer = _bitmap_getbuf(scan->indexRelation, vmiBlock,
+ BM_READ);
+
+ vmiPage = BufferGetPage(bmScanPos->bm_vmiBuffer);
+ vmi = (BMVectorMetaItem)
+ PageGetItem(vmiPage, PageGetItemId(vmiPage, bmScanPos->bm_vmiOffset));
+
+ bmScanPos->bm_nextBlockNo = vmi->bm_bitmap_head; /* start reading at the vector's head block */
+ bmScanPos->bm_readLastWords = false;
+ bmScanPos->bm_batchWords = (BMBatchWords *) palloc0(sizeof(BMBatchWords));
+ _bitmap_init_batchwords(bmScanPos->bm_batchWords,
+ BM_NUM_OF_HRL_WORDS_PER_PAGE,
+ CurrentMemoryContext);
+
+ LockBuffer(bmScanPos->bm_vmiBuffer, BUFFER_LOCK_UNLOCK); /* unlock only; bm_vmiBuffer is not released here */
+}
+
+/*
+ * _bitmap_get_null_vmiid() -- return the vmiid of the all-nulls entry: the
+ * first item on BM_VMI_STARTPAGE. 'index' is currently unused.
+ */
+void
+_bitmap_get_null_vmiid(Relation index, BMVMIID *vmiid)
+{
+ vmiid->block = BM_VMI_STARTPAGE;
+ vmiid->offset = 1;
+}
diff --git a/src/backend/access/bitmap/bitmaputil.c b/src/backend/access/bitmap/bitmaputil.c
new file mode 100644
index 0000000..cefaf8b
--- /dev/null
+++ b/src/backend/access/bitmap/bitmaputil.c
@@ -0,0 +1,2017 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmaputil.c
+ * Utility routines for on-disk bitmap index access method.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmaputil.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/bitmap.h"
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/reloptions.h"
+#include "storage/bufmgr.h" /* for buffer manager functions */
+#include "utils/tqual.h" /* for SnapshotAny */
+#include "utils/rel.h" /* for RelationGetDescr */
+
+/*
+ * Struct to pass basic vacuum info around
+ *
+ * _bitmap_vacuum() fills one of these for every bitmap vector it visits and
+ * passes it (by value) to vacuum_vector().
+ */
+typedef struct bmvacinfo
+{
+ IndexVacuumInfo *info; /* vacuum info as received by _bitmap_vacuum() */
+ BMVectorMetaItem vmi; /* VMI of the vector being vacuumed */
+} bmvacinfo;
+
+/*
+ * What are we vacuuming?
+ *
+ * NOTE(review): presumably BM_VAC_PAGE denotes a word stored on a bitmap
+ * page, while the other two denote the VMI's bm_last_compword and
+ * bm_last_word respectively -- confirm against vactype_get_word().
+ */
+typedef enum bmVacType
+{
+ BM_VAC_PAGE,
+ BM_VAC_LAST_COMPWORD,
+ BM_VAC_LAST_WORD
+} bmVacType;
+
+/*
+ * State structure to store a bunch of useful variables we need to pass
+ * around to bitmap vacuum sub routines.
+ *
+ * vacuum_vector() initialises one of these per bitmap vector.
+ */
+typedef struct bmvacstate
+{
+ BMVectorMetaItem curvmi; /* VMI for the current vector */
+ /* variables marking interesting positions in the vector */
+ uint64 cur_bitpos; /* the hypothetical location we're at */
+ uint64 last_setbit; /* last bit set as a match */
+ uint64 start_reaped; /* NOTE(review): presumably the first bit of the
+ * current run of reaped TIDs -- confirm */
+ uint16 readwordno; /* current word we're reading */
+ uint16 writewordno; /* current word we're writing */
+
+ /* block position at the physical level */
+ BlockNumber itr_blk;
+
+ Buffer curbuf; /* current buffer we're examining */
+
+ /* actual bitmap pages for old and new */
+ BMBitmapVectorPage curbm; /* contents of the page held in curbuf */
+ BMPageOpaque curbmo; /* special space of the page held in curbuf */
+
+ bool page_updated; /* have we dirtied the page */
+
+ /* callback info */
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+
+ /* overflow storage */
+ BMBitmapVectorPageData ovrflw;
+ int32 ovrflwwordno;
+} bmvacstate;
+
+/* Helpers for walking and combining batches of bitmap words */
+static void _bitmap_findnextword(BMBatchWords* words, uint32 nextReadNo);
+static void _bitmap_resetWord(BMBatchWords *words, uint32 prevStartNo);
+static uint8 _bitmap_find_bitset(BM_WORD word, uint8 lastPos);
+/* Helpers for vacuuming a bitmap vector; entry point is vacuum_vector() */
+static void vacuum_vector(bmvacinfo vacinfo, IndexBulkDeleteCallback callback,
+ void *callback_state);
+static void fill_reaped(bmvacstate *state, uint64 start, uint64 end);
+static void vacuum_fill_word(bmvacstate *state, bmVacType vactype);
+static void vacuum_literal_word(bmvacstate *state, bmVacType vavtype);
+static void try_shrink_bitmap(bmvacstate *state);
+static void progress_write_pos(bmvacstate *state);
+static void vacuum_last_words(bmvacstate *state);
+static BM_WORD vactype_get_word(bmvacstate *state, bmVacType type);
+static void put_vacuumed_literal_word(bmvacstate *state, bmVacType type,
+ BM_WORD word);
+static void vacuum_append_ovrflw_words(bmvacstate *state);
+
+/*
+ * _bitmap_formitem() -- construct a vector meta item (VMI).
+ *
+ * The item is palloc'd in the current memory context and describes a vector
+ * that has no bitmap pages yet.
+ *
+ * If the given tid number is greater than BM_WORD_SIZE, we construct the
+ * first fill word for this bitmap vector: a zero fill covering the words
+ * that precede 'currTidNumber'. One fill word can cover at most
+ * MAX_FILL_LENGTH words; if more are needed, bm_last_setbit is set to the
+ * number of bits already covered so that the remaining zeros can be handled
+ * by the caller.
+ */
+BMVectorMetaItem
+_bitmap_formitem(uint64 currTidNumber)
+{
+	/* Allocate a new vector meta item */
+	BMVectorMetaItem vmi = (BMVectorMetaItem) palloc(BM_VECTOR_META_ITEM_SIZE);
+
+#ifdef DEBUG_BMI
+	elog(NOTICE, "[_bitmap_formitem] BEGIN"
+		 "\n\tcurrTidNumber = " UINT64_FORMAT,
+		 currTidNumber
+		);
+#endif
+
+	/* Initialise the VMI as an empty vector without a fill word */
+	vmi->bm_bitmap_head = vmi->bm_bitmap_tail = InvalidBlockNumber;
+	vmi->bm_last_setbit = 0;
+	vmi->bm_last_compword = LITERAL_ALL_ONE;
+	vmi->bm_last_word = LITERAL_ALL_ZERO;
+	vmi->vmi_words_header = BM_VMI_WORDS_NO_FILL;
+	vmi->bm_last_tid_location = 0;
+
+	/* fill up all existing bits with 0. */
+	if (currTidNumber > BM_WORD_SIZE)
+	{
+		/*
+		 * The word count is derived from a uint64 tid number, so keep it in
+		 * 64 bits; a 32-bit counter would silently truncate for very large
+		 * tid numbers.
+		 */
+		uint64		numOfTotalFillWords;
+		BM_WORD		numOfFillWords;
+		uint64		numOfFillBits;
+
+		numOfTotalFillWords = (currTidNumber - 1) / BM_WORD_SIZE;
+
+		/* Clamp to what a single fill word is able to represent. */
+		numOfFillWords = (numOfTotalFillWords >= MAX_FILL_LENGTH) ?
+			MAX_FILL_LENGTH : (BM_WORD) numOfTotalFillWords;
+
+		/* Widen before multiplying so the bit count cannot overflow. */
+		numOfFillBits = ((uint64) numOfFillWords) * BM_WORD_SIZE;
+
+		/* bm_last_word is already LITERAL_ALL_ZERO from the code above */
+		vmi->bm_last_compword = BM_MAKE_FILL_WORD(0, numOfFillWords);
+		vmi->vmi_words_header = BM_LAST_COMPWORD_BIT;
+		vmi->bm_last_tid_location = numOfFillBits;
+
+		/*
+		 * If all zeros are too many to fit in one word, then we set
+		 * bm_last_setbit so that the remaining zeros can be handled outside.
+		 */
+		if (numOfTotalFillWords > numOfFillWords)
+			vmi->bm_last_setbit = numOfFillBits;
+	}
+
+#ifdef DEBUG_BMI
+	elog(NOTICE, "[_bitmap_formitem] END");
+#endif
+	return vmi;
+}
+
+/*
+ * _bitmap_init_batchwords() -- prepare an empty BMBatchWords whose word
+ * arrays live in the given memory context.
+ *
+ * Room is allocated (zeroed) for 'maxNumOfWords' content words and for the
+ * matching number of header words. The read counters are reset; 'firstTid'
+ * is not touched by this function.
+ */
+void
+_bitmap_init_batchwords(BMBatchWords* words,
+						uint32 maxNumOfWords,
+						MemoryContext mcxt)
+{
+	uint32		nheaders = BM_CALC_H_WORDS(maxNumOfWords);
+	MemoryContext saved;
+
+	/* nothing buffered, nothing read; the next word to read is word 1 */
+	words->nwords = 0;
+	words->startNo = 0;
+	words->nextread = 1;
+	words->nwordsread = 0;
+	words->maxNumOfWords = maxNumOfWords;
+
+	/* Make sure that we have at least one page of words */
+	Assert(words->maxNumOfWords >= BM_NUM_OF_HRL_WORDS_PER_PAGE);
+
+	/* the arrays must outlive the caller's context switch, so use mcxt */
+	saved = MemoryContextSwitchTo(mcxt);
+	words->hwords = palloc0(sizeof(BM_WORD) * nheaders);
+	words->cwords = palloc0(sizeof(BM_WORD) * words->maxNumOfWords);
+	MemoryContextSwitchTo(saved);
+}
+
+/*
+ * _bitmap_copy_batchwords() -- duplicate the state of 'words' into
+ * 'copyWords'.
+ *
+ * Nothing is allocated here: 'copyWords' must already own hwords/cwords
+ * arrays big enough for words->maxNumOfWords content words and the
+ * corresponding header words.
+ */
+void
+_bitmap_copy_batchwords(BMBatchWords* words, BMBatchWords* copyWords)
+{
+	uint32		nheaders;
+
+	/* scalar bookkeeping first */
+	copyWords->nwords = words->nwords;
+	copyWords->startNo = words->startNo;
+	copyWords->firstTid = words->firstTid;
+	copyWords->nextread = words->nextread;
+	copyWords->nwordsread = words->nwordsread;
+	copyWords->maxNumOfWords = words->maxNumOfWords;
+
+	/* then the full header and content arrays */
+	nheaders = BM_CALC_H_WORDS(copyWords->maxNumOfWords);
+
+	memcpy(copyWords->cwords, words->cwords,
+		   sizeof(BM_WORD) * copyWords->maxNumOfWords);
+	memcpy(copyWords->hwords, words->hwords,
+		   sizeof(BM_WORD) * nheaders);
+}
+
+/*
+ * _bitmap_reset_batchwords() -- empty a BMBatchWords so it can be refilled.
+ *
+ * Only the buffered-word bookkeeping (startNo, nwords) and the header words
+ * are cleared; the content words and the read counters are left as they are.
+ */
+void
+_bitmap_reset_batchwords(BMBatchWords *words)
+{
+	uint32		nheaders = BM_CALC_H_WORDS(words->maxNumOfWords);
+
+	words->nwords = 0;
+	words->startNo = 0;
+	MemSet(words->hwords, 0, sizeof(BM_WORD) * nheaders);
+}
+
+/*
+ * _bitmap_cleanup_batchwords() -- release spaces allocated for the BMBatchWords.
+ *
+ * Frees the header and content word arrays, if present. The BMBatchWords
+ * struct itself is not freed; that is left to the caller. A NULL 'words' is
+ * accepted and ignored.
+ *
+ * The array pointers are reset to NULL once freed. A BMBatchWords can be
+ * reachable through more than one pointer (a scan position with a single
+ * vector shares the vector's batch words), so this makes a second cleanup
+ * of the same struct harmless instead of a double free.
+ */
+void
+_bitmap_cleanup_batchwords(BMBatchWords* words)
+{
+	if (words == NULL)
+		return;
+
+	/* pfree() must not be handed a NULL pointer, hence the checks */
+	if (words->hwords)
+	{
+		pfree(words->hwords);
+		words->hwords = NULL;
+	}
+	if (words->cwords)
+	{
+		pfree(words->cwords);
+		words->cwords = NULL;
+	}
+}
+
+/*
+ * _bitmap_cleanup_scanpos() -- tear down an array of 'numBitmapVectors'
+ * BMVector entries.
+ *
+ * For each entry the VMI buffer is released (when valid) and the batch words
+ * are freed; finally the array itself is freed. With zero vectors nothing is
+ * done at all, not even freeing 'bmScanPos'.
+ */
+void
+_bitmap_cleanup_scanpos(BMVector bmScanPos, uint32 numBitmapVectors)
+{
+	BMVector	vec;
+	BMVector	end;
+
+	if (numBitmapVectors == 0)
+		return;
+
+	end = bmScanPos + numBitmapVectors;
+	for (vec = bmScanPos; vec < end; vec++)
+	{
+		if (BufferIsValid(vec->bm_vmiBuffer))
+			ReleaseBuffer(vec->bm_vmiBuffer);
+
+		/* free the word arrays first, then the struct holding them */
+		_bitmap_cleanup_batchwords(vec->bm_batchWords);
+		if (vec->bm_batchWords != NULL)
+			pfree(vec->bm_batchWords);
+	}
+
+	pfree(bmScanPos);
+}
+
+/*
+ * _bitmap_findnexttid() -- hand out the next tid location from a batch of
+ * bitmap words.
+ *
+ * Tids are served from result->nextTids[]; when that array is used up it is
+ * refilled with up to BM_BATCH_TIDS entries. Returns 0 when no tid is left.
+ */
+uint64
+_bitmap_findnexttid(BMBatchWords *words, BMIterateResult *result)
+{
+	/* everything from the previous computation consumed? fetch more */
+	if (result->nextTidLoc >= result->numOfTids)
+		_bitmap_findnexttids(words, result, BM_BATCH_TIDS);
+
+	/* no more tids */
+	if (result->nextTidLoc >= result->numOfTids)
+		return 0;
+
+	/* return the first unconsumed one and step past it */
+	return result->nextTids[result->nextTidLoc++];
+}
+
+/*
+ * _bitmap_findprevtid() -- step back by one entry in the array of tids, so
+ * that the tid handed out last is returned again by the next lookup.
+ *
+ * Must not be called when positioned at the start of the array.
+ */
+void
+_bitmap_findprevtid(BMIterateResult *result)
+{
+	Assert(result->nextTidLoc > 0);
+	result->nextTidLoc -= 1;
+}
+
+/*
+ * _bitmap_findnexttids() -- find the next set of tids from a given
+ * batch of bitmap words.
+ *
+ * The maximum number of tids to be found is defined in 'maxTids'.
+ *
+ * The tids found are stored in result->nextTids[] and counted in
+ * result->numOfTids; result->nextTidLoc is reset to 0. Every word that has
+ * been scanned completely is consumed by decrementing words->nwords and
+ * advancing result->lastScanWordNo. A fill word of set bits is expanded one
+ * word at a time and its stored value is decremented in place, so a later
+ * call continues with whatever is left of it.
+ */
+void
+_bitmap_findnexttids(BMBatchWords *words, BMIterateResult *result,
+ uint32 maxTids)
+{
+ bool done = false;
+
+ result->nextTidLoc = result->numOfTids = 0;
+ while (words->nwords > 0 && result->numOfTids < maxTids && !done)
+ {
+ uint8 oldScanPos = result->lastScanPos;
+ BM_WORD word = words->cwords[result->lastScanWordNo];
+
+ /* new word, zero filled */
+ if (oldScanPos == 0 &&
+ ((IS_FILL_WORD(words->hwords, result->lastScanWordNo) &&
+ GET_FILL_BIT(word) == 0) || word == 0))
+ {
+ uint32 fillLength;
+ /* a word equal to 0 is treated as covering exactly one word */
+ if (word == 0)
+ fillLength = 1;
+ else
+ fillLength = FILL_LENGTH(word);
+
+ /* skip over non-matches */
+ result->nextTid += fillLength * BM_WORD_SIZE;
+ result->lastScanWordNo++;
+ words->nwords--;
+ result->lastScanPos = 0;
+ continue;
+ }
+ else if (IS_FILL_WORD(words->hwords, result->lastScanWordNo)
+ && GET_FILL_BIT(word) == 1)
+ {
+ uint32 nfillwords = FILL_LENGTH(word);
+ uint8 bitNo;
+
+ /* expand only while a whole word worth of tids still fits */
+ while (result->numOfTids + BM_WORD_SIZE <= maxTids &&
+ nfillwords > 0)
+ {
+ /* explain the fill word */
+ for (bitNo = 0; bitNo < BM_WORD_SIZE; bitNo++)
+ result->nextTids[result->numOfTids++] = ++result->nextTid;
+
+ nfillwords--;
+ /* update fill word to reflect expansion */
+ words->cwords[result->lastScanWordNo]--;
+ }
+
+ if (nfillwords == 0)
+ {
+ /* the fill word is used up, move on to the next word */
+ result->lastScanWordNo++;
+ words->nwords--;
+ result->lastScanPos = 0;
+ continue;
+ }
+ else
+ {
+ /* out of room; the shortened fill word stays for the next call */
+ done = true;
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * Literal word. BM_WORD_SIZE + 1 is used as a temporary marker for
+ * "start of word", because 0 is the value that ends the loop below.
+ */
+ if (oldScanPos == 0)
+ oldScanPos = BM_WORD_SIZE + 1;
+
+ while (oldScanPos != 0 && result->numOfTids < maxTids)
+ {
+ BM_WORD w;
+
+ if (oldScanPos == BM_WORD_SIZE + 1)
+ oldScanPos = 0;
+
+ w = words->cwords[result->lastScanWordNo];
+ result->lastScanPos = _bitmap_find_bitset(w, oldScanPos);
+
+ /* did we find a bit set in this word? */
+ if (result->lastScanPos != 0)
+ {
+ /* advance by the distance between the two bit positions */
+ result->nextTid += (result->lastScanPos - oldScanPos);
+ result->nextTids[result->numOfTids++] = result->nextTid;
+ }
+ else
+ {
+ /* no further set bit: account for the rest of this word */
+ result->nextTid += BM_WORD_SIZE - oldScanPos;
+ /* start scanning a new word */
+ words->nwords--;
+ result->lastScanWordNo++;
+ result->lastScanPos = 0;
+ }
+ oldScanPos = result->lastScanPos;
+ }
+ }
+ }
+}
+
+/*
+ * _bitmap_intersect() is dead code because streaming intersects
+ * PagetableEntry structures, not raw batch words. It's possible we may
+ * want to intersect batches later though -- it would definitely improve
+ * streaming of intersections.
+ */
+
+#ifdef NOT_USED
+
+/*
+ * _bitmap_intersect() -- intersect 'numBatches' bitmap words.
+ *
+ * All 'numBatches' bitmap words are HRL compressed. The result
+ * bitmap words are HRL compressed, except that fill set words (1s) may
+ * be lossily compressed.
+ *
+ * Words are appended to 'result' until it is full or one of the inputs runs
+ * out of words. On return, the 'nextread' of every input batch is set to
+ * the position of the next uncompressed word to be read.
+ */
+void
+_bitmap_intersect(BMBatchWords **batches, uint32 numBatches,
+				  BMBatchWords *result)
+{
+	bool		done = false;
+	uint32	   *prevStartNos;
+	uint32		nextReadNo;
+	uint32		batchNo;
+
+	Assert(numBatches > 0);
+
+	/* save batch->startNo for each input so a partial step can be undone */
+	prevStartNos = (uint32 *)palloc0(numBatches * sizeof(uint32));
+	nextReadNo = batches[0]->nextread;
+
+	while (!done && result->nwords < result->maxNumOfWords)
+	{
+		BM_WORD		andWord = LITERAL_ALL_ONE;
+		BM_WORD		word;
+
+		bool		andWordIsLiteral = true;
+
+		/*
+		 * We walk through the bitmap word in each list one by one
+		 * without de-compress the bitmap words. 'nextReadNo' defines
+		 * the position of the next word that should be read in an
+		 * uncompressed format.
+		 */
+		for (batchNo = 0; batchNo < numBatches; batchNo++)
+		{
+			uint32		offs;
+			BMBatchWords *bch = batches[batchNo];
+
+			/* skip nextReadNo - nwordsread - 1 words */
+			_bitmap_findnextword(bch, nextReadNo);
+
+			if (bch->nwords == 0)
+			{
+				done = true;
+				break;
+			}
+
+			Assert(bch->nwordsread == nextReadNo - 1);
+
+			/* Here, startNo should point to the word to be read. */
+			offs = bch->startNo;
+			word = bch->cwords[offs];
+
+			if (CUR_WORD_IS_FILL(bch) && (GET_FILL_BIT(word) == 0))
+			{
+				/*
+				 * A fill of zeros decides the result for its whole length,
+				 * so the remaining batches need not be looked at.
+				 */
+				uint32		n;
+
+				bch->nwordsread += FILL_LENGTH(word);
+
+				n = bch->nwordsread - nextReadNo + 1;
+				andWord = BM_MAKE_FILL_WORD(0, n);
+				andWordIsLiteral = false;
+
+				nextReadNo = bch->nwordsread + 1;
+				bch->startNo++;
+				bch->nwords--;
+				break;
+			}
+			else if (CUR_WORD_IS_FILL(bch) && (GET_FILL_BIT(word) == 1))
+			{
+				/* a fill of ones leaves andWord alone; consume one word */
+				bch->nwordsread++;
+
+				prevStartNos[batchNo] = bch->startNo;
+
+				if (FILL_LENGTH(word) == 1)
+				{
+					bch->startNo++;
+					bch->nwords--;
+				}
+				else
+				{
+					uint32		s = bch->startNo;
+
+					bch->cwords[s]--;
+				}
+				andWordIsLiteral = true;
+			}
+			else if (!CUR_WORD_IS_FILL(bch))
+			{
+				/* literal word */
+				prevStartNos[batchNo] = bch->startNo;
+
+				andWord &= word;
+				bch->nwordsread++;
+				bch->startNo++;
+				bch->nwords--;
+				andWordIsLiteral = true;
+			}
+		}
+
+		/* Since there are not enough words in this attribute break this loop */
+		if (done)
+		{
+			uint32		preBatchNo;
+
+			/* reset the attributes before batchNo */
+			for (preBatchNo = 0; preBatchNo < batchNo; preBatchNo++)
+			{
+				_bitmap_resetWord(batches[preBatchNo], prevStartNos[preBatchNo]);
+			}
+			break;
+		}
+		else
+		{
+			if (!andWordIsLiteral)
+			{
+				/* Word is not literal, update the result header */
+				uint32		off = result->nwords/BM_WORD_SIZE;
+				uint32		w = result->nwords;
+
+				result->hwords[off] |= WORDNO_GET_HEADER_BIT(w);
+			}
+			result->cwords[result->nwords] = andWord;
+			result->nwords++;
+		}
+
+		if (andWordIsLiteral)
+			nextReadNo++;
+
+		/*
+		 * We just processed the last batch and it was empty. This is the
+		 * same test as in _bitmap_union(); the batch is reached through
+		 * 'batches' because the loop-local 'bch' is out of scope here.
+		 */
+		if (batchNo == numBatches - 1 && batches[batchNo]->nwords == 0)
+			done = true;
+	}
+
+	/* set the nextReadNo */
+	for (batchNo = 0; batchNo < numBatches; batchNo++)
+		batches[batchNo]->nextread = nextReadNo;
+
+	pfree(prevStartNos);
+}
+
+#endif /* NOT_USED */
+
+/*
+ * _bitmap_union() -- union 'numBatches' bitmaps
+ *
+ * All bitmap words are HRL compressed. The result bitmap words are also
+ * HRL compressed, except that fill unset words may be lossily compressed.
+ *
+ * Words are appended to 'result' until it holds result->maxNumOfWords words
+ * or one of the inputs runs out of words. On return, the 'nextread' of every
+ * input batch is set to the position of the next uncompressed word to read.
+ * With numBatches == 0 nothing is done.
+ */
+void
+_bitmap_union(BMBatchWords **batches, uint32 numBatches, BMBatchWords *result)
+{
+ bool done = false;
+ uint32 *prevstarts;
+ uint32 nextReadNo;
+ uint32 batchNo;
+
+ /* NOTE(review): numBatches is unsigned, so this assertion cannot fail */
+ Assert (numBatches >= 0);
+
+ if (numBatches == 0)
+ return;
+
+ /* save batch->startNo for each input bitmap vector */
+ prevstarts = (uint32 *)palloc0(numBatches * sizeof(uint32));
+
+ /*
+ * Each batch should have the same next read offset, so take
+ * the first one
+ */
+ nextReadNo = batches[0]->nextread;
+
+ while (!done && result->nwords < result->maxNumOfWords)
+ {
+ BM_WORD orWord = LITERAL_ALL_ZERO;
+ BM_WORD word;
+ bool orWordIsLiteral = true;
+
+ for (batchNo = 0; batchNo < numBatches; batchNo++)
+ {
+ BMBatchWords *bch = batches[batchNo];
+
+ /* skip nextReadNo - nwordsread - 1 words */
+ _bitmap_findnextword(bch, nextReadNo);
+
+ /* this input has no more words: stop and undo this round below */
+ if (bch->nwords == 0)
+ {
+ done = true;
+ break;
+ }
+
+ Assert(bch->nwordsread == nextReadNo - 1);
+
+ /* Here, startNo should point to the word to be read. */
+ word = bch->cwords[bch->startNo];
+
+ if (CUR_WORD_IS_FILL(bch) && GET_FILL_BIT(word) == 1)
+ {
+ /* Fill word represents matches */
+ /*
+ * It decides the result for its whole length, so the remaining
+ * batches are not examined in this round.
+ */
+ bch->nwordsread += FILL_LENGTH(word);
+ orWord = BM_MAKE_FILL_WORD(1, bch->nwordsread - nextReadNo + 1);
+ orWordIsLiteral = false;
+
+ nextReadNo = bch->nwordsread + 1;
+ bch->startNo++;
+ bch->nwords--;
+ break;
+ }
+ else if (CUR_WORD_IS_FILL(bch) && GET_FILL_BIT(word) == 0)
+ {
+ /* Fill word represents no matches */
+
+ /* consume a single word of the fill; orWord is unaffected */
+ bch->nwordsread++;
+ prevstarts[batchNo] = bch->startNo;
+ if (FILL_LENGTH(word) == 1)
+ {
+ bch->startNo++;
+ bch->nwords--;
+ }
+ else
+ bch->cwords[bch->startNo]--;
+ orWordIsLiteral = true;
+ }
+ else if (!CUR_WORD_IS_FILL(bch))
+ {
+ /* word is literal */
+ prevstarts[batchNo] = bch->startNo;
+ orWord |= word;
+ bch->nwordsread++;
+ bch->startNo++;
+ bch->nwords--;
+ orWordIsLiteral = true;
+ }
+ }
+
+ if (done)
+ {
+ uint32 i;
+
+ /* reset the attributes before batchNo */
+ for (i = 0; i < batchNo; i++)
+ _bitmap_resetWord(batches[i], prevstarts[i]);
+ break;
+ }
+ else
+ {
+ if (!orWordIsLiteral)
+ {
+ /* Word is not literal, update the result header */
+ uint32 offs = result->nwords/BM_WORD_SIZE;
+ uint32 n = result->nwords;
+ result->hwords[offs] |= WORDNO_GET_HEADER_BIT(n);
+ }
+ result->cwords[result->nwords] = orWord;
+ result->nwords++;
+ }
+
+ /* a fill result already moved nextReadNo past the whole fill */
+ if (orWordIsLiteral)
+ nextReadNo++;
+
+ /* we just processed the last batch and it was empty */
+ /*
+ * NOTE(review): after the for loop runs to completion batchNo equals
+ * numBatches, so this test can only succeed when the loop was left
+ * early at the last batch -- confirm that this is what is intended.
+ */
+ if (batchNo == numBatches - 1 && batches[batchNo]->nwords == 0)
+ done = true;
+ }
+
+ /* set the next word to read for all input vectors */
+ for (batchNo = 0; batchNo < numBatches; batchNo++)
+ batches[batchNo]->nextread = nextReadNo;
+
+ pfree(prevstarts);
+}
+
+/*
+ * _bitmap_findnextword() -- Find the next word whose position is
+ * 'nextReadNo' in an uncompressed format.
+ *
+ * Words are consumed from 'words' until words->nwordsread reaches
+ * nextReadNo - 1 or no buffered word is left. A fill word that extends
+ * beyond the target position is shortened in place rather than consumed.
+ */
+static void
+_bitmap_findnextword(BMBatchWords *words, uint32 nextReadNo)
+{
+ /*
+ * 'words->nwordsread' defines how many un-compressed words
+ * have been read in this bitmap. We read from
+ * position 'startNo', and increment 'words->nwordsread'
+ * differently based on the type of words that are read, until
+ * 'words->nwordsread' is equal to 'nextReadNo'.
+ */
+ while (words->nwords > 0 && words->nwordsread < nextReadNo - 1)
+ {
+ /* Get the current word */
+ BM_WORD word = words->cwords[words->startNo];
+
+ if (CUR_WORD_IS_FILL(words))
+ {
+ /* does the whole fill lie before the target position? */
+ if (FILL_LENGTH(word) <= (nextReadNo - words->nwordsread - 1))
+ {
+ words->nwordsread += FILL_LENGTH(word);
+ words->startNo++;
+ words->nwords--;
+ }
+ else
+ {
+ /* skip only part of the fill: reduce its stored value */
+ words->cwords[words->startNo] -= (nextReadNo - words->nwordsread - 1);
+ words->nwordsread = nextReadNo - 1;
+ }
+ }
+ else
+ {
+ /* a literal word stands for exactly one uncompressed word */
+ words->nwordsread++;
+ words->startNo++;
+ words->nwords--;
+ }
+ }
+}
+
+/*
+ * _bitmap_resetWord() -- undo the reading of one word from a BMBatchWords.
+ *
+ * 'prevStartNo' is the value startNo had before the word was read. If
+ * startNo has moved on since, it is put back and the word counts as
+ * buffered again. If it has not moved, the word read was part of a fill
+ * word that had been shortened, so the fill word is lengthened again. In
+ * both cases one uncompressed word less has been read.
+ */
+static void
+_bitmap_resetWord(BMBatchWords *words, uint32 prevStartNo)
+{
+	if (words->startNo <= prevStartNo)
+	{
+		/* same position: give the word back to the fill word */
+		Assert(words->startNo == prevStartNo);
+		Assert(CUR_WORD_IS_FILL(words));
+		words->cwords[words->startNo]++;
+	}
+	else
+	{
+		/* we had stepped over exactly one word: step back */
+		Assert(words->startNo == prevStartNo + 1);
+		words->nwords++;
+		words->startNo = prevStartNo;
+	}
+
+	words->nwordsread--;
+}
+
+
+/*
+ * _bitmap_find_bitset() -- find the first set bit (bit=1) in the given word
+ * at a position greater than 'lastPos'.
+ *
+ * Positions are counted from the right: the rightmost bit of the word is
+ * position 1 and the leftmost bit is position BM_WORD_SIZE. Pass 0 as
+ * 'lastPos' to search the whole word.
+ *
+ * Returns the position found, or 0 if there is no such set bit.
+ */
+static uint8
+_bitmap_find_bitset(BM_WORD word, uint8 lastPos)
+{
+	uint8		pos = lastPos + 1;
+	BM_WORD		mask;
+
+	/* nothing left to look at beyond the leftmost bit */
+	if (pos > BM_WORD_SIZE)
+		return 0;
+
+	/* slide a one-bit mask leftwards, starting just after lastPos */
+	for (mask = (((BM_WORD) 1) << (pos - 1));
+		 pos <= BM_WORD_SIZE;
+		 mask <<= 1, pos++)
+	{
+		if ((word & mask) != 0)
+			return pos;
+	}
+
+	return 0;
+}
+
+/*
+ * _bitmap_begin_iterate() -- prepare 'result' for iterating over the tids
+ * contained in 'words'.
+ *
+ * The iteration starts at the first buffered word (words->startNo) with tid
+ * words->firstTid, and with an empty array of found tids.
+ */
+void
+_bitmap_begin_iterate(BMBatchWords *words, BMIterateResult *result)
+{
+	/* no tids have been extracted yet */
+	result->nextTidLoc = 0;
+	result->numOfTids = 0;
+
+	/* scan position: before the first bit of the first buffered word */
+	result->lastScanWordNo = words->startNo;
+	result->lastScanPos = 0;
+	result->nextTid = words->firstTid;
+}
+
+
+/*
+ * _bitmap_log_newpage() -- write a WAL record for a new page.
+ *
+ * This function is called before writing a new buffer. The record carries
+ * the relation's file node and the block number of 'buf'; 'info' is passed
+ * on to XLogInsert() as the record's info value. The page's LSN is set to
+ * the position of the record.
+ */
+void
+_bitmap_log_newpage(Relation rel, uint8 info, Buffer buf)
+{
+	xl_bm_newpage xlrec;
+	XLogRecData rdata[1];
+	XLogRecPtr	lsn;
+
+	xlrec.bm_new_blkno = BufferGetBlockNumber(buf);
+	xlrec.bm_node = rel->rd_node;
+
+	/* a single data chunk that is not associated with any buffer */
+	rdata[0].data = (char *)&xlrec;
+	rdata[0].len = sizeof(xl_bm_newpage);
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].next = NULL;
+
+	lsn = XLogInsert(RM_BITMAP_ID, info, rdata);
+
+	PageSetLSN(BufferGetPage(buf), lsn);
+}
+
+/*
+ * _bitmap_log_metapage() -- log the changes to the metapage
+ *
+ * Writes an XLOG_BITMAP_INSERT_META record holding the relation's file node
+ * together with the LOV heap id, LOV index id and last VMI page found in the
+ * metapage contents of 'page', then sets the LSN of 'page' to the position
+ * of the record.
+ */
+void
+_bitmap_log_metapage(Relation rel, Page page)
+{
+	BMMetaPage	metapage = (BMMetaPage) PageGetContents(page);
+
+	xl_bm_metapage* xlMeta;
+	XLogRecPtr	recptr;
+	XLogRecData rdata[1];
+
+	/*
+	 * The record is logged with a MAXALIGN'ed length, which may be larger
+	 * than the struct itself. Allocate it zeroed so that neither the
+	 * alignment tail nor any padding inside the struct puts uninitialized
+	 * memory into the WAL.
+	 */
+	xlMeta = (xl_bm_metapage *)
+		palloc0(MAXALIGN(sizeof(xl_bm_metapage)));
+	xlMeta->bm_node = rel->rd_node;
+	xlMeta->bm_lov_heapId = metapage->bm_lov_heapId;
+	xlMeta->bm_lov_indexId = metapage->bm_lov_indexId;
+	xlMeta->bm_last_vmi_page = metapage->bm_last_vmi_page;
+
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].data = (char*)xlMeta;
+	rdata[0].len = MAXALIGN(sizeof(xl_bm_metapage));
+	rdata[0].next = NULL;
+
+	recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_META, rdata);
+
+	PageSetLSN(page, recptr);
+	pfree(xlMeta);
+}
+
+/*
+ * _bitmap_log_bitmap_lastwords() -- log the last two words in a bitmap.
+ *
+ * The record holds bm_last_compword, bm_last_word and vmi_words_header of
+ * the given VMI, plus the block number and offset identifying where that
+ * VMI is stored. The LSN of the VMI page is set to the record's position.
+ */
+void
+_bitmap_log_bitmap_lastwords(Relation rel, Buffer vmiBuffer,
+ OffsetNumber vmiOffset, BMVectorMetaItem vmi)
+{
+ xl_bm_bitmap_lastwords xlLastwords;
+ XLogRecPtr recptr;
+ XLogRecData rdata[1];
+
+ /* what to restore ... */
+ xlLastwords.bm_node = rel->rd_node;
+ xlLastwords.bm_last_compword = vmi->bm_last_compword;
+ xlLastwords.bm_last_word = vmi->bm_last_word;
+ xlLastwords.vmi_words_header = vmi->vmi_words_header;
+ /* ... and where */
+ xlLastwords.bm_vmi_blkno = BufferGetBlockNumber(vmiBuffer);
+ xlLastwords.bm_vmi_offset = vmiOffset;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char*)&xlLastwords;
+ rdata[0].len = sizeof(xl_bm_bitmap_lastwords);
+ rdata[0].next = NULL;
+
+ recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_BITMAP_LASTWORDS,
+ rdata);
+
+ PageSetLSN(BufferGetPage(vmiBuffer), recptr);
+}
+
+/*
+ * _bitmap_log_vmi() -- log adding a new VMI to a VMI page.
+ *
+ * The record contains a full copy of the VMI along with the block number
+ * and offset it is stored at. 'is_new_vmi_blkno' is recorded as well; when
+ * it is true the LSN of the metapage in 'metabuf' is updated in addition to
+ * the LSN of the VMI page.
+ */
+void
+_bitmap_log_vmi(Relation rel, Buffer vmiBuffer, OffsetNumber offset,
+ BMVectorMetaItem vmi, Buffer metabuf, bool is_new_vmi_blkno)
+{
+ Page vmiPage = BufferGetPage(vmiBuffer);
+
+ xl_bm_vmi xlVmi;
+ XLogRecPtr recptr;
+ XLogRecData rdata[1];
+
+ xlVmi.bm_node = rel->rd_node;
+ xlVmi.bm_vmi_blkno = BufferGetBlockNumber(vmiBuffer);
+ xlVmi.bm_vmi_offset = offset;
+ /* the VMI is embedded in the record by value */
+ memcpy(&(xlVmi.bm_vmi), vmi, sizeof(BMVectorMetaItemData));
+ xlVmi.bm_is_new_vmi_blkno = is_new_vmi_blkno;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &xlVmi;
+ rdata[0].len = sizeof(xl_bm_vmi);
+ rdata[0].next = NULL;
+
+ recptr = XLogInsert(RM_BITMAP_ID,
+ XLOG_BITMAP_INSERT_VMI, rdata);
+
+ /* metabuf is only looked at when a new VMI block is involved */
+ if (is_new_vmi_blkno)
+ {
+ Page metapage = BufferGetPage(metabuf);
+
+ PageSetLSN(metapage, recptr);
+ }
+
+ PageSetLSN(vmiPage, recptr);
+}
+
+/*
+ * _bitmap_log_bitmapwords() -- log new bitmap words to be inserted.
+ *
+ * The record consists of a fixed xl_bm_bitmapwords header followed by three
+ * arrays taken from 'buf', in this order:
+ *   - buf->curword last-tid values (uint64),
+ *   - buf->curword content words,
+ *   - BM_CALC_H_WORDS(buf->curword) header words.
+ *
+ * The LSNs of both the bitmap page and the VMI page are set to the position
+ * of the record.
+ */
+void
+_bitmap_log_bitmapwords(Relation rel, Buffer bitmapBuffer, Buffer vmiBuffer,
+ OffsetNumber vmiOffset, BMTIDBuffer* buf,
+ uint64 words_written, uint64 tidnum,
+ BlockNumber nextBlkno,
+ bool isLast, bool isFirst)
+{
+ Page bitmapPage;
+ BMPageOpaque bitmapPageOpaque;
+ xl_bm_bitmapwords *xlBitmapWords;
+ XLogRecPtr recptr;
+ XLogRecData rdata[1];
+ uint64 *lastTids;
+ BM_WORD *cwords;
+ BM_WORD *hwords;
+ int lastTids_size;
+ int cwords_size;
+ int hwords_size;
+ Page vmiPage = BufferGetPage(vmiBuffer);
+
+ /* sizes in bytes of the three variable-length arrays */
+ lastTids_size = buf->curword * sizeof(uint64);
+ cwords_size = buf->curword * sizeof(BM_WORD);
+ hwords_size = (BM_CALC_H_WORDS(buf->curword)) *
+ sizeof(BM_WORD);
+
+ bitmapPage = BufferGetPage(bitmapBuffer);
+ bitmapPageOpaque =
+ (BMPageOpaque)PageGetSpecialPointer(bitmapPage);
+
+ /* header and arrays are laid out in one zeroed chunk */
+ xlBitmapWords = (xl_bm_bitmapwords *)
+ palloc0(sizeof(xl_bm_bitmapwords) + lastTids_size +
+ cwords_size + hwords_size);
+
+ xlBitmapWords->bm_node = rel->rd_node;
+ xlBitmapWords->bm_blkno = BufferGetBlockNumber(bitmapBuffer);
+ xlBitmapWords->bm_next_blkno = nextBlkno;
+ xlBitmapWords->bm_last_tid = bitmapPageOpaque->bm_last_tid_location;
+ xlBitmapWords->bm_vmi_blkno = BufferGetBlockNumber(vmiBuffer);
+ xlBitmapWords->bm_vmi_offset = vmiOffset;
+ xlBitmapWords->bm_last_compword = buf->last_compword;
+ xlBitmapWords->bm_last_word = buf->last_word;
+ xlBitmapWords->vmi_words_header =
+ (buf->is_last_compword_fill) ?
+ BM_LAST_COMPWORD_BIT : BM_VMI_WORDS_NO_FILL;
+ xlBitmapWords->bm_last_setbit = tidnum;
+ xlBitmapWords->bm_is_last = isLast;
+ xlBitmapWords->bm_is_first = isFirst;
+
+ xlBitmapWords->bm_start_wordno = buf->start_wordno;
+ xlBitmapWords->bm_words_written = words_written;
+ xlBitmapWords->bm_num_cwords = buf->curword;
+ /* first array: the last-tid values, right after the header */
+ lastTids = (uint64*)(((char*)xlBitmapWords) +
+ sizeof(xl_bm_bitmapwords));
+ memcpy(lastTids, buf->last_tids,
+ buf->curword * sizeof(uint64));
+
+ /* second array: the content words */
+ cwords = (BM_WORD*)(((char*)xlBitmapWords) +
+ sizeof(xl_bm_bitmapwords) + lastTids_size);
+ memcpy(cwords, buf->cwords, cwords_size);
+ /* third array: the header words */
+ hwords = (BM_WORD*)(((char*)xlBitmapWords) +
+ sizeof(xl_bm_bitmapwords) + lastTids_size +
+ cwords_size);
+ memcpy(hwords, buf->hwords, hwords_size);
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char*)xlBitmapWords;
+ rdata[0].len = sizeof(xl_bm_bitmapwords) + lastTids_size +
+ cwords_size + hwords_size;
+ rdata[0].next = NULL;
+
+ recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_WORDS, rdata);
+
+ PageSetLSN(bitmapPage, recptr);
+
+ PageSetLSN(vmiPage, recptr);
+
+ pfree(xlBitmapWords);
+}
+
+/*
+ * _bitmap_log_updateword() -- write a WAL record for the update of a single
+ * word in a given bitmap page.
+ *
+ * The record holds the current value of content word 'word_no' on the page
+ * and of the header word covering it. The page's LSN is set to the position
+ * of the record.
+ */
+void
+_bitmap_log_updateword(Relation rel, Buffer bitmapBuffer, int word_no)
+{
+	Page		page = BufferGetPage(bitmapBuffer);
+	BMBitmapVectorPage vec = (BMBitmapVectorPage) PageGetContents(page);
+	xl_bm_updateword xlrec;
+	XLogRecData rdata[1];
+	XLogRecPtr	lsn;
+
+	/* which word ... */
+	xlrec.bm_node = rel->rd_node;
+	xlrec.bm_blkno = BufferGetBlockNumber(bitmapBuffer);
+	xlrec.bm_word_no = word_no;
+
+	/* ... and its new content, read back from the page */
+	xlrec.bm_hword = vec->hwords[word_no/BM_WORD_SIZE];
+	xlrec.bm_cword = vec->cwords[word_no];
+
+	rdata[0].data = (char*)&xlrec;
+	rdata[0].len = sizeof(xl_bm_updateword);
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].next = NULL;
+
+	lsn = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_UPDATEWORD, rdata);
+
+	PageSetLSN(page, lsn);
+}
+
+/*
+ * _bitmap_log_updatewords() -- log updating bitmap words in one or
+ * two bitmap pages.
+ *
+ * For 'firstBuffer', and for 'secondBuffer' when it is valid, the record
+ * holds all content words and header words of the page along with its last
+ * tid location and number of words used. bm_next_blkno is taken from the
+ * last page logged.
+ *
+ * If secondBuffer is Invalid, we only update one page; bm_two_pages is false
+ * and the second-page fields of the record are zero.
+ *
+ * The LSN of every logged page is set to the position of the record, and so
+ * is the LSN of the VMI page when 'new_lastpage' is true. 'vmiOffset' is not
+ * used.
+ */
+void
+_bitmap_log_updatewords(Relation rel, Buffer vmiBuffer, OffsetNumber vmiOffset,
+						Buffer firstBuffer, Buffer secondBuffer,
+						bool new_lastpage)
+{
+	Page		firstPage = NULL;
+	Page		secondPage = NULL;
+	BMBitmapVectorPage firstBitmap;
+	BMBitmapVectorPage secondBitmap;
+	BMPageOpaque firstOpaque;
+	BMPageOpaque secondOpaque;
+
+	xl_bm_updatewords xlBitmapWords;
+	XLogRecPtr	recptr;
+	XLogRecData rdata[1];
+
+	/*
+	 * The record lives on the stack and is logged in full. Clear it first so
+	 * that the second-page fields (when only one page is logged) and any
+	 * padding do not carry uninitialized stack contents into the WAL.
+	 */
+	MemSet(&xlBitmapWords, 0, sizeof(xl_bm_updatewords));
+
+	xlBitmapWords.bm_node = rel->rd_node;
+
+	/* first page: always present */
+	firstPage = BufferGetPage(firstBuffer);
+	firstBitmap = (BMBitmapVectorPage) PageGetContents(firstPage);
+	firstOpaque = (BMPageOpaque)PageGetSpecialPointer(firstPage);
+	xlBitmapWords.bm_two_pages = false;
+	xlBitmapWords.bm_first_blkno = BufferGetBlockNumber(firstBuffer);
+	memcpy(&xlBitmapWords.bm_first_cwords,
+		   firstBitmap->cwords,
+		   BM_NUM_OF_HRL_WORDS_PER_PAGE * sizeof(BM_WORD));
+	memcpy(&xlBitmapWords.bm_first_hwords,
+		   firstBitmap->hwords,
+		   BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+	xlBitmapWords.bm_first_last_tid = firstOpaque->bm_last_tid_location;
+	xlBitmapWords.bm_first_num_cwords =
+		firstOpaque->bm_hrl_words_used;
+	xlBitmapWords.bm_next_blkno = firstOpaque->bm_bitmap_next;
+
+	/* second page: optional */
+	if (BufferIsValid(secondBuffer))
+	{
+		secondPage = BufferGetPage(secondBuffer);
+		secondBitmap = (BMBitmapVectorPage) PageGetContents(secondPage);
+		secondOpaque = (BMPageOpaque)PageGetSpecialPointer(secondPage);
+
+		xlBitmapWords.bm_two_pages = true;
+		xlBitmapWords.bm_second_blkno = BufferGetBlockNumber(secondBuffer);
+
+		memcpy(&xlBitmapWords.bm_second_cwords,
+			   secondBitmap->cwords,
+			   BM_NUM_OF_HRL_WORDS_PER_PAGE * sizeof(BM_WORD));
+		memcpy(&xlBitmapWords.bm_second_hwords,
+			   secondBitmap->hwords,
+			   BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+		xlBitmapWords.bm_second_last_tid = secondOpaque->bm_last_tid_location;
+		xlBitmapWords.bm_second_num_cwords =
+			secondOpaque->bm_hrl_words_used;
+		/* the next-block link now comes from the second page */
+		xlBitmapWords.bm_next_blkno = secondOpaque->bm_bitmap_next;
+	}
+
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].data = (char*)&xlBitmapWords;
+	rdata[0].len = sizeof(xl_bm_updatewords);
+	rdata[0].next = NULL;
+
+	recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_UPDATEWORDS, rdata);
+
+	PageSetLSN(firstPage, recptr);
+
+	if (BufferIsValid(secondBuffer))
+	{
+		PageSetLSN(secondPage, recptr);
+	}
+
+	if (new_lastpage)
+	{
+		Page		vmiPage = BufferGetPage(vmiBuffer);
+
+		PageSetLSN(vmiPage, recptr);
+	}
+}
+
+/*
+ * bmoptions() -- parse the reloptions given for a bitmap index.
+ *
+ * Argument 0 is the reloptions datum, argument 1 tells whether to validate.
+ * The work is delegated to default_reloptions() with RELOPT_KIND_BITMAP;
+ * its result is returned as a bytea, or SQL NULL if it returned nothing.
+ */
+Datum
+bmoptions(PG_FUNCTION_ARGS)
+{
+	Datum		reloptions = PG_GETARG_DATUM(0);
+	bool		validate = PG_GETARG_BOOL(1);
+	bytea	   *parsed;
+
+	parsed = default_reloptions(reloptions, validate, RELOPT_KIND_BITMAP);
+	if (!parsed)
+		PG_RETURN_NULL();
+
+	PG_RETURN_BYTEA_P(parsed);
+}
+
+/*
+ * Vacuum tuples out of a bitmap index.
+ *
+ * Scans the LOV heap named in the index metapage and, for every tuple
+ * returned, locates the bitmap vector's VMI and calls vacuum_vector() on it
+ * with the given callback.
+ *
+ * The metapage buffer is held from start to end. 'stats' is not used by the
+ * code below.
+ */
+
+void
+_bitmap_vacuum(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Buffer metabuf;
+ BMMetaPage metapage;
+ Relation index = info->index;
+ bmvacinfo vacinfo;
+ HeapScanDesc scan;
+ Relation lovheap;
+ HeapTuple tuple;
+
+ vacinfo.info = info;
+
+ metabuf = _bitmap_getbuf(info->index, BM_METAPAGE, BM_READ);
+ metapage = (BMMetaPage)PageGetContents(BufferGetPage(metabuf));
+
+ /* the scan uses SnapshotAny and no scan keys */
+ lovheap = heap_open(metapage->bm_lov_heapId, AccessShareLock);
+ scan = heap_beginscan(lovheap, SnapshotAny, 0, NULL);
+
+ while ((tuple = heap_getnext(scan, ForwardScanDirection)))
+ {
+ BMVectorMetaItem vmi;
+ BlockNumber vmi_block;
+ OffsetNumber vmi_off;
+ TupleDesc desc;
+ Datum d;
+ bool isnull;
+ Buffer vmi_buf;
+ Page page;
+
+ desc = RelationGetDescr(lovheap);
+
+ /*
+ * Attribute numbers are 1-based: natts - 1 and natts - 0 are the last
+ * two columns of the LOV heap, read here as the VMI's block number and
+ * offset.
+ */
+ d = heap_getattr(tuple, desc->natts - 1, desc, &isnull);
+ Assert(!isnull);
+ vmi_block = DatumGetInt32(d);
+
+ d = heap_getattr(tuple, desc->natts - 0, desc, &isnull);
+ Assert(!isnull);
+ vmi_off = DatumGetInt16(d);
+
+ /* vmi points into the VMI page, which was fetched with BM_READ */
+ vmi_buf = _bitmap_getbuf(index, vmi_block, BM_READ);
+ page = BufferGetPage(vmi_buf);
+ vmi = (BMVectorMetaItem)
+ PageGetItem(page, PageGetItemId(page, vmi_off));
+ vacinfo.vmi = vmi;
+
+#ifdef DEBUG_BMI
+ elog(NOTICE, "---- start vac");
+ elog(NOTICE, "value = %i", (int)heap_getattr(tuple, 1, desc, &isnull));
+#endif
+ vacuum_vector(vacinfo, callback, callback_state);
+ _bitmap_relbuf(vmi_buf);
+ }
+
+ /* XXX: be careful to vacuum NULL vector */
+
+ /*vacuum null vector */
+
+ /* iterate over all vectors, call for each */
+
+ /* track free pages over calls, shrink when necessary */
+ /* truncate if necessary */
+
+ heap_endscan(scan);
+ heap_close(lovheap, AccessShareLock);
+
+ _bitmap_relbuf(metabuf);
+}
+
+/*
+ * Vacuum a single bitmap vector.
+ *
+ * We traverse the vector page by page, checking if any TIDs have been
+ * reaped (via the callback). There are three cases:
+ *
+ * If a given word is literal and a TID is reaped, we flip the bit
+ * from 1 to 0. If the whole word, when we finish, is 0, transform
+ * it to a compressed word. If the previous word is compressed and is
+ * non-match fill, we merge these two words.
+ *
+ * If the word is non-match fill, we first test if we can merge it with the
+ * previous word. If not, we just copy it directly.
+ *
+ * If the word is match fill, we must iterate through the range and see if
+ * any TIDs have been reaped. If we find a reaped TID, we have to break up
+ * the fill word. This will result in 2 or 3 words replacing one word. That
+ * is, if the first TID in the range is reaped, we'll create a new literal
+ * word and have a fill word representing the rest of the range. If the
+ * reaped TID is in the middle of the fill, we'll have an initial word
+ * representing the initial matches, a literal word with the non-match and
+ * then a following word of matches again.
+ *
+ * It's reasonable that reaped TIDs might be clustered. So, instead of
+ * breaking the fill word straight away, we construct a word for the initial
+ * part of the range and then set a flag to indicate that we found a reaped
+ * TID. Once we find a non-reaped TID, we construct a word representing the
+ * non-match range and then continue processing the rest of the range.
+ *
+ * When we shrink or grow the index, we must physically shift data on the
+ * page. Shrinking is not a problem, we just maintain it logically. If we
+ * need to grow the storage (because we create a non-match in a match fill
+ * word) we either absorb the space between the logical write position and
+ * the read position OR, if there is no such space, we must pull data off
+ * the end of the page into 'overflow' storage and push the remaining data
+ * toward the end of the page.
+ *
+ * vacinfo supplies the index relation (vacinfo.info->index) and the vector
+ * meta item (vacinfo.vmi) whose bitmap pages are walked, starting at
+ * bm_bitmap_head and following bm_bitmap_next. callback/callback_state are
+ * the bulk-delete callback pair; the callback is asked about each set bit.
+ *
+ * XXX: overflow words are only reported, not yet merged into the next page,
+ * and bm_hrl_words_used is not adjusted after a page has been compacted.
+ * NOTE(review): vacuum_last_words() runs after the last page has been
+ * released, while state.curbm still points into it -- confirm that this is
+ * safe or keep the page pinned.
+ */
+static void
+vacuum_vector(bmvacinfo vacinfo, IndexBulkDeleteCallback callback,
+			  void *callback_state)
+{
+	bmvacstate	state;
+
+	state.cur_bitpos = 1;		/* first bit position of the vector */
+	state.readwordno = 0;
+	state.writewordno = 0;
+
+	state.callback = callback;
+	state.callback_state = callback_state;
+
+	state.itr_blk = vacinfo.vmi->bm_bitmap_head;
+	state.curbuf = InvalidBuffer;
+
+	state.ovrflwwordno = 0;
+
+	state.curvmi = vacinfo.vmi;
+
+	if (!BlockNumberIsValid(state.itr_blk))
+	{
+		elog(NOTICE, "invalid head, compword: %i, lastword: %i",
+			 vacinfo.vmi->bm_last_compword, vacinfo.vmi->bm_last_word);
+		return;
+	}
+
+	/* The outer loop iterates over each block in the vector */
+	do
+	{
+		Page		page;
+
+		/*
+		 * The buffer of the previous block (if any) was released at the
+		 * bottom of the loop, so always fetch the block we are about to
+		 * process. Reusing the old handle would make us look at a page we
+		 * no longer hold and never advance along the chain.
+		 */
+		state.curbuf = _bitmap_getbuf(vacinfo.info->index, state.itr_blk,
+									  BM_WRITE);
+		page = BufferGetPage(state.curbuf);
+		state.curbm = (BMBitmapVectorPage) PageGetContents(page);
+		state.curbmo = (BMPageOpaque) PageGetSpecialPointer(page);
+
+		/* word numbers index this page's cwords[], so restart them */
+		state.readwordno = 0;
+		state.writewordno = 0;
+
+#ifdef DEBUG_BMI
+		elog(NOTICE, "words used: %i, comp %i, last %i",
+			 state.curbmo->bm_hrl_words_used,
+			 vacinfo.vmi->bm_last_compword, vacinfo.vmi->bm_last_word);
+#endif
+		while (state.readwordno < state.curbmo->bm_hrl_words_used)
+		{
+			if (IS_FILL_WORD(state.curbm->hwords, state.readwordno))
+				vacuum_fill_word(&state, BM_VAC_PAGE);
+			else
+				vacuum_literal_word(&state, BM_VAC_PAGE);
+
+			state.readwordno++;
+			state.writewordno++;
+
+			/*
+			 * If we're reached the last word, see if there are any overflows
+			 * and if so, merge them back into the page.
+			 */
+			vacuum_append_ovrflw_words(&state);
+		}
+
+		if (state.ovrflwwordno)
+		{
+			/*
+			 * XXX: the overflow must be merged into the next page, which
+			 * has to be write locked first so that no one can miss the data
+			 * held privately by us. Not implemented yet; just report it.
+			 */
+			elog(NOTICE, "---------");
+			elog(NOTICE, "overflows!");
+			elog(NOTICE, "---------");
+		}
+
+		/* remember where to go next before letting go of this page */
+		state.itr_blk = state.curbmo->bm_bitmap_next;
+		_bitmap_wrtbuf(state.curbuf);
+		state.curbuf = InvalidBuffer;
+	} while (BlockNumberIsValid(state.itr_blk));
+
+	vacuum_last_words(&state);
+}
+/*
+ * fill_matched() -- write out a run of surviving matches.
+ *
+ * Does nothing unless start is non-zero and start < end. Otherwise the
+ * run length is taken as end - start - 1. If that is at least
+ * BM_WORD_SIZE, a match fill word of len / BM_WORD_SIZE is stored at the
+ * current write position, try_shrink_bitmap() is called and the write
+ * position is advanced; the remainder (len % BM_WORD_SIZE), if any, is then
+ * handled bit by bit as a literal.
+ *
+ * NOTE(review): the literal loop accumulates bits in the local 'newword'
+ * but never stores it back into cwords[], so those matches look lost --
+ * confirm.
+ * NOTE(review): the literal loop runs up to state->cur_bitpos, not 'end'.
+ * NOTE(review): when i is a multiple of BM_WORD_SIZE the shift count
+ * (i % BM_WORD_SIZE) - 1 is negative.
+ * NOTE(review): if neither HAVE_LONG_INT_64 nor HAVE_LONG_LONG_INT_64 is
+ * defined the elog() call below has no conversion and no comma; both
+ * conversions are signed while the argument is uint64.
+ */
+static void
+fill_matched(bmvacstate *state, uint64 start, uint64 end)
+{
+ if (start && start < end)
+ {
+ BM_WORD newword;
+ uint64 len = end - start - 1;	/* zero when end == start + 1 */
+
+ Assert(len > 0);	/* NOTE(review): trips for the zero-length case above */
+
+ if (len >= BM_WORD_SIZE)
+ {
+ /* set the new fill length */
+
+ elog(NOTICE, "shrinking fill word. old len = new len = "
+#ifdef HAVE_LONG_INT_64
+ "%li",
+#elif defined(HAVE_LONG_LONG_INT_64)
+ "%lli",
+#endif
+ len / BM_WORD_SIZE);
+
+ newword = BM_MAKE_FILL_WORD(1, len / BM_WORD_SIZE);
+ state->curbm->cwords[state->writewordno] = newword;
+ HEADER_SET_FILL_BIT_ON(state->curbm->hwords,
+ state->writewordno);
+
+ try_shrink_bitmap(state);
+ progress_write_pos(state);
+
+ len = len % BM_WORD_SIZE;	/* bits left over after the fill word */
+ start = end - len;
+ newword = 0;
+ }
+
+ if (len)
+ {
+ uint64 i;
+
+ /*
+ * We need to create a literal representation of the
+ * matches. If the current write word is fill, create
+ * a new word. If the existing word is literal, merge
+ * in our matches.
+ */
+ if (IS_FILL_WORD(state->curbm->hwords,
+ state->writewordno))
+ progress_write_pos(state);
+
+ newword = state->curbm->cwords[state->writewordno];
+
+ for (i = start; i < state->cur_bitpos; i++)
+ {
+ if ((i % BM_WORD_SIZE) == 0)
+ {
+ progress_write_pos(state);
+ newword = state->curbm->cwords[state->writewordno];
+ }
+ newword |= 1 << ((i % BM_WORD_SIZE) - 1);	/* int-width shift; see header note */
+ }
+ try_shrink_bitmap(state);
+ }
+ }
+}
+/*
+ * fill_reaped() -- account for a run of reaped (now non-matching) bits.
+ *
+ * The run length is end - start. Two cases:
+ *
+ * Short run (len < BM_WORD_SIZE): no word is stored explicitly. If the
+ * write word is a fill word the write position is advanced; if it is a
+ * literal, the write position is advanced only when the run does not fit
+ * in the rest of that word. progress_write_pos() leaves the new word as
+ * an all-zero literal.
+ *
+ * Long run: a non-match fill word of len / BM_WORD_SIZE is built. If the
+ * write word is a non-match fill (or zero) the lengths are combined,
+ * spilling into a new fill word when MAX_FILL_LENGTH would be exceeded,
+ * and the write position is advanced once more if len % BM_WORD_SIZE
+ * bits remain. If the write word is a literal, the write position is
+ * advanced and the fill word is stored there.
+ *
+ * NOTE(review): in the long-run case nothing happens when the write word
+ * is a non-zero match fill -- confirm that this is intended.
+ * NOTE(review): the spill computation subtracts the raw word value, not
+ * FILL_LENGTH() of it.
+ */
+static void
+fill_reaped(bmvacstate *state, uint64 start, uint64 end)
+{
+ uint64 len = end - start;
+
+ elog(NOTICE, "inserting fill worked for "
+ "reaped tids, from (%i, %i) to (%i, %i)",
+ BM_INT_GET_BLOCKNO(start),
+ BM_INT_GET_OFFSET(start),
+ BM_INT_GET_BLOCKNO(end),
+ BM_INT_GET_OFFSET(end));
+ if (len < BM_WORD_SIZE)
+ {
+ if (IS_FILL_WORD(state->curbm->hwords, state->writewordno))
+ {
+ /*
+ * NOTE(review): the second operand below has no "== 0", unlike the
+ * otherwise identical test in the long-run branch. Both arms do the
+ * same thing, so the outcome is unaffected.
+ */
+ if (GET_FILL_BIT(state->curbm->cwords[state->writewordno]) == 0 ||
+ state->curbm->cwords[state->writewordno])
+ {
+ /*
+ * Consuming a word is enough to insert the fill, because the
+ * length is less than BM_WORD_SIZE.
+ */
+ progress_write_pos(state);
+ }
+ else
+ {
+ /* match fill */
+ progress_write_pos(state);
+ }
+ }
+ else
+ {
+ /*
+ * Literal word. Check if we can pull all the non-matches in this
+ * word.
+ */
+ if (((start % BM_WORD_SIZE) - 1) + len > BM_WORD_SIZE)
+ {
+ progress_write_pos(state);
+ }
+ }
+ }
+ else
+ {
+ /* Fill and non-fill word */
+ BM_WORD fillword = BM_MAKE_FILL_WORD(0, len / BM_WORD_SIZE);
+ if (IS_FILL_WORD(state->curbm->hwords, state->writewordno))
+ {
+ if (GET_FILL_BIT(state->curbm->cwords[state->writewordno]) == 0 ||
+ state->curbm->cwords[state->writewordno] == 0)
+ {
+ BM_WORD curword = state->curbm->cwords[state->writewordno];
+ if (FILL_LENGTH(curword) + FILL_LENGTH(fillword) <
+ MAX_FILL_LENGTH)
+ {
+ state->curbm->cwords[state->writewordno] += fillword;	/* extend the existing fill */
+ }
+ else
+ {
+ fillword -= MAX_FILL_LENGTH -
+ state->curbm->cwords[state->writewordno];
+ progress_write_pos(state);
+ state->curbm->cwords[state->writewordno] = fillword;
+ HEADER_SET_FILL_BIT_ON(state->curbm->hwords,
+ state->writewordno);
+ }
+ len = len % BM_WORD_SIZE;
+ if (len)
+ {
+ progress_write_pos(state);	/* leftover bits go into a fresh all-zero literal */
+ }
+ }
+ }
+ else
+ {
+ progress_write_pos(state);
+ state->curbm->cwords[state->writewordno] = fillword;
+ HEADER_SET_FILL_BIT_ON(state->curbm->hwords,
+ state->writewordno);
+ }
+ }
+}
+/*
+ * vacuum_fill_word() -- vacuum one fill word.
+ *
+ * The word is fetched with vactype_get_word(), so it is either the page
+ * word at the read position or one of the two trailing words kept in the
+ * vector meta item.
+ *
+ * A non-match fill on a page is copied to the write position (when that
+ * differs from the read position), after which the write position is
+ * advanced and try_shrink_bitmap() is called.
+ *
+ * A match fill is walked bit by bit: every position in the range is turned
+ * into a TID and passed to the callback. Runs of survivors are emitted
+ * with fill_matched() and runs of reaped TIDs with fill_reaped(). If
+ * nothing was reaped the word is copied unchanged (page words only).
+ *
+ * NOTE(review): a non-match fill with vactype other than BM_VAC_PAGE takes
+ * neither branch, so nothing is done for it.
+ * NOTE(review): for BM_VAC_LAST_COMPWORD / BM_VAC_LAST_WORD the result is
+ * read back from state->curbm; confirm that page is still valid when this
+ * is called from vacuum_last_words().
+ */
+static void
+vacuum_fill_word(bmvacstate *state, bmVacType vactype)
+{
+ BM_WORD word = vactype_get_word(state, vactype);
+
+ /*
+ * If the fill word is for non-matches, we do not have to
+ * check if there are any reaped TIDs. There wont be by
+ * definition.
+ */
+ elog(NOTICE, "new word is FILL: %i", word);
+
+ /* only vacuum non-match fill words if this is a physical page */
+ if (GET_FILL_BIT(word) == 0 && vactype == BM_VAC_PAGE)
+ {
+ /* skip over */
+ elog(NOTICE, "new word is non-match fill");
+
+ /* if the current read pos is the same as the write pos, do nothing */
+ if (state->readwordno != state->writewordno)
+ {
+ state->curbm->cwords[state->writewordno] = word;
+ HEADER_SET_FILL_BIT_ON(state->curbm->hwords, state->writewordno);
+ }
+ progress_write_pos(state);
+ try_shrink_bitmap(state);
+ }
+ else if (GET_FILL_BIT(word) == 1)
+ {
+ /*
+ * XXX: the call back does not take advantage of the
+ * fact that we know that this is a range of matches. In
+ * the future, we might have a different call back which
+ * can use a better search algorithm. For now, play
+ * dumb.
+ */
+ bool found_reaped = false;
+ uint64 start_setbit = state->cur_bitpos; /* start of match range */
+ uint64 end = state->cur_bitpos +
+ (FILL_LENGTH(word) * BM_WORD_SIZE);	/* one past the last position covered */
+
+ state->start_reaped = 0;
+
+ elog(NOTICE, "new word is match fill");
+
+ elog(NOTICE, "testing match fill range. start = (%i, %i) "
+ "end = (%i, %i)",
+ BM_INT_GET_BLOCKNO(state->cur_bitpos),
+ BM_INT_GET_OFFSET(state->cur_bitpos),
+ BM_INT_GET_BLOCKNO(end),
+ BM_INT_GET_OFFSET(end));
+
+ while (state->cur_bitpos < end)
+ {
+ ItemPointerData tid;
+
+ ItemPointerSet(&tid, BM_INT_GET_BLOCKNO(state->cur_bitpos),
+ BM_INT_GET_OFFSET(state->cur_bitpos));
+
+ elog(NOTICE, "testing fill tid (%i, %i)",
+ BM_INT_GET_BLOCKNO(state->cur_bitpos),
+ BM_INT_GET_OFFSET(state->cur_bitpos));
+
+ if (state->callback(&tid, state->callback_state))
+ {
+ elog(NOTICE, "tid is reaped");
+ /*
+ * We found a match. We don't just break the fill
+ * word in to three. Instead, we shrink the
+ * original fill word and continue to loop
+ * until we find a set bit which isn't reaped,
+ * then we'll add a word reflecting the non-matches.
+ */
+
+ found_reaped = true;
+
+ fill_matched(state, start_setbit, state->cur_bitpos - 1);
+ if (!state->start_reaped)
+ {
+ state->start_reaped = state->cur_bitpos;	/* first reaped position of this run */
+ start_setbit = 0;
+ }
+ }
+ else
+ {
+ /*
+ * If we're already seen a range of reaped TIDs, fill those in
+ * on the current word.
+ */
+ if (state->start_reaped &&
+ state->start_reaped < state->cur_bitpos - 1)
+ {
+ /*
+ * insert fill word. remember, previous word
+ * might have been a fill word which we can
+ * extend.
+ */
+ fill_reaped(state, state->start_reaped,
+ state->cur_bitpos - 1);
+
+ state->start_reaped = 0;
+ Assert(start_setbit == 0);
+ start_setbit = state->cur_bitpos;
+ }
+ else
+ {
+ if (start_setbit == 0)
+ start_setbit = state->cur_bitpos;
+ }
+
+ /* XXX: just do this logically!!! */
+ word |= 1 << ((state->cur_bitpos % BM_WORD_SIZE) - 1);	/* NOTE(review): shift count is -1 when cur_bitpos is a multiple of BM_WORD_SIZE */
+ state->curbm->cwords[state->writewordno] = word;
+
+ try_shrink_bitmap(state);
+
+ /* roll over to a new word if need be */
+ if (!IS_FILL_WORD(state->curbm->hwords, state->writewordno) &&
+ (state->cur_bitpos % BM_WORD_SIZE) == 0)
+ {
+ progress_write_pos(state);
+ }
+ state->last_setbit = state->cur_bitpos;
+ }
+ state->cur_bitpos++;
+ }
+ if (!found_reaped)
+ {
+ elog(NOTICE, "didn't reap any, copy directly");
+ /* only do this if we've got an on page word */
+ if (vactype == BM_VAC_PAGE)
+ {
+ state->curbm->cwords[state->writewordno] = word;
+ HEADER_SET_FILL_BIT_ON(state->curbm->hwords,
+ state->writewordno);
+ }
+ return;
+ }
+ else if (start_setbit)
+ {
+ fill_matched(state, start_setbit, state->cur_bitpos - 1);	/* trailing run of survivors */
+ }
+ else if (state->start_reaped)
+ {
+ fill_reaped(state, state->start_reaped, state->cur_bitpos - 1);	/* trailing run of reaped TIDs */
+ }
+
+ /*
+ * If this is last complete word or last word, we've just put the data
+ * for that word on the physical page so get it back.
+ */
+ if (vactype == BM_VAC_LAST_COMPWORD ||
+ vactype == BM_VAC_LAST_WORD)
+ {
+ BM_WORD newword = state->curbm->cwords[state->writewordno];
+
+ switch(vactype)
+ {
+ case BM_VAC_LAST_COMPWORD:
+ state->curvmi->bm_last_compword = newword;
+ state->curvmi->vmi_words_header &=
+ BM_LAST_COMPWORD_BIT;	/* NOTE(review): "&=" keeps only this bit and clears every other header bit -- "|=" intended? */
+ break;
+ case BM_VAC_LAST_WORD:
+ state->curvmi->bm_last_word = newword;
+ state->curvmi->vmi_words_header &=
+ BM_LAST_WORD_BIT;	/* NOTE(review): same concern as above */
+ break;
+ default:
+ /* wont happen */
+ break;
+ }
+
+ if (state->writewordno)
+ state->writewordno--;
+ }
+ }
+}
+
+/*
+ * Try to merge the current word with the previous word. This is only used
+ * for vacuuming so we'll only shrink non-match words.
+ *
+ * The current word is the one at state->writewordno. Two situations are
+ * handled:
+ *
+ * - The current word is all zero. If the previous word is a non-match
+ *   fill with room left, its length is bumped by one and the write
+ *   position steps back onto it. Otherwise the current word is turned
+ *   into a non-match fill word of length one.
+ *
+ * - The current word is itself a non-match fill and so is the previous
+ *   one. The two lengths are combined; if they do not fit into a single
+ *   word the previous word is filled up to MAX_FILL_LENGTH and the
+ *   current word keeps the rest.
+ *
+ * Any other combination (in particular a literal word that still has
+ * matches in it) is left untouched.
+ */
+static void
+try_shrink_bitmap(bmvacstate *state)
+{
+	BM_WORD		word;
+	BM_WORD		prevword;
+
+	/* we have no hope if there's no previous word */
+	if (state->writewordno < 1)
+		return;
+
+	word = state->curbm->cwords[state->writewordno];
+	prevword = state->curbm->cwords[state->writewordno - 1];
+
+	if (word == LITERAL_ALL_ZERO)
+	{
+		/* check if earlier word was fill too */
+		if (BM_WORD_IS_NON_MATCH_FILL(state->curbm, state->writewordno - 1) &&
+			FILL_LENGTH(prevword) < MAX_FILL_LENGTH)
+		{
+			state->curbm->cwords[state->writewordno - 1]++;
+			/* previous word absorbed current word, so step back a word */
+			state->writewordno--;
+		}
+		else
+		{
+			/*
+			 * The word becomes a fill word, so its header bit must say so;
+			 * otherwise readers would take the length for a literal.
+			 */
+			state->curbm->cwords[state->writewordno] =
+				BM_MAKE_FILL_WORD(0, 1);
+			HEADER_SET_FILL_BIT_ON(state->curbm->hwords,
+								   state->writewordno);
+		}
+		return;
+	}
+
+	/*
+	 * Only two adjacent non-match fill words can be merged. A literal word
+	 * with matches must never be folded into a fill length.
+	 */
+	if (!BM_WORD_IS_NON_MATCH_FILL(state->curbm, state->writewordno) ||
+		!BM_WORD_IS_NON_MATCH_FILL(state->curbm, state->writewordno - 1))
+		return;
+
+	if (FILL_LENGTH(word) + FILL_LENGTH(prevword) <= MAX_FILL_LENGTH)
+	{
+		state->curbm->cwords[state->writewordno - 1] += FILL_LENGTH(word);
+
+		/* previous word absorbed us, see above */
+		state->writewordno--;
+	}
+	else
+	{
+		/* fill up previous word with non-matches and shrink current word */
+		BM_WORD		diff = MAX_FILL_LENGTH - FILL_LENGTH(prevword);
+
+		state->curbm->cwords[state->writewordno - 1] += diff;
+		state->curbm->cwords[state->writewordno] = word - diff;
+	}
+}
+
+/*
+ * vacuum_literal_word() -- vacuum one literal word.
+ *
+ * The word is fetched with vactype_get_word(). Bit i of the word stands
+ * for bit position state->cur_bitpos + i on entry; this matches
+ * vacuum_fill_word(), which also treats cur_bitpos as the next position
+ * still to be examined. Every set bit is turned into a TID and handed to
+ * the callback; if the callback reports the TID as reaped the bit is
+ * cleared, otherwise the position is remembered in state->last_setbit.
+ *
+ * On return cur_bitpos has advanced by BM_WORD_SIZE and the resulting word
+ * has been stored with put_vacuumed_literal_word().
+ */
+static void
+vacuum_literal_word(bmvacstate *state, bmVacType vactype)
+{
+	BM_WORD		word = vactype_get_word(state, vactype);
+	uint8		i;
+
+#ifdef DEBUG_BMI
+	elog(NOTICE, "vacuuming literal word %i", word);
+#endif
+	for (i = 0; i < BM_WORD_SIZE; i++)
+	{
+		/* mask selecting exactly bit i; computed afresh on every round */
+		BM_WORD		match = ((BM_WORD) 1) << i;
+
+		if (word & match)
+		{
+			ItemPointerData tid;
+
+			ItemPointerSet(&tid, BM_INT_GET_BLOCKNO(state->cur_bitpos),
+						   BM_INT_GET_OFFSET(state->cur_bitpos));
+#ifdef DEBUG_BMI
+			elog(NOTICE, "found match for (%i, %i)",
+				 BM_INT_GET_BLOCKNO(state->cur_bitpos),
+				 BM_INT_GET_OFFSET(state->cur_bitpos));
+#endif
+			if (state->callback(&tid, state->callback_state))
+			{
+				/* turn the bit off, easy as that! */
+				word &= ~match;
+				elog(NOTICE, "inverted match for %i, word = %i",
+					 i, word);
+			}
+			else
+				state->last_setbit = state->cur_bitpos;
+		}
+
+		/* move on only after the current position has been dealt with */
+		state->cur_bitpos++;
+	}
+	put_vacuumed_literal_word(state, vactype, word);
+}
+
+
+/*
+ * Check for a scenario where the next write would overwrite a block we
+ * haven't read. Put words at the end of the storage into an overflow
+ * bitmap and shift everything to the right.
+ *
+ * Nothing is done unless the write position has moved past the read
+ * position. Otherwise, if the page still has unused words, the unread
+ * words are shifted right by the number of unused words; if the page is
+ * full, up to BM_WORD_SIZE words are copied into state->ovrflw. In both
+ * cases the read position is moved forward by the shift distance.
+ *
+ * NOTE(review): the memmove() lengths below are word counts, not byte
+ * counts (no sizeof(BM_WORD) factor) -- confirm.
+ * NOTE(review): after the "first case" block, control continues into the
+ * common tail, which shifts the words and advances readwordno a second
+ * time.
+ * NOTE(review): the overflow memcpy() reads from hwords[from] although the
+ * destination is ovrflw.cwords[]; 'from' is still 0 on that path.
+ * NOTE(review): state->ovrflwwordno is read but never advanced here.
+ * NOTE(review): the final header copy indexes hwords[] by word number, not
+ * by word number / BM_WORD_SIZE as everywhere else.
+ */
+static void
+check_page_space(bmvacstate *state)
+{
+ elog(NOTICE, "checking page space: write pos: %i, read pos: %i",
+ state->writewordno, state->readwordno);
+
+ if (state->writewordno > state->readwordno)
+ {
+ /*
+ * We need to free up some space. There are two scenarios here:
+ * the page might not actually be full so we just shift things to
+ * the right and not worry about overflow; otherwise, the page is
+ * full so we just move the remaining words off the page into a new
+ * one and just tie things together. This is potentially inefficient
+ * but alternative methods require a lot of code.
+ */
+ uint16 from = 0;
+ uint16 diff = 0;
+ uint16 ovrflwwords = 0; /* number of words to put into overflow */
+
+ /* first case */
+ if (state->curbmo->bm_hrl_words_used < BM_NUM_OF_HRL_WORDS_PER_PAGE)
+ {
+ diff = BM_NUM_OF_HRL_WORDS_PER_PAGE -
+ state->curbmo->bm_hrl_words_used;	/* number of unused words on the page */
+ from = state->readwordno;
+
+ /* XXX: is there an off by one here? */
+ memmove(&(state->curbm->cwords[from + diff]),
+ &(state->curbm->cwords[state->readwordno]),
+ state->curbmo->bm_hrl_words_used - from);
+ memmove(&(state->curbm->hwords[(from + diff)/BM_WORD_SIZE]),
+ &(state->curbm->hwords[from/BM_WORD_SIZE]),
+ (state->curbmo->bm_hrl_words_used - from)/BM_WORD_SIZE);
+ /*
+ * Now, we must change to read position to point to the new
+ * current word.
+ */
+ state->readwordno += diff;
+ }
+ else
+ {
+ /*
+ * We can't do this the easy way, time to free some up. We take
+ * BM_WORD_SIZE number of words at a time, because it's
+ * convenient for managing the header: we just need to copy a
+ * single word.
+ */
+
+ diff =
+ BM_NUM_OF_HRL_WORDS_PER_PAGE - state->readwordno;
+ if (diff > BM_WORD_SIZE)
+ ovrflwwords = diff = BM_WORD_SIZE;
+ else
+ ovrflwwords = diff;
+ }
+
+ /* copy to overflow, if instructed */
+ if (ovrflwwords)
+ {
+ uint16 oo = state->ovrflwwordno;
+ state->ovrflw.hwords[oo/BM_WORD_SIZE] =
+ state->curbm->hwords[from/BM_WORD_SIZE];
+
+ memcpy(&(state->ovrflw.cwords[oo]),
+ &(state->curbm->hwords[from]),
+ diff * sizeof(BM_WORD));
+ }
+ /* XXX: is there an off by one here? */
+ memmove(&(state->curbm->cwords[from + diff]),
+ &(state->curbm->cwords[from]),
+ diff);
+
+ state->curbm->hwords[(from + diff)] = state->curbm->hwords[from];
+
+ state->readwordno += diff;
+ }
+}
+
+
+/*
+ * Step the write position forward by one word.
+ *
+ * check_page_space() is consulted right after the step so that the write
+ * pointer does not overtake the read pointer. The word now under the
+ * write position is then reset to an all-zero literal, i.e. it is filled
+ * with non-matches and its header bit is switched off.
+ */
+static void
+progress_write_pos(bmvacstate *state)
+{
+	state->writewordno += 1;
+
+	/* make room first, in case we just ran into unread words */
+	check_page_space(state);
+
+	HEADER_SET_FILL_BIT_OFF(state->curbm->hwords, state->writewordno);
+	state->curbm->cwords[state->writewordno] = LITERAL_ALL_ZERO;
+}
+
+/*
+ * Vacuum the two trailing words that live in the vector meta item rather
+ * than on a bitmap page: bm_last_compword and bm_last_word.
+ */
+static void
+vacuum_last_words(bmvacstate *state)
+{
+#ifdef DEBUG_BMI
+	elog(NOTICE, "vacuuming last words");
+	elog(NOTICE, "comp %i, last %i", state->curvmi->bm_last_compword,
+		 state->curvmi->bm_last_word);
+#endif
+
+	/*
+	 * The last complete word starts out as LITERAL_ALL_ONE and should never
+	 * take that value again, because it is compressed first (see
+	 * _bitmap_formitem()). So that value means there is nothing to vacuum.
+	 */
+	if (state->curvmi->bm_last_compword != LITERAL_ALL_ONE)
+	{
+		if (!BM_LAST_COMPWORD_IS_FILL(state->curvmi))
+			vacuum_literal_word(state, BM_VAC_LAST_COMPWORD);
+		else if (GET_FILL_BIT(state->curvmi->bm_last_compword) == 1)
+			elog(NOTICE, "match fill %i",
+				 state->curvmi->bm_last_compword);
+		/* a non-match fill needs no work at all */
+	}
+
+	/* The incomplete last word is skipped when it holds no matches. */
+	if (state->curvmi->bm_last_word == 0)
+		return;
+
+	if (BM_LASTWORD_IS_FILL(state->curvmi))
+		vacuum_fill_word(state, BM_VAC_LAST_WORD);
+	else
+		vacuum_literal_word(state, BM_VAC_LAST_WORD);
+
+	/*
+	 * XXX: todo -- if both trailing words now represent non-matches, the
+	 * bitmap could be truncated.
+	 */
+}
+
+/*
+ * Fetch the word that is about to be vacuumed. Depending on 'type' this is
+ * the page word at the current read position or one of the two trailing
+ * words stored in the vector meta item. Any other type raises an error.
+ */
+static BM_WORD
+vactype_get_word(bmvacstate *state, bmVacType type)
+{
+	if (type == BM_VAC_PAGE)
+		return state->curbm->cwords[state->readwordno];
+
+	if (type == BM_VAC_LAST_COMPWORD)
+		return state->curvmi->bm_last_compword;
+
+	if (type == BM_VAC_LAST_WORD)
+		return state->curvmi->bm_last_word;
+
+	elog(ERROR, "invalid bitmap vacuum state");
+	return 0;					/* not reached */
+}
+
+/*
+ * Store a vacuumed literal word at the place selected by 'type'.
+ *
+ * For a page word the value goes to the current write position, the header
+ * bit of that position is switched off and try_shrink_bitmap() gets a
+ * chance to merge it. For the two trailing words the value is written into
+ * the vector meta item and the matching bit in vmi_words_header is
+ * cleared. Any other type raises an error.
+ */
+static void
+put_vacuumed_literal_word(bmvacstate *state, bmVacType type, BM_WORD word)
+{
+	if (type == BM_VAC_PAGE)
+	{
+		state->curbm->cwords[state->writewordno] = word;
+		HEADER_SET_FILL_BIT_OFF(state->curbm->hwords, state->writewordno);
+		try_shrink_bitmap(state);
+	}
+	else if (type == BM_VAC_LAST_COMPWORD)
+	{
+		state->curvmi->bm_last_compword = word;
+		state->curvmi->vmi_words_header &= ~BM_LAST_COMPWORD_BIT;
+	}
+	else if (type == BM_VAC_LAST_WORD)
+	{
+		state->curvmi->bm_last_word = word;
+		state->curvmi->vmi_words_header &= ~BM_LAST_WORD_BIT;
+	}
+	else
+		elog(ERROR, "invalid bitmap vacuum state");
+}
+
+#if 0
+/*
+ * Either prepend or append overflow data to the current bitmap page.
+ *
+ * This function is compiled out (#if 0) and unfinished: it only obtains
+ * the buffer the overflow would go to -- the existing next page of the
+ * chain or, if there is none, a freshly initialised page that becomes the
+ * new tail -- and does not copy any words yet.
+ *
+ * NOTE(review): the body refers to 'vacinfo', which is neither a
+ * parameter nor a local here, so it would not compile if enabled.
+ * NOTE(review): 'start' and 'append' are not used yet, and nbuf is never
+ * released.
+ */
+static void
+merge_ovrflw(bmvacstate *state, bool append)
+{
+ uint16 start; /* start offset into the overflow */
+ Buffer nbuf;
+
+ /*
+ * If the current page hasn't used all of the available words, absorb those.
+ * Push any overflow into the overflow section.
+ */
+
+ if (state->writewordno < (BM_NUM_OF_HRL_WORDS_PER_PAGE - 1))
+ {
+
+
+
+ }
+
+ if (BlockNumberIsValid(state->curbmo->bm_bitmap_next))
+ {
+ nbuf = _bitmap_getbuf(vacinfo.info->index,
+ state->curbmo->bm_bitmap_next,
+ BM_WRITE);
+ elog(NOTICE, "adding overflow to %u",
+ state->curbmo->bm_bitmap_next);
+ }
+ else
+ {
+ /* Argh, we actually need a new page! */
+ nbuf = _bitmap_getbuf(vacinfo.info->index, P_NEW, BM_WRITE);
+ _bitmap_init_bitmappage(nbuf);
+ state->curvmi->bm_bitmap_tail = state->curbmo->bm_bitmap_next =
+ BufferGetBlockNumber(nbuf);
+ elog(NOTICE, "adding overflow to new block %u",
+ state->curbmo->bm_bitmap_next);
+ }
+}
+#endif
+/*
+ * vacuum_append_ovrflw_words() -- merge overflow words back into the page.
+ *
+ * Currently a stub: apart from asserting that at least one word has been
+ * read, it does nothing.
+ */
+static void
+vacuum_append_ovrflw_words(bmvacstate *state)
+{
+ Assert(state->readwordno > 0);
+
+ /*
+ * We want to copy BM_WORD_SIZE words at a time but we have to be careful
+ * of three things: a) we cannot go past BM_NUM_OF_HRL_WORDS_PER_PAGE,
+ * b) we cannot read past the end of the overflow words
+ *
+ * NOTE(review): only two of the "three things" are listed; the third
+ * constraint still needs to be written down.
+ */
+
+}
diff --git a/src/backend/access/bitmap/bitmapxlog.c b/src/backend/access/bitmap/bitmapxlog.c
new file mode 100644
index 0000000..3f2305e
--- /dev/null
+++ b/src/backend/access/bitmap/bitmapxlog.c
@@ -0,0 +1,670 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmapxlog.c
+ * WAL replay logic for the bitmap index.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/bitmap/bitmapxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/bitmap.h"
+#include "access/xlogutils.h"
+#include "storage/bufmgr.h" /* for buffer manager functions */
+#include "utils/rel.h" /* for RelationGetDescr */
+#include "utils/lsyscache.h"
+
+static void forget_incomplete_insert_bitmapwords(RelFileNode node,
+ xl_bm_bitmapwords* newWords);
+/*
+ * We must keep track of expected insertion of bitmap words when these
+ * bitmap words are inserted into multiple bitmap pages. We need to manually
+ * insert these words if they are not seen in the WAL log during replay.
+ * This makes it safe for page insertion to be a multiple-WAL-action process.
+ */
+typedef xl_bm_bitmapwords bm_incomplete_action;
+
+static List *incomplete_actions;
+
+/*
+ * Remember that a bitmap-word insertion continues on another page.
+ *
+ * A private copy of the WAL record (header plus the trailing last-TID,
+ * content-word and header-word arrays) is appended to incomplete_actions.
+ * The copy is re-targeted at the next block and at the first word that has
+ * not been written yet. Any matching earlier entry is dropped first.
+ */
+static void
+log_incomplete_insert_bitmapwords(RelFileNode node,
+								  xl_bm_bitmapwords *newWords)
+{
+	bm_incomplete_action *pending;
+	int			tids_bytes;
+	int			cwords_bytes;
+	int			hwords_bytes;
+	int			record_bytes;
+
+	/* Delete the previous entry */
+	forget_incomplete_insert_bitmapwords(node, newWords);
+
+	/* size of the fixed header plus the three arrays that follow it */
+	tids_bytes = newWords->bm_num_cwords * sizeof(uint64);
+	cwords_bytes = newWords->bm_num_cwords * sizeof(BM_WORD);
+	hwords_bytes = (BM_CALC_H_WORDS(newWords->bm_num_cwords)) *
+		sizeof(BM_WORD);
+	record_bytes = sizeof(bm_incomplete_action) + tids_bytes +
+		cwords_bytes + hwords_bytes;
+
+	pending = palloc0(record_bytes);
+	memcpy(pending, newWords, record_bytes);
+
+	/* point the copy at the continuation of the insertion */
+	pending->bm_start_wordno =
+		newWords->bm_start_wordno + newWords->bm_words_written;
+	pending->bm_words_written = 0;
+	pending->bm_blkno = newWords->bm_next_blkno;
+	pending->bm_next_blkno = InvalidBlockNumber;
+
+	incomplete_actions = lappend(incomplete_actions, pending);
+}
+
+/*
+ * Drop the pending entry, if any, that belongs to the same relation and
+ * the same vector (VMI block, VMI offset and last set bit) as newWords.
+ * Entries flagged bm_is_last are never dropped. At most one entry is
+ * removed.
+ */
+static void
+forget_incomplete_insert_bitmapwords(RelFileNode node,
+									 xl_bm_bitmapwords *newWords)
+{
+	ListCell   *cell;
+
+	foreach(cell, incomplete_actions)
+	{
+		bm_incomplete_action *pending = (bm_incomplete_action *) lfirst(cell);
+
+		if (!RelFileNodeEquals(node, pending->bm_node))
+			continue;
+
+		if (pending->bm_vmi_blkno != newWords->bm_vmi_blkno ||
+			pending->bm_vmi_offset != newWords->bm_vmi_offset ||
+			pending->bm_last_setbit != newWords->bm_last_setbit)
+			continue;
+
+		if (pending->bm_is_last)
+			continue;
+
+		Assert(pending->bm_blkno == newWords->bm_blkno);
+
+		/* the list is modified, so stop iterating right away */
+		incomplete_actions = list_delete_ptr(incomplete_actions, pending);
+		pfree(pending);
+		break;
+	}
+}
+
+/*
+ * _bitmap_xlog_newpage() -- create a new page.
+ *
+ * Redo routine. The block named in the record is read (and extended to if
+ * necessary); unless the page already carries an LSN at or beyond this
+ * record, it is initialised according to the record's info code and
+ * stamped with the record's LSN. The only info code understood is
+ * XLOG_BITMAP_INSERT_NEWVMIPAGE.
+ */
+static void
+_bitmap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_bm_newpage *xlrec = (xl_bm_newpage *) XLogRecGetData(record);
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+	Buffer		buffer;
+	Page		page;
+
+	buffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_new_blkno, true);
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "_bitmap_xlog_newpage: block unfound: %d",
+			 xlrec->bm_new_blkno);
+
+	page = BufferGetPage(buffer);
+	Assert(PageIsNew(page));
+
+	/* already applied? then just let go of the buffer */
+	if (!(PageGetLSN(page) < lsn))
+	{
+		_bitmap_relbuf(buffer);
+		return;
+	}
+
+	if (info == XLOG_BITMAP_INSERT_NEWVMIPAGE)
+		_bitmap_init_vmipage(buffer);
+	else
+		elog(PANIC, "_bitmap_xlog_newpage: unknown newpage op code %u",
+			 info);
+
+	PageSetLSN(page, lsn);
+	_bitmap_wrtbuf(buffer);
+}
+
+/*
+ * _bitmap_xlog_insert_vmi() -- insert a new vector meta item.
+ *
+ * Redo routine. The VMI page named in the record is read and initialised
+ * if it is still new. Unless the page already carries an LSN at or beyond
+ * this record, the item from the record is appended at the offset the
+ * record asks for; when the record says the VMI page itself is new, the
+ * metapage's bm_last_vmi_page is updated as well.
+ *
+ * Every inconsistency is reported with PANIC. In particular the failure
+ * to add the item is reported by relfilenode and block number only: no
+ * catalog lookup is made from redo, and bm_node.relNode is a relfilenode,
+ * not a pg_class OID that could be mapped to a relation name.
+ */
+static void
+_bitmap_xlog_insert_vmi(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_bm_vmi  *xlrec = (xl_bm_vmi *) XLogRecGetData(record);
+	Buffer		vmiBuffer;
+	Page		vmiPage;
+
+	vmiBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_vmi_blkno, true);
+	if (!BufferIsValid(vmiBuffer))
+		elog(PANIC, "_bitmap_xlog_insert_vmi: block unfound: %d",
+			 xlrec->bm_vmi_blkno);
+
+	vmiPage = BufferGetPage(vmiBuffer);
+
+	if (PageIsNew(vmiPage))
+	{
+		Assert(xlrec->bm_is_new_vmi_blkno);
+		_bitmap_init_vmipage(vmiBuffer);
+	}
+
+	if (PageGetLSN(vmiPage) < lsn)
+	{
+		OffsetNumber newOffset;
+		Size		itemSize = sizeof(BMVectorMetaItemData);
+
+		/* the item must land exactly where the original insertion put it */
+		newOffset = OffsetNumberNext(PageGetMaxOffsetNumber(vmiPage));
+		if (newOffset != xlrec->bm_vmi_offset)
+			elog(PANIC, "_bitmap_xlog_insert_vmi: VMI is not inserted "
+				 "in pos %d(requested %d)",
+				 newOffset, xlrec->bm_vmi_offset);
+
+		if (itemSize > PageGetFreeSpace(vmiPage))
+			elog(PANIC,
+				 "_bitmap_xlog_insert_vmi: not enough space in VMI page %d",
+				 xlrec->bm_vmi_blkno);
+
+		if (PageAddItem(vmiPage, (Item) &(xlrec->bm_vmi), itemSize,
+						newOffset, false, false) == InvalidOffsetNumber)
+			elog(PANIC, "_bitmap_xlog_insert_vmi: failed to add VMI to "
+				 "relfilenode %u, block %u",
+				 xlrec->bm_node.relNode, xlrec->bm_vmi_blkno);
+
+		if (xlrec->bm_is_new_vmi_blkno)
+		{
+			Buffer		metabuf;
+			BMMetaPage	metapage;
+
+			metabuf = XLogReadBuffer(xlrec->bm_node, BM_METAPAGE, false);
+			if (!BufferIsValid(metabuf))
+				elog(PANIC, "_bitmap_xlog_insert_vmi: block unfound: %d",
+					 BM_METAPAGE);
+
+			metapage = (BMMetaPage)
+				PageGetContents(BufferGetPage(metabuf));
+
+			/* the new VMI page is now the last one of the index */
+			metapage->bm_last_vmi_page = xlrec->bm_vmi_blkno;
+
+			PageSetLSN(BufferGetPage(metabuf), lsn);
+
+			_bitmap_wrtbuf(metabuf);
+		}
+
+		PageSetLSN(vmiPage, lsn);
+
+		_bitmap_wrtbuf(vmiBuffer);
+	}
+	else
+		_bitmap_relbuf(vmiBuffer);
+}
+
+/*
+ * _bitmap_xlog_insert_meta() -- update a metapage.
+ *
+ * Redo routine. The metapage is read and initialised if it is still new.
+ * Unless it already carries an LSN at or beyond this record, the LOV heap
+ * and index identifiers and the last VMI page number are copied from the
+ * record and the page is stamped with the record's LSN.
+ */
+static void
+_bitmap_xlog_insert_meta(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_bm_metapage *xlrec = (xl_bm_metapage *) XLogRecGetData(record);
+	Buffer		metabuf = XLogReadBuffer(xlrec->bm_node, BM_METAPAGE, true);
+	Page		mp = BufferGetPage(metabuf);
+	BMMetaPage	metapage;
+
+	if (PageIsNew(mp))
+		PageInit(mp, BufferGetPageSize(metabuf), 0);
+
+	/* already applied? then just let go of the buffer */
+	if (!(PageGetLSN(mp) < lsn))
+	{
+		_bitmap_relbuf(metabuf);
+		return;
+	}
+
+	metapage = (BMMetaPage) PageGetContents(mp);
+
+	metapage->bm_lov_heapId = xlrec->bm_lov_heapId;
+	metapage->bm_lov_indexId = xlrec->bm_lov_indexId;
+	metapage->bm_last_vmi_page = xlrec->bm_last_vmi_page;
+
+	PageSetLSN(mp, lsn);
+	_bitmap_wrtbuf(metabuf);
+}
+
+/*
+ * _bitmap_xlog_insert_bitmap_lastwords() -- update the last two words
+ * in a bitmap vector.
+ *
+ * Redo routine. The vector meta item addressed by the record (VMI block
+ * and offset) receives the record's bm_last_compword, bm_last_word and
+ * vmi_words_header, unless the VMI page already carries an LSN at or
+ * beyond this record. A missing block or an unused item slot is a PANIC.
+ */
+static void
+_bitmap_xlog_insert_bitmap_lastwords(XLogRecPtr lsn,
+									 XLogRecord *record)
+{
+	xl_bm_bitmap_lastwords *xlrec =
+		(xl_bm_bitmap_lastwords *) XLogRecGetData(record);
+	Buffer		vmiBuffer;
+	Page		vmiPage;
+	ItemId		itemid;
+	BMVectorMetaItem vmi;
+
+	vmiBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_vmi_blkno, false);
+	if (!BufferIsValid(vmiBuffer))
+		elog(PANIC, "_bitmap_xlog_insert_bitmap_lastwords: "
+			 " block not found: %d",
+			 xlrec->bm_vmi_blkno);
+
+	vmiPage = BufferGetPage(vmiBuffer);
+
+	/* already applied? then just let go of the buffer */
+	if (!(PageGetLSN(vmiPage) < lsn))
+	{
+		_bitmap_relbuf(vmiBuffer);
+		return;
+	}
+
+	itemid = PageGetItemId(vmiPage, xlrec->bm_vmi_offset);
+	if (!ItemIdIsUsed(itemid))
+		elog(PANIC, "_bitmap_xlog_insert_bitmap_lastwords: "
+			 "offset not found: %d",
+			 xlrec->bm_vmi_offset);
+
+	vmi = (BMVectorMetaItem) PageGetItem(vmiPage, itemid);
+
+	vmi->vmi_words_header = xlrec->vmi_words_header;
+	vmi->bm_last_compword = xlrec->bm_last_compword;
+	vmi->bm_last_word = xlrec->bm_last_word;
+
+	PageSetLSN(vmiPage, lsn);
+	_bitmap_wrtbuf(vmiBuffer);
+}
+/*
+ * _bitmap_xlog_insert_bitmapwords() -- redo the insertion of bitmap words
+ * into one bitmap page.
+ *
+ * The record consists of an xl_bm_bitmapwords header followed by three
+ * arrays: the last-TID values, the content words and the header words.
+ * Unless the bitmap page already carries an LSN at or beyond this record,
+ * the words are written with _bitmap_write_bitmapwords() and the page's
+ * next-block link is set from the record.
+ *
+ * If the record is the last one of the insertion (bm_is_last), the vector
+ * meta item is updated with the trailing words, last set bit and tail
+ * block, and the matching pending entry is forgotten. Otherwise the next
+ * bitmap page is initialised, the VMI head/tail are set when this is the
+ * first page (bm_is_first), and the continuation is remembered with
+ * log_incomplete_insert_bitmapwords().
+ *
+ * NOTE(review): 'newWords' is an uninitialised local of which only some
+ * fields are set before it is handed to _bitmap_write_bitmapwords() and
+ * _bitmap_free_tidbuf() -- confirm those routines touch nothing else.
+ * NOTE(review): newWords.hwords is written with memcpy() without being
+ * allocated here; presumably it is an array inside BMTIDBuffer -- verify.
+ * NOTE(review): the VMI page is modified without comparing its own LSN
+ * against the record's.
+ */
+static void
+_bitmap_xlog_insert_bitmapwords(XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_bm_bitmapwords *xlrec;
+
+ Buffer bitmapBuffer;
+ Page bitmapPage;
+ BMPageOpaque bitmapPageOpaque;
+ BMTIDBuffer newWords;
+ uint64 words_written;
+
+ int lastTids_size;
+ int cwords_size;
+ int hwords_size;
+
+ xlrec = (xl_bm_bitmapwords *) XLogRecGetData(record);
+
+ bitmapBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_blkno, true);	/* NOTE(review): result is not checked with BufferIsValid() */
+ bitmapPage = BufferGetPage(bitmapBuffer);
+
+ if (PageIsNew(bitmapPage))
+ _bitmap_init_bitmappage(bitmapBuffer);
+
+ bitmapPageOpaque =
+ (BMPageOpaque)PageGetSpecialPointer(bitmapPage);
+
+ if (PageGetLSN(bitmapPage) < lsn)
+ {
+ Buffer vmiBuffer;
+ Page vmiPage;
+ BMVectorMetaItem vmi;
+ uint64 *last_tids;
+ BM_WORD *cwords;
+ BM_WORD *hwords;
+
+ newWords.curword = xlrec->bm_num_cwords;
+ newWords.start_wordno = xlrec->bm_start_wordno;
+
+ lastTids_size = newWords.curword * sizeof(uint64);
+ cwords_size = newWords.curword * sizeof(BM_WORD);
+ hwords_size = (BM_CALC_H_WORDS(newWords.curword)) *
+ sizeof(BM_WORD);
+
+ newWords.last_tids = (uint64*)palloc0(lastTids_size);
+ newWords.cwords = (BM_WORD*)palloc0(cwords_size);
+
+ last_tids =
+ (uint64*)(((char*)xlrec) + sizeof(xl_bm_bitmapwords));	/* first array, right after the header */
+ cwords =
+ (BM_WORD*)(((char*)xlrec) +
+ sizeof(xl_bm_bitmapwords) + lastTids_size);	/* second array */
+ hwords =
+ (BM_WORD*)(((char*)xlrec) +
+ sizeof(xl_bm_bitmapwords) + lastTids_size +
+ cwords_size);	/* third array */
+ memcpy(newWords.last_tids, last_tids, lastTids_size);
+ memcpy(newWords.cwords, cwords, cwords_size);
+ memcpy(newWords.hwords, hwords, hwords_size);
+
+ /*
+ * If no words are written to this bitmap page, it means
+ * this bitmap page is full.
+ */
+ if (xlrec->bm_words_written == 0)
+ {
+ Assert(BM_NUM_OF_HRL_WORDS_PER_PAGE -
+ bitmapPageOpaque->bm_hrl_words_used == 0);
+ words_written = 0;
+ }
+ else
+ words_written =
+ _bitmap_write_bitmapwords(bitmapBuffer, &newWords);
+
+ Assert(words_written == xlrec->bm_words_written);
+
+ bitmapPageOpaque->bm_bitmap_next = xlrec->bm_next_blkno;
+ Assert(bitmapPageOpaque->bm_last_tid_location == xlrec->bm_last_tid);
+
+ vmiBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_vmi_blkno, false);
+ if (!BufferIsValid(vmiBuffer))
+ elog(PANIC,
+ "_bitmap_xlog_insert_last_bitmapwords: VMI block not found: %d",
+ xlrec->bm_vmi_blkno);	/* NOTE(review): the message names a different function */
+ vmiPage = BufferGetPage(vmiBuffer);
+
+ vmi = (BMVectorMetaItem)
+ PageGetItem(vmiPage, PageGetItemId(vmiPage, xlrec->bm_vmi_offset));
+
+ if (xlrec->bm_is_last)
+ {
+ vmi->bm_last_compword = xlrec->bm_last_compword;
+ vmi->bm_last_word = xlrec->bm_last_word;
+ vmi->vmi_words_header = xlrec->vmi_words_header;
+ vmi->bm_last_setbit = xlrec->bm_last_setbit;
+ vmi->bm_last_tid_location = xlrec->bm_last_setbit -
+ xlrec->bm_last_setbit % BM_WORD_SIZE;
+ vmi->bm_bitmap_tail = BufferGetBlockNumber(bitmapBuffer);
+ if (vmi->bm_bitmap_head == InvalidBlockNumber)
+ vmi->bm_bitmap_head = vmi->bm_bitmap_tail;
+
+ PageSetLSN(vmiPage, lsn);
+
+ _bitmap_wrtbuf(vmiBuffer);
+
+ forget_incomplete_insert_bitmapwords(xlrec->bm_node, xlrec);
+ }
+ else
+ {
+
+ Buffer nextBuffer;
+ Page nextPage;
+
+ /* create a new bitmap page */
+ nextBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_next_blkno, true);
+ nextPage = BufferGetPage(nextBuffer);
+
+ Assert(PageIsNew(nextPage));
+
+ _bitmap_init_bitmappage(nextBuffer);
+
+ if (xlrec->bm_is_first)
+ {
+ vmi->bm_bitmap_head = BufferGetBlockNumber(bitmapBuffer);
+ vmi->bm_bitmap_tail = vmi->bm_bitmap_head;
+
+ PageSetLSN(vmiPage, lsn);
+
+ _bitmap_wrtbuf(vmiBuffer);
+ }
+ else
+ _bitmap_relbuf(vmiBuffer);	/* VMI left untouched on this path */
+
+ PageSetLSN(nextPage, lsn);
+
+ _bitmap_wrtbuf(nextBuffer);
+
+ log_incomplete_insert_bitmapwords(xlrec->bm_node, xlrec);
+ }
+
+ PageSetLSN(bitmapPage, lsn);
+
+ _bitmap_wrtbuf(bitmapBuffer);
+
+ _bitmap_free_tidbuf(&newWords);
+ }
+
+ else {
+ _bitmap_relbuf(bitmapBuffer);
+ }
+}
+
+/*
+ * _bitmap_xlog_updateword() -- redo the update of a single bitmap word.
+ *
+ * Unless the bitmap page already carries an LSN at or beyond this record,
+ * the content word at bm_word_no and the header word that covers it are
+ * overwritten with the values from the record and the page is stamped
+ * with the record's LSN. A missing block is a PANIC.
+ */
+static void
+_bitmap_xlog_updateword(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_bm_updateword *xlrec = (xl_bm_updateword *) XLogRecGetData(record);
+	Buffer		bitmapBuffer;
+	Page		bitmapPage;
+	BMPageOpaque bitmapOpaque;
+	BMBitmapVectorPage bitmap;
+
+	bitmapBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_blkno, false);
+	if (!BufferIsValid(bitmapBuffer))
+		elog(PANIC, "_bitmap_xlog_updateword block not found: %d",
+			 xlrec->bm_blkno);
+
+	bitmapPage = BufferGetPage(bitmapBuffer);
+	bitmapOpaque = (BMPageOpaque) PageGetSpecialPointer(bitmapPage);
+	bitmap = (BMBitmapVectorPage) PageGetContents(bitmapPage);
+
+	/* already applied? then just let go of the buffer */
+	if (!(PageGetLSN(bitmapPage) < lsn))
+	{
+		_bitmap_relbuf(bitmapBuffer);
+		return;
+	}
+
+	Assert(bitmapOpaque->bm_hrl_words_used > xlrec->bm_word_no);
+
+	bitmap->cwords[xlrec->bm_word_no] = xlrec->bm_cword;
+	bitmap->hwords[xlrec->bm_word_no / BM_WORD_SIZE] = xlrec->bm_hword;
+
+	PageSetLSN(bitmapPage, lsn);
+	_bitmap_wrtbuf(bitmapBuffer);
+}
+
+/*
+ * _bitmap_xlog_updatewords() -- replay an XLOG_BITMAP_UPDATEWORDS record.
+ *
+ * Replaces the complete content-word and header-word arrays of one bitmap
+ * page and, when bm_two_pages is set, of a second page as well (the second
+ * page is initialised first if it is new).  When bm_new_lastpage is set the
+ * vector meta item's tail pointer is moved to the second page.
+ *
+ * All of this is skipped when the first page's LSN shows that the record
+ * has already been applied.  Missing pages are reported with PANIC.
+ */
+static void
+_bitmap_xlog_updatewords(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_bm_updatewords *xlrec;
+	Buffer		firstBuffer;
+	Buffer		secondBuffer = InvalidBuffer;
+	Page		firstPage;
+	Page		secondPage = NULL;
+	BMPageOpaque firstOpaque;
+	BMPageOpaque secondOpaque = NULL;
+	BMBitmapVectorPage firstBitmap;
+	BMBitmapVectorPage secondBitmap = NULL;
+
+	xlrec = (xl_bm_updatewords *) XLogRecGetData(record);
+
+	firstBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_first_blkno, false);
+	if (!BufferIsValid(firstBuffer))
+		/* block numbers are unsigned, hence %u rather than %d */
+		elog(PANIC, "_bitmap_xlog_updatewords first block not found: %u",
+			 xlrec->bm_first_blkno);
+
+	firstPage = BufferGetPage(firstBuffer);
+	firstOpaque = (BMPageOpaque) PageGetSpecialPointer(firstPage);
+	firstBitmap = (BMBitmapVectorPage) PageGetContents(firstPage);
+
+	if (PageGetLSN(firstPage) < lsn)
+	{
+		if (xlrec->bm_two_pages)
+		{
+			secondBuffer = XLogReadBuffer(xlrec->bm_node,
+										  xlrec->bm_second_blkno, true);
+			secondPage = BufferGetPage(secondBuffer);
+			if (PageIsNew(secondPage))
+				_bitmap_init_bitmappage(secondBuffer);
+
+			secondOpaque = (BMPageOpaque) PageGetSpecialPointer(secondPage);
+			secondBitmap = (BMBitmapVectorPage) PageGetContents(secondPage);
+			Assert(PageGetLSN(secondPage) < lsn);
+		}
+
+		/* overwrite the first page with the logged image of its words */
+		memcpy(firstBitmap->cwords, xlrec->bm_first_cwords,
+			   BM_NUM_OF_HRL_WORDS_PER_PAGE * sizeof(BM_WORD));
+		memcpy(firstBitmap->hwords, xlrec->bm_first_hwords,
+			   BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+		firstOpaque->bm_hrl_words_used = xlrec->bm_first_num_cwords;
+		firstOpaque->bm_last_tid_location = xlrec->bm_first_last_tid;
+		firstOpaque->bm_bitmap_next = xlrec->bm_second_blkno;
+
+		if (xlrec->bm_two_pages)
+		{
+			memcpy(secondBitmap->cwords, xlrec->bm_second_cwords,
+				   BM_NUM_OF_HRL_WORDS_PER_PAGE * sizeof(BM_WORD));
+			memcpy(secondBitmap->hwords, xlrec->bm_second_hwords,
+				   BM_NUM_OF_HEADER_WORDS * sizeof(BM_WORD));
+			secondOpaque->bm_hrl_words_used = xlrec->bm_second_num_cwords;
+			secondOpaque->bm_last_tid_location = xlrec->bm_second_last_tid;
+			secondOpaque->bm_bitmap_next = xlrec->bm_next_blkno;
+
+			PageSetLSN(secondPage, lsn);
+			_bitmap_wrtbuf(secondBuffer);
+		}
+
+		if (xlrec->bm_new_lastpage)
+		{
+			Buffer		vmiBuffer;
+			Page		vmiPage;
+			BMVectorMetaItem vmi;
+
+			vmiBuffer = XLogReadBuffer(xlrec->bm_node, xlrec->bm_vmi_blkno,
+									   false);
+			if (!BufferIsValid(vmiBuffer))
+				elog(PANIC, "_bitmap_xlog_updatewords VMI block %u "
+					 "does not exist", xlrec->bm_vmi_blkno);
+
+			vmiPage = BufferGetPage(vmiBuffer);
+			vmi = (BMVectorMetaItem)
+				PageGetItem(vmiPage,
+							PageGetItemId(vmiPage, xlrec->bm_vmi_offset));
+
+			/*
+			 * Take the new tail from the record, not from secondBuffer: by
+			 * now that buffer has been passed to _bitmap_wrtbuf() (which
+			 * presumably drops the pin -- confirm), and it is InvalidBuffer
+			 * when bm_two_pages is false.  The block number is the same one
+			 * the buffer was read for.
+			 */
+			vmi->bm_bitmap_tail = xlrec->bm_second_blkno;
+
+			PageSetLSN(vmiPage, lsn);
+			_bitmap_wrtbuf(vmiBuffer);
+		}
+
+		PageSetLSN(firstPage, lsn);
+		_bitmap_wrtbuf(firstBuffer);
+	}
+	else
+		_bitmap_relbuf(firstBuffer);
+}
+
+
+/*
+ * bitmap_redo() -- replay one WAL record belonging to the bitmap index
+ * resource manager.
+ *
+ * The operation code is taken from the record's info bits (with the
+ * XLR_INFO_MASK bits removed) and the record is handed to the matching
+ * replay routine.  An unrecognised code is reported with PANIC.
+ */
+void
+bitmap_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		opcode = record->xl_info & ~XLR_INFO_MASK;
+
+	if (opcode == XLOG_BITMAP_INSERT_NEWVMIPAGE)
+		_bitmap_xlog_newpage(lsn, record);
+	else if (opcode == XLOG_BITMAP_INSERT_VMI)
+		_bitmap_xlog_insert_vmi(lsn, record);
+	else if (opcode == XLOG_BITMAP_INSERT_META)
+		_bitmap_xlog_insert_meta(lsn, record);
+	else if (opcode == XLOG_BITMAP_INSERT_BITMAP_LASTWORDS)
+		_bitmap_xlog_insert_bitmap_lastwords(lsn, record);
+	else if (opcode == XLOG_BITMAP_INSERT_WORDS)
+		_bitmap_xlog_insert_bitmapwords(lsn, record);
+	else if (opcode == XLOG_BITMAP_UPDATEWORD)
+		_bitmap_xlog_updateword(lsn, record);
+	else if (opcode == XLOG_BITMAP_UPDATEWORDS)
+		_bitmap_xlog_updatewords(lsn, record);
+	else
+		elog(PANIC, "bitmap_redo: unknown op code %u", opcode);
+}
+
+/*
+ * bitmap_xlog_startup() -- reset the list of incomplete insert actions to
+ * empty.
+ *
+ * The list is filled while records are replayed and consumed by
+ * bitmap_xlog_cleanup().
+ */
+void
+bitmap_xlog_startup(void)
+{
+ incomplete_actions = NIL;
+}
+
+/*
+ * bitmap_xlog_cleanup() -- finish every insert that replay left incomplete.
+ *
+ * For each entry in incomplete_actions a BMTIDBuffer is rebuilt from the
+ * saved record and passed to _bitmap_write_new_bitmapwords().  The arrays
+ * of last TIDs, content words and header words are expected directly after
+ * the fixed-size part of the saved record, in that order.  The list is
+ * reset to empty afterwards.
+ *
+ * NOTE(review): only the BMTIDBuffer fields assigned below are initialised;
+ * confirm the callee reads no others.  hwords_size is not checked against
+ * sizeof(newWords.hwords).  is_last_compword_fill is derived with "== 2",
+ * which is false when both header bits are set -- confirm this is intended
+ * rather than a test of BM_LAST_COMPWORD_BIT.
+ */
+void
+bitmap_xlog_cleanup(void)
+{
+	ListCell   *l;
+
+	foreach(l, incomplete_actions)
+	{
+		bm_incomplete_action *action = (bm_incomplete_action *) lfirst(l);
+		Relation	reln;
+		Buffer		vmiBuffer;
+		BMTIDBuffer newWords;
+		int			lastTids_size;
+		int			cwords_size;
+		int			hwords_size;
+		BM_WORD    *hwords;
+
+		vmiBuffer = XLogReadBuffer(action->bm_node, action->bm_vmi_blkno,
+								   false);
+		/* fail the same way the replay routines do when a page is missing */
+		if (!BufferIsValid(vmiBuffer))
+			elog(PANIC, "bitmap_xlog_cleanup: VMI block not found: %u",
+				 action->bm_vmi_blkno);
+
+		newWords.num_cwords = action->bm_num_cwords;
+		newWords.start_wordno = action->bm_start_wordno;
+
+		lastTids_size = newWords.num_cwords * sizeof(uint64);
+		cwords_size = newWords.num_cwords * sizeof(BM_WORD);
+		hwords_size = (BM_CALC_H_WORDS(newWords.num_cwords)) * sizeof(BM_WORD);
+
+		/* last_tids and cwords point into the saved record; hwords is copied */
+		newWords.last_tids = (uint64 *)
+			(((char *) action) + sizeof(xl_bm_bitmapwords));
+		newWords.cwords = (BM_WORD *)
+			(((char *) action) + sizeof(xl_bm_bitmapwords) + lastTids_size);
+		hwords = (BM_WORD *)
+			(((char *) action) + sizeof(xl_bm_bitmapwords) + lastTids_size +
+			 cwords_size);
+		memcpy(newWords.hwords, hwords, hwords_size);
+
+		newWords.last_compword = action->bm_last_compword;
+		newWords.last_word = action->bm_last_word;
+		newWords.is_last_compword_fill = (action->vmi_words_header == 2);
+		newWords.last_tid = action->bm_last_setbit;
+
+		/*
+		 * No real relcache entry is available here, so build a fake one
+		 * from the relfilenode saved with the action.  Previously reln was
+		 * passed uninitialised.
+		 *
+		 * NOTE(review): a fake entry only supports buffer-level access;
+		 * confirm _bitmap_write_new_bitmapwords() needs nothing more.
+		 */
+		reln = CreateFakeRelcacheEntry(action->bm_node);
+
+		/* Finish an incomplete insert */
+		_bitmap_write_new_bitmapwords(reln, vmiBuffer, action->bm_vmi_offset,
+									  &newWords, false);
+
+		FreeFakeRelcacheEntry(reln);
+	}
+	incomplete_actions = NIL;
+}
+
+/*
+ * bitmap_safe_restartpoint() -- report whether the list of incomplete
+ * insert actions is empty.
+ *
+ * Returns true when nothing is pending, false otherwise.
+ */
+bool
+bitmap_safe_restartpoint(void)
+{
+	return (incomplete_actions == NIL);
+}
diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
index 7d092d2..c5bdde0 100644
--- a/src/backend/access/rmgrdesc/Makefile
+++ b/src/backend/access/rmgrdesc/Makefile
@@ -8,8 +8,8 @@ subdir = src/backend/access/rmgrdesc
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
- mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \
- standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
+OBJS = bitmapdesc.o clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o \
+ heapdesc.o mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o \
+ spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/rmgrdesc/bitmapdesc.c b/src/backend/access/rmgrdesc/bitmapdesc.c
new file mode 100644
index 0000000..933d6b3
--- /dev/null
+++ b/src/backend/access/rmgrdesc/bitmapdesc.c
@@ -0,0 +1,93 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmapdesc.c
+ * rmgr descriptor routines for access/bitmap/bitmap.c
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/rmgrdesc/bitmapdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bitmap.h"
+
+/*
+ * out_target() -- append "rel <tablespace>/<database>/<relation>" for the
+ * given relfilenode to buf.
+ */
+static void
+out_target(StringInfo buf, RelFileNode *node)
+{
+	const RelFileNode target = *node;
+
+	appendStringInfo(buf, "rel %u/%u/%u",
+					 target.spcNode,
+					 target.dbNode,
+					 target.relNode);
+}
+
+/*
+ * bitmap_desc() -- append a human-readable description of a bitmap index
+ * WAL record to buf.
+ *
+ * The description is a fixed phrase chosen by operation code, followed by
+ * the target relation as printed by out_target().  An unrecognised code
+ * yields just "UNKNOWN".
+ */
+void
+bitmap_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+	uint8		opcode = xl_info & ~XLR_INFO_MASK;
+	const char *phrase;
+	RelFileNode *target;
+
+	/* pick the phrase and locate the relfilenode inside the record */
+	switch (opcode)
+	{
+		case XLOG_BITMAP_INSERT_NEWVMIPAGE:
+			phrase = "insert a new VMI page: ";
+			target = &((xl_bm_newpage *) rec)->bm_node;
+			break;
+		case XLOG_BITMAP_INSERT_VMI:
+			phrase = "insert a new vector meta item: ";
+			target = &((xl_bm_vmi *) rec)->bm_node;
+			break;
+		case XLOG_BITMAP_INSERT_META:
+			phrase = "update the metapage: ";
+			target = &((xl_bm_metapage *) rec)->bm_node;
+			break;
+		case XLOG_BITMAP_INSERT_BITMAP_LASTWORDS:
+			phrase = "update the last two words in a bitmap: ";
+			target = &((xl_bm_bitmap_lastwords *) rec)->bm_node;
+			break;
+		case XLOG_BITMAP_INSERT_WORDS:
+			phrase = "insert words in a not-last bitmap page: ";
+			target = &((xl_bm_bitmapwords *) rec)->bm_node;
+			break;
+		case XLOG_BITMAP_UPDATEWORD:
+			phrase = "update a word in a bitmap page: ";
+			target = &((xl_bm_updateword *) rec)->bm_node;
+			break;
+		case XLOG_BITMAP_UPDATEWORDS:
+			phrase = "update words in bitmap pages: ";
+			target = &((xl_bm_updatewords *) rec)->bm_node;
+			break;
+		default:
+			/* no relation is printed for an unknown record type */
+			appendStringInfo(buf, "UNKNOWN");
+			return;
+	}
+
+	appendStringInfo(buf, "%s", phrase);
+	out_target(buf, target);
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 41d4379..ac063d1 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -15,6 +15,7 @@
#include "access/multixact.h"
#include "access/nbtree.h"
#include "access/spgist.h"
+#include "access/bitmap.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "catalog/storage_xlog.h"
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index edfd843..d27f674 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -7314,3 +7314,23 @@ gincostestimate(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+
+/*
+ * bmcostestimate() -- cost estimator for the bitmap index access method.
+ *
+ * Arguments (fmgr calling convention):
+ *	0: PlannerInfo *root
+ *	1: IndexPath *path
+ *	2: double loop_count
+ *	3-6: output pointers for startup cost, total cost, selectivity and
+ *		 correlation
+ *
+ * The estimate is whatever genericcostestimate() computes; no bitmap-index
+ * specific adjustment is applied.  The results are copied to the output
+ * pointers, which were previously left unwritten.
+ */
+Datum
+bmcostestimate(PG_FUNCTION_ARGS)
+{
+	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+	IndexPath  *path = (IndexPath *) PG_GETARG_POINTER(1);
+	double		loop_count = PG_GETARG_FLOAT8(2);
+	Cost	   *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
+	Cost	   *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
+	Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
+	double	   *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+	GenericCosts costs;
+
+	MemSet(&costs, 0, sizeof(costs));
+
+	genericcostestimate(root, path, loop_count, &costs);
+
+	/* hand the generic estimates back to the planner */
+	*indexStartupCost = costs.indexStartupCost;
+	*indexTotalCost = costs.indexTotalCost;
+	*indexSelectivity = costs.indexSelectivity;
+	*indexCorrelation = costs.indexCorrelation;
+
+	PG_RETURN_VOID();
+}
diff --git a/src/include/access/bitmap.h b/src/include/access/bitmap.h
new file mode 100644
index 0000000..4cfb9dc
--- /dev/null
+++ b/src/include/access/bitmap.h
@@ -0,0 +1,853 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitmap.h
+ * header file for on-disk bitmap index access method implementation.
+ *
+ * Copyright (c) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/access/bitmap.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BITMAP_H
+#define BITMAP_H
+
+/* this macro enables debug messages */
+/* #define DEBUG_BMI 1 */
+
+#include "access/genam.h"
+#include "access/htup.h"
+#include "access/itup.h"
+#include "access/relscan.h"
+#include "access/sdir.h"
+#include "access/xlog.h"
+#include "nodes/execnodes.h"
+#include "nodes/tidbitmap.h"
+#include "storage/lock.h"
+#include "storage/relfilenode.h"
+#include "miscadmin.h"
+
+/*
+ * Buffer lock modes used by the bitmap index code. BM_READ and BM_WRITE map
+ * to the shared and exclusive buffer content locks; BM_NOLOCK presumably
+ * means "take no lock" -- confirm against the buffer helper routines.
+ */
+#define BM_READ BUFFER_LOCK_SHARE
+#define BM_WRITE BUFFER_LOCK_EXCLUSIVE
+#define BM_NOLOCK (-1)
+
+/* the size in bits of a hybrid run-length (HRL) word */
+#define BM_WORD_SIZE 16
+
+/* the type for a HRL word; its width must match BM_WORD_SIZE */
+typedef uint16 BM_WORD;
+
+/* bit index of the leftmost (most significant) bit of a BM_WORD */
+#define BM_WORD_LEFTMOST (BM_WORD_SIZE-1)
+
+/*
+ * Metapage, always the first page (page 0) in the index.
+ *
+ * This page stores some meta-data information about this index.
+ *
+ * The list of values (LOV) is comprised of a heap (normal table) and an index.
+ * The heap maintains all distinct values along with the block numbers and
+ * offset numbers required to locate their vector meta items which in turn
+ * point to the real vector pages unless the vector is empty. Along with this
+ * heap, we also create a new btree index on this heap using attribute(s) as
+ * btree keys. In this way, for any given value, we search this btree to find
+ * the block number and offset number for its corresponding vector meta item.
+ * Each item in the heap corresponds to a distinct value of the attribute(s)
+ * to be indexed. For multi-column indexes on (a_1,a_2,...,a_n), we say two
+ * values (l_1,l_2,...,l_n) and (k_1,k_2,...,k_n) for (a_1,a_2,...,a_n) are the
+ * same if and only if for all i, l_i=k_i. This makes it possible to index the
+ * list of all the distinct n-tuples of values, which can be thought of as the
+ * result of "SELECT DISTINCT a_1,...,a_n FROM table".
+ */
+typedef struct BMMetaPageData
+{
+ /*
+ * The relation ids for a heap and a btree on this heap. They are used to
+ * speed up finding the bitmap vector for given attribute value(s), see the
+ * comments on VMI pages below for more information. We consider these as
+ * the general metadata of the index. There is additional metadata per
+ * vector contained in the BMVectorMetaItemData structures, which are stored
+ * on separate (VMI) pages.
+ */
+ Oid bm_lov_heapId; /* the relation id for the heap */
+ Oid bm_lov_indexId; /* the relation id for the index */
+
+ /* the block number for the last page of vector meta items (VMI). */
+ BlockNumber bm_last_vmi_page;
+} BMMetaPageData;
+
+/* pointer form of BMMetaPageData */
+typedef BMMetaPageData *BMMetaPage;
+
+/*
+ * The meta page is always the first block (block 0) of the index.
+ */
+
+#define BM_METAPAGE 0
+
+/*
+ * Note: we set this value equal to MaxHeapTuplesPerPage, to avoid
+ * having two different notions of a "legitimate" tuple offset.
+ *
+ * It is also the per-block multiplier used by BM_IPTR_TO_INT and its
+ * inverse macros.
+ */
+#define BM_MAX_HTUP_PER_PAGE MaxHeapTuplesPerPage
+
+/*
+ * VMI (Vector Meta Info) pages store some metadata related to their
+ * corresponding bitmap vectors. A VMI page maintains an array of
+ * BMVectorMetaItemData instances, each called a VMI.
+ */
+
+/*
+ * Block number of the first VMI (Vector Meta Item) page, which is reserved
+ * for NULL keys.
+ */
+#define BM_VMI_STARTPAGE 1
+
+/*
+ * BMVMIID is a kind of pointer to a vector meta item.
+ *
+ * It consists of the block number of a VMI page and the offset number of
+ * the BMVectorMetaItemData record within that page.
+ */
+typedef struct BMVMIID
+{
+ BlockNumber block; /* VMI page holding the item */
+ OffsetNumber offset; /* position of the item on that page */
+} BMVMIID;
+
+/*
+ * Vector Meta Items (a.k.a. VMI or entries in a VMI page).
+ *
+ * This is a bit of meta information for each vector. It exists even if the
+ * real vector does not exist because the table is empty or contains no matches
+ * for the key.
+ *
+ * The most important information comes from the pointers to the real vector
+ * pages. The other fields contain information required for performance
+ * optimization purposes.
+ */
+typedef struct BMVectorMetaItemData
+{
+ /* the first page and last page of the bitmap vector. */
+ BlockNumber bm_bitmap_head;
+ BlockNumber bm_bitmap_tail;
+
+ /*
+ * Additional information used to append new bits to the existing bitmap
+ * vector that this distinct value is associated with.
+ * The following two words are not stored in the regular bitmap pages
+ * defined below.
+ */
+
+ /* the last complete word in its bitmap vector. */
+ BM_WORD bm_last_compword;
+
+ /*
+ * the last word in its bitmap vector. This word is not
+ * a complete word. If a newly appended bit makes this word
+ * complete, this word will be merged with bm_last_compword.
+ */
+ BM_WORD bm_last_word;
+
+ /*
+ * This value represents the first bit of the last word representing the
+ * index when completely decompressed.
+ *
+ * See its use in updatesetbit().
+ */
+ uint64 bm_last_tid_location;
+
+ /*
+ * This is the value of the last set bit in the vector. It lets us
+ * know the extent of the vector without decompressing.
+ *
+ * See insertsetbit() to see how it progresses.
+ */
+ uint64 bm_last_setbit;
+
+ /*
+ * Only the two least-significant bits in this byte are used.
+ *
+ * If the least-significant bit (BM_LAST_WORD_BIT) is 1, bm_last_word is
+ * a fill word. If the second least-significant bit
+ * (BM_LAST_COMPWORD_BIT) is 1, bm_last_compword is a fill word.
+ */
+ uint8 vmi_words_header;
+} BMVectorMetaItemData;
+/* pointer form of BMVectorMetaItemData */
+typedef BMVectorMetaItemData *BMVectorMetaItem;
+
+/* size in bytes of one vector meta item */
+#define BM_VECTOR_META_ITEM_SIZE (sizeof(BMVectorMetaItemData))
+
+/*
+ * Upper bound on the number of vector meta items per VMI page, computed
+ * from the block size less the page header.
+ */
+#define BM_MAX_VMI_PER_PAGE \
+	((BLCKSZ - sizeof(PageHeaderData)) / sizeof(BMVectorMetaItemData))
+
+/* values and bit flags for BMVectorMetaItemData.vmi_words_header */
+#define BM_VMI_WORDS_NO_FILL 0
+#define BM_LAST_WORD_BIT 1
+#define BM_LAST_COMPWORD_BIT 2
+
+/*
+ * Tests on the vmi_words_header flags of a vector meta item pointer.  The
+ * argument is parenthesised so that any pointer-valued expression can be
+ * passed, not just a plain identifier.
+ */
+#define BM_LASTWORD_IS_FILL(vmi) \
+	((vmi)->vmi_words_header & BM_LAST_WORD_BIT)
+
+#define BM_LAST_COMPWORD_IS_FILL(vmi) \
+	((vmi)->vmi_words_header & BM_LAST_COMPWORD_BIT)
+
+#define BM_BOTH_VMI_WORDS_FILL(vmi) \
+	(BM_LASTWORD_IS_FILL(vmi) && BM_LAST_COMPWORD_IS_FILL(vmi))
+
+/*
+ * Bitmap page -- pages to store bits in a bitmap vector.
+ *
+ * Each bitmap page stores two kinds of information: header words and
+ * content words. Each bit in the header words corresponds to
+ * a word in the content words. If a bit in the header words is 1,
+ * then its corresponding content word is a compressed (fill) word.
+ * Otherwise, it is a literal word.
+ *
+ * If a content word is a fill word, it means that there is a sequence
+ * of 0 bits or 1 bits. The most significant bit in this content word
+ * tells whether the bits in this sequence are 0s or 1s. The remaining bits
+ * store the value of "the number of bits / BM_WORD_SIZE".
+ */
+
+/*
+ * Opaque data for a bitmap page.
+ */
+typedef struct BMPageOpaqueData
+{
+ uint16 bm_hrl_words_used; /* the number of words used */
+ BlockNumber bm_bitmap_next; /* the next page for this bitmap */
+
+ /*
+ * the tid location for the last bit in this page.
+ */
+ uint64 bm_last_tid_location;
+ uint16 bm_page_id; /* bitmap index identifier */
+} BMPageOpaqueData;
+/* pointer form of BMPageOpaqueData */
+typedef BMPageOpaqueData *BMPageOpaque;
+
+/* presumably the value stored in bm_page_id -- confirm */
+#define BM_PAGE_ID 0xFF82
+/*
+ * Number of BM_WORDs that fit in a page after the (max-aligned) page header
+ * and opaque area: approximately 4078 words per 8K page.
+ */
+#define BM_MAX_NUM_OF_HRL_WORDS_PER_PAGE \
+ ((BLCKSZ - \
+ MAXALIGN(sizeof(PageHeaderData)) - \
+ MAXALIGN(sizeof(BMPageOpaqueData)))/sizeof(BM_WORD))
+
+/*
+ * Header words needed if every one of those words were a content word
+ * (one header bit per word): approx 255.
+ */
+#define BM_MAX_NUM_OF_HEADER_WORDS \
+ (((BM_MAX_NUM_OF_HRL_WORDS_PER_PAGE-1)/BM_WORD_SIZE) + 1)
+
+/*
+ * Number of content words actually stored per page. To make the last header
+ * word a complete word, we round this number down to a multiple of the
+ * word size.
+ */
+#define BM_NUM_OF_HRL_WORDS_PER_PAGE \
+ (((BM_MAX_NUM_OF_HRL_WORDS_PER_PAGE - \
+ BM_MAX_NUM_OF_HEADER_WORDS)/BM_WORD_SIZE) * BM_WORD_SIZE)
+
+/* Number of header words needed for that many content words. */
+#define BM_NUM_OF_HEADER_WORDS \
+ (((BM_NUM_OF_HRL_WORDS_PER_PAGE-1)/BM_WORD_SIZE) + 1)
+
+/*
+ * A page of a compressed bitmap vector: the header words followed by the
+ * content words they describe.
+ */
+typedef struct BMBitmapVectorPageData
+{
+ BM_WORD hwords[BM_NUM_OF_HEADER_WORDS]; /* one bit per content word */
+ BM_WORD cwords[BM_NUM_OF_HRL_WORDS_PER_PAGE]; /* literal or fill words */
+} BMBitmapVectorPageData;
+/* pointer form of BMBitmapVectorPageData */
+typedef BMBitmapVectorPageData *BMBitmapVectorPage;
+
+/*
+ * Data structure used to buffer index creation during bmbuild().
+ * Buffering provides three benefits: firstly, it makes for many fewer
+ * calls to the lower-level bitmap insert functions; secondly, it means that
+ * we reduce the amount of unnecessary compression and decompression we do;
+ * thirdly, in some cases pages for a given bitmap vector will be contiguous
+ * on disk.
+ *
+ * byte_size counts how many bytes we've consumed in the buffer.
+ * max_vmi_block is a hint as to whether we'll find a VMI block in vmi_blocks
+ * or not (we take advantage of the fact that VMI block numbers will be
+ * increasing).
+ * vmi_blocks is a list of VMI block buffers. The structures put in
+ * this list are defined in bitmapinsert.c.
+ */
+
+typedef struct BMTidBuildBuf
+{
+ uint32 byte_size; /* The size in bytes of the buffer's data */
+ BlockNumber max_vmi_block; /* highest VMI block we've seen */
+ List *vmi_blocks; /* list of VMI blocks we're buffering */
+} BMTidBuildBuf;
+
+
+/*
+ * The number of tid locations to be found at once during query processing.
+ *
+ * The expansion is parenthesised so the macro behaves as a single value in
+ * any expression (e.g. "x % BM_BATCH_TIDS").
+ */
+#define BM_BATCH_TIDS (16*1024)
+
+/*
+ * The maximum number of words to be retrieved during BitmapIndexScan:
+ * four pages' worth of content words.
+ */
+#define BM_MAX_WORDS (BM_NUM_OF_HRL_WORDS_PER_PAGE*4)
+
+/* Some macros for manipulating a bitmap word. */
+
+/* literal words with no bit set / every bit set */
+#define LITERAL_ALL_ZERO 0
+#define LITERAL_ALL_ONE ((BM_WORD)(~((BM_WORD)0)))
+
+/*
+ * Mask that clears the leftmost (fill value) bit of a word.  The expansion
+ * is wrapped in parentheses so it is safe inside larger expressions; its
+ * value is unchanged.
+ */
+#define FILL_MASK (~(((BM_WORD)1) << (BM_WORD_SIZE - 1)))
+
+/* build a fill word from the fill bit (0 or 1) and the run length */
+#define BM_MAKE_FILL_WORD(bit, length) \
+	((((BM_WORD)(bit)) << (BM_WORD_SIZE-1)) | (length))
+
+/* run length stored in a fill word (everything but the leftmost bit) */
+#define FILL_LENGTH(w) (((BM_WORD)(w)) & FILL_MASK)
+
+/* largest run length a fill word can hold */
+#define MAX_FILL_LENGTH ((((BM_WORD)1)<<(BM_WORD_SIZE-1))-1)
+
+/* get the left most bit of the word */
+#define GET_FILL_BIT(w) (((BM_WORD)(w))>>BM_WORD_LEFTMOST)
+
+/*
+ * Given a content word number, produce the mask for the bit that
+ * represents it within its header word.  Word 0 of each group of
+ * BM_WORD_SIZE words maps to the leftmost bit.
+ */
+#define WORDNO_GET_HEADER_BIT(cw_no) \
+	((BM_WORD)1 << (BM_WORD_SIZE - 1 - ((cw_no) % BM_WORD_SIZE)))
+
+/*
+ * Update a header to set a fill bit on.
+ *
+ * The header word holding the bit for content word cw_no is
+ * h[cw_no / BM_WORD_SIZE], the same index IS_FILL_WORD reads.  The previous
+ * definition indexed with BM_WORD_SIZE / cw_no, which addressed the wrong
+ * header word.
+ */
+#define HEADER_SET_FILL_BIT_ON(h, cw_no) \
+	((h)[(cw_no) / BM_WORD_SIZE] |= WORDNO_GET_HEADER_BIT(cw_no))
+
+/*
+ * Update a header to set a fill bit off.
+ *
+ * Uses the same indexing as HEADER_SET_FILL_BIT_ON.  The previous
+ * definition divided by cw_no, which is a division by zero for word 0.
+ */
+#define HEADER_SET_FILL_BIT_OFF(h, cw_no) \
+	((h)[(cw_no) / BM_WORD_SIZE] &= ~(WORDNO_GET_HEADER_BIT(cw_no)))
+
+/*
+ * To see if the content word at n is a compressed word or not we must look
+ * in the header words h. Each bit in the header words corresponds to a word
+ * amongst the content words. If the bit is 1, the word is compressed
+ * (i.e., it is a fill word) otherwise it is uncompressed.
+ *
+ * See src/backend/access/bitmap/README for more details
+ */
+
+#define IS_FILL_WORD(h, n) \
+	((bool) ((((h)[(n) / BM_WORD_SIZE]) & (WORDNO_GET_HEADER_BIT(n))) != 0))
+
+/*
+ * A simplified interface to IS_FILL_WORD: tests the word at position
+ * startNo of a structure that has hwords and startNo members.
+ */
+
+#define CUR_WORD_IS_FILL(b) \
+	IS_FILL_WORD((b)->hwords, (b)->startNo)
+
+/*
+ * Yet another interface to IS_FILL_WORD. This tests if the given word
+ * is a fill word of non-matches (a fill whose fill bit is 0).
+ */
+#define BM_WORD_IS_NON_MATCH_FILL(bm, wordno) \
+	(IS_FILL_WORD((bm)->hwords, (wordno)) && \
+	 (GET_FILL_BIT((bm)->cwords[(wordno)]) == 0 || \
+	  (bm)->cwords[(wordno)] == 0))
+
+/*
+ * Calculate the number of header words we need given the number of
+ * content words: zero for zero content words, otherwise one header word
+ * per BM_WORD_SIZE content words, rounded up.
+ *
+ * The argument is parenthesised so that an expression can be passed
+ * safely; note that it is evaluated more than once.
+ */
+#define BM_CALC_H_WORDS(c_words) \
+	((c_words) == 0 ? (c_words) : ((((c_words) - 1)/BM_WORD_SIZE) + 1))
+
+/*
+ * Convert an ItemPointer to and from an integer representation.
+ *
+ * The integer is block * BM_MAX_HTUP_PER_PAGE + offset; the two inverse
+ * macros subtract 1 before dividing and add 1 back to the offset, so
+ * offsets 1 .. BM_MAX_HTUP_PER_PAGE round-trip.  Arguments and results
+ * are parenthesised so the macros are safe inside larger expressions.
+ */
+
+#define BM_IPTR_TO_INT(iptr) \
+	((uint64)ItemPointerGetBlockNumber(iptr) * BM_MAX_HTUP_PER_PAGE + \
+	 (uint64)ItemPointerGetOffsetNumber(iptr))
+
+#define BM_INT_GET_BLOCKNO(i) \
+	((BlockNumber)(((i) - 1)/BM_MAX_HTUP_PER_PAGE))
+
+#define BM_INT_GET_OFFSET(i) \
+	((OffsetNumber)((((i) - 1) % BM_MAX_HTUP_PER_PAGE) + 1))
+
+
+/*
+ * BMTIDBuffer represents TIDs we've buffered for a given bitmap vector --
+ * i.e., TIDs for a distinct value in the underlying table. We take advantage
+ * of the fact that since we are reading the table from beginning to end
+ * TIDs will be ordered.
+ */
+
+/*
+ * Note: since the introduction of HOT tuples, TIDs are no longer scanned in
+ * order during index build. That required the introduction of a per-page and
+ * per-vector "hot_buffer".
+ */
+
+/* number of BM_WORDs in the hot_buffer array of a BMTIDBuffer */
+#define BM_SIZEOF_HOT_BUFFER ((BM_MAX_HTUP_PER_PAGE - 1) / BM_WORD_SIZE + 2)
+
+typedef struct BMTIDBuffer
+{
+ /* The last two bitmap words */
+ BM_WORD last_compword;
+ BM_WORD last_word;
+ bool is_last_compword_fill; /* is last_compword a fill word? */
+
+ uint64 start_tid; /* starting TID for this buffer */
+ uint64 last_tid; /* most recent tid added */
+ int16 curword; /* index into content */
+ int16 num_cwords; /* number of allocated words in content */
+
+ /* the starting array index that contains useful content words */
+ int16 start_wordno;
+
+ /* the last tids, one for each actual data word */
+ uint64 *last_tids;
+
+ /*
+ * the header and content words; hwords is a fixed-size array while
+ * cwords (like last_tids) points to separately provided storage
+ */
+ BM_WORD hwords[BM_NUM_OF_HEADER_WORDS];
+ BM_WORD *cwords;
+
+ /* HOT tuples buffer */
+ BlockNumber hot_buffer_block;
+ BM_WORD hot_buffer[BM_SIZEOF_HOT_BUFFER];
+ int16 hot_buffer_count;
+ uint64 hot_buffer_start_tid;
+ uint64 hot_buffer_last_tid;
+} BMTIDBuffer;
+
+/*
+ * the state for index build
+ */
+typedef struct BMBuildState
+{
+ TupleDesc bm_tupDesc;
+ Relation bm_lov_heap; /* the LOV heap (see BMMetaPageData) */
+ Relation bm_lov_index; /* the btree on the LOV heap */
+ /*
+ * We use this hash to cache lookups of VMI blocks for different keys. When
+ * one of the attribute types cannot be hashed, we set this hash to NULL.
+ */
+ HTAB *vmi_hash;
+
+ /*
+ * When the attributes to be indexed cannot be hashed, we cannot use the
+ * hash for the VMI blocks. We have to search through the btree.
+ */
+ ScanKey bm_lov_scanKeys;
+ IndexScanDesc bm_lov_scanDesc;
+
+ /*
+ * The buffer to store last several tid locations for each distinct value.
+ */
+ BMTidBuildBuf *bm_tidLocsBuffer;
+
+ double ituples; /* the number of index tuples */
+ bool use_wal; /* whether or not we write WAL records */
+
+ /*
+ * HOT tuples prebuffer: parallel arrays with one slot per possible tuple
+ * of a heap page; hot_prebuffer_count presumably gives the number of
+ * slots in use -- confirm against the build code.
+ */
+ BlockNumber hot_prebuffer_block;
+ uint64 hot_prebuffer_tdn[BM_MAX_HTUP_PER_PAGE];
+ ItemPointerData hot_prebuffer_ipd[BM_MAX_HTUP_PER_PAGE];
+ Datum *hot_prebuffer_atd[BM_MAX_HTUP_PER_PAGE];
+ bool *hot_prebuffer_nll[BM_MAX_HTUP_PER_PAGE];
+ int16 hot_prebuffer_count;
+} BMBuildState;
+
+/*
+ * Define an iteration result while scanning a BMBatchWords.
+ *
+ * This result includes the last scan position in a BMBatchWords,
+ * and all tids that were generated by the previous scan.
+ */
+typedef struct BMIterateResult
+{
+ uint64 nextTid; /* the first tid for the next iteration */
+ uint32 lastScanPos; /* position in the bitmap word we're looking at */
+ uint32 lastScanWordNo; /* offset in BMBatchWords */
+ uint64 nextTids[BM_BATCH_TIDS]; /* array of matching TIDs */
+ uint32 numOfTids; /* number of TIDs matched */
+ uint32 nextTidLoc; /* the next position in 'nextTids' to be read. */
+} BMIterateResult;
+
+/*
+ * Stores a batch of consecutive bitmap words from a bitmap vector.
+ *
+ * These bitmap words come from a bitmap vector stored in this bitmap
+ * index, or a bitmap vector that is generated by ANDing/ORing several
+ * bitmap vectors.
+ *
+ * This struct also contains information to compute the tid locations
+ * for the set bits in these bitmap words.
+ */
+typedef struct BMBatchWords
+{
+ uint32 maxNumOfWords; /* maximum number of words in this list */
+
+ /* Number of uncompressed words that have been read already */
+ uint32 nwordsread;
+ uint32 nextread; /* next word to read */
+ uint64 firstTid; /* the TID we're up to */
+ uint32 startNo; /* position we're at in cwords */
+ uint32 nwords; /* the number of bitmap words */
+ BM_WORD *hwords; /* the header words (one bit per content word) */
+ BM_WORD *cwords; /* the actual bitmap words */
+} BMBatchWords;
+
+/*
+ * Scan opaque data for one bitmap vector.
+ *
+ * This structure stores a batch of consecutive bitmap words for a
+ * bitmap vector that have been read from the disk, and remembers
+ * the next reading position for the next batch of consecutive
+ * bitmap words.
+ */
+typedef struct BMVectorData
+{
+ Buffer bm_vmiBuffer;/* the buffer that contains the VMI */
+ OffsetNumber bm_vmiOffset; /* the offset of the VMI */
+ BlockNumber bm_nextBlockNo; /* the next bitmap page block */
+
+ /*
+ * Indicates whether the last two words in the bitmap have been read. These
+ * two words are stored inside a VMI. If this value is true, it means this
+ * bitmap vector has no more words.
+ */
+ bool bm_readLastWords;
+ BMBatchWords *bm_batchWords; /* actual bitmap words */
+
+} BMVectorData;
+/* pointer form of BMVectorData */
+typedef BMVectorData *BMVector;
+
+/*
+ * Defines the current position of a scan.
+ *
+ * For each scan, all related bitmap vectors are read from the bitmap
+ * index, and ORed together into a final bitmap vector. The words
+ * in each bitmap vector are read in batches. This structure stores
+ * the following:
+ * (1) words for a final bitmap vector after ORing words from
+ * related bitmap vectors.
+ * (2) tid locations that satisfy the query.
+ * (3) One BMVectorData for each related bitmap vector.
+ */
+typedef struct BMScanPositionData
+{
+ bool done; /* indicate if this scan is over */
+ int nvec; /* the number of related bitmap vectors */
+ /* the words in the final bitmap vector that satisfies the query. */
+ BMBatchWords *bm_batchWords;
+
+ /*
+ * The BMIterateResult instance that contains the final
+ * tid locations for tuples that satisfy the query.
+ */
+ BMIterateResult bm_result;
+ BMVector posvecs; /* one or more bitmap vectors (presumably nvec) */
+} BMScanPositionData;
+
+/* pointer form of BMScanPositionData */
+typedef BMScanPositionData *BMScanPosition;
+
+/*
+ * Per-scan state of a bitmap index scan: the current position and a marked
+ * position, each with a flag telling whether it is valid.
+ */
+typedef struct BMScanOpaqueData
+{
+ BMScanPosition bm_currPos; /* current scan position */
+ bool cur_pos_valid; /* is bm_currPos valid? */
+ /* XXX: should we pull out mark pos? */
+ BMScanPosition bm_markPos; /* marked scan position */
+ bool mark_pos_valid; /* is bm_markPos valid? */
+} BMScanOpaqueData;
+
+/* pointer form of BMScanOpaqueData */
+typedef BMScanOpaqueData *BMScanOpaque;
+
+/*
+ * XLOG records for bitmap index operations
+ *
+ * The operation code is kept in the high 4 bits of the log record's
+ * xl_info field.
+ *
+ * NOTE(review): XLOG_BITMAP_INSERT_LASTWORDS has no case in bitmap_redo()
+ * or bitmap_desc(); confirm whether it is still emitted anywhere.
+ */
+#define XLOG_BITMAP_INSERT_NEWVMIPAGE 0x10 /* add a new VMI page */
+#define XLOG_BITMAP_INSERT_VMI 0x20 /* add a new VMI */
+#define XLOG_BITMAP_INSERT_META 0x30 /* update the metapage */
+#define XLOG_BITMAP_INSERT_BITMAP_LASTWORDS 0x40 /* update the last 2 words
+ in a bitmap */
+/* insert bitmap words into a bitmap page which is not the last one. */
+#define XLOG_BITMAP_INSERT_WORDS 0x50
+/* insert bitmap words to the last bitmap page and the VMI buffer */
+#define XLOG_BITMAP_INSERT_LASTWORDS 0x60
+/* overwrite a single content word and its header word */
+#define XLOG_BITMAP_UPDATEWORD 0x70
+/* rewrite the words of one or two bitmap pages */
+#define XLOG_BITMAP_UPDATEWORDS 0x80
+
+/*
+ * The information about writing bitmap words to a bitmap page and,
+ * for the last page of a vector, to its VMI.
+ */
+typedef struct xl_bm_bitmapwords
+{
+ RelFileNode bm_node;
+ /* The block number for the bitmap page */
+ BlockNumber bm_blkno;
+ /* The next block number for this bitmap page */
+ BlockNumber bm_next_blkno;
+ /* The last tid location for this bitmap page */
+ uint64 bm_last_tid;
+ /*
+ * The block number and offset for the VMI page that is associated with
+ * this bitmap page.
+ */
+ BlockNumber bm_vmi_blkno;
+ OffsetNumber bm_vmi_offset;
+
+ /* The information for the VMI page */
+ BM_WORD bm_last_compword;
+ BM_WORD bm_last_word;
+ uint8 vmi_words_header;
+ uint64 bm_last_setbit;
+
+ /*
+ * Indicate if these bitmap words are stored in the last bitmap page and
+ * the VMI buffer.
+ */
+ bool bm_is_last;
+
+ /*
+ * Indicate if this is the first time to insert into a bitmap
+ * page.
+ */
+ bool bm_is_first;
+
+ /*
+ * The range of words, within the arrays that follow, to be written to
+ * this bitmap page.
+ */
+ uint64 bm_start_wordno;
+ uint64 bm_words_written;
+
+ /*
+ * Total number of new bitmap words. We need to log all new words
+ * to be able to do recovery.
+ */
+ uint64 bm_num_cwords;
+
+ /*
+ * The following are arrays of last tids, content words, and header
+ * words. They are located one after the other. There are bm_num_cwords
+ * of last tids and content words, and BM_CALC_H_WORDS(bm_num_cwords)
+ * header words.
+ */
+} xl_bm_bitmapwords;
+
+/*
+ * The information about rewriting the words of one bitmap page, or of two
+ * pages when bm_two_pages is set. Complete copies of each page's content
+ * and header word arrays are carried in the record.
+ */
+typedef struct xl_bm_updatewords
+{
+ RelFileNode bm_node;
+ /* location of the VMI that owns the bitmap vector */
+ BlockNumber bm_vmi_blkno;
+ OffsetNumber bm_vmi_offset;
+
+ /* new contents of the first page */
+ BlockNumber bm_first_blkno;
+ BM_WORD bm_first_cwords[BM_NUM_OF_HRL_WORDS_PER_PAGE];
+ BM_WORD bm_first_hwords[BM_NUM_OF_HEADER_WORDS];
+ uint64 bm_first_last_tid;
+ uint64 bm_first_num_cwords;
+
+ /* new contents of the second page */
+ BlockNumber bm_second_blkno;
+ BM_WORD bm_second_cwords[BM_NUM_OF_HRL_WORDS_PER_PAGE];
+ BM_WORD bm_second_hwords[BM_NUM_OF_HEADER_WORDS];
+ uint64 bm_second_last_tid;
+ uint64 bm_second_num_cwords;
+
+ /* Indicate if this update involves two bitmap pages */
+ bool bm_two_pages;
+
+ /*
+ * The previous next page number for the first page; replay stores it as
+ * the next page of the second page.
+ */
+ BlockNumber bm_next_blkno;
+
+ /* Indicate if the second page is a new last bitmap page */
+ bool bm_new_lastpage;
+} xl_bm_updatewords;
+
+/* The information about overwriting a single word in a bitmap page. */
+typedef struct xl_bm_updateword
+{
+ RelFileNode bm_node;
+ BlockNumber bm_blkno; /* the bitmap page */
+ int bm_word_no; /* index of the content word on that page */
+ BM_WORD bm_cword; /* new value of the content word */
+ BM_WORD bm_hword; /* new value of the header word covering it */
+} xl_bm_updateword;
+
+/* The information about inserting a new VMI. */
+typedef struct xl_bm_vmi
+{
+ RelFileNode bm_node;
+ BlockNumber bm_vmi_blkno; /* VMI page receiving the item */
+ OffsetNumber bm_vmi_offset; /* offset of the item on that page */
+ BMVectorMetaItemData bm_vmi; /* the item itself */
+ /* presumably true when bm_vmi_blkno is a newly added VMI page -- confirm */
+ bool bm_is_new_vmi_blkno;
+} xl_bm_vmi;
+
+/*
+ * The information about adding a new page: only the block number of the
+ * new page is recorded here.
+ */
+typedef struct xl_bm_newpage
+{
+ RelFileNode bm_node;
+ BlockNumber bm_new_blkno; /* block number of the page being added */
+} xl_bm_newpage;
+
+/*
+ * The information about changes on a bitmap page.
+ * If bm_isOpaque is true, then bm_next_blkno is set.
+ *
+ * This is a variable-length record: the fixed part below is followed by
+ * bm_num_words content words.
+ */
+typedef struct xl_bm_bitmappage
+{
+ RelFileNode bm_node;
+ BlockNumber bm_bitmap_blkno; /* block number of the bitmap page */
+
+ bool bm_isOpaque; /* says whether bm_next_blkno is set */
+ BlockNumber bm_next_blkno;
+
+ /*
+ * NOTE(review): the next two fields presumably mirror per-page
+ * bookkeeping kept with the bitmap page -- confirm against the page's
+ * opaque data definition.
+ */
+ uint32 bm_last_tid_location;
+ uint32 bm_hrl_words_used;
+ uint32 bm_num_words; /* number of content words that follow */
+ /* for simplicity, we log the header words each time */
+ BM_WORD hwords[BM_NUM_OF_HEADER_WORDS];
+ /* followed by the "bm_num_words" content words. */
+} xl_bm_bitmappage;
+
+/*
+ * The information about changes to the last 2 words in a bitmap vector:
+ * the two word values, a one-byte header for them, and the position
+ * (block number and offset) of the VMI.
+ */
+typedef struct xl_bm_bitmap_lastwords
+{
+ RelFileNode bm_node;
+ BM_WORD bm_last_compword; /* first of the two last words */
+ BM_WORD bm_last_word; /* second of the two last words */
+ uint8 vmi_words_header; /* NOTE(review): presumably header bits for the two words above -- confirm */
+
+ BlockNumber bm_vmi_blkno; /* block number locating the VMI */
+ OffsetNumber bm_vmi_offset; /* offset locating the VMI */
+} xl_bm_bitmap_lastwords;
+
+/*
+ * The information about the changes in the metapage: the relation ids of
+ * the LOV heap and LOV index, and the block number of the last VMI page.
+ */
+typedef struct xl_bm_metapage
+{
+ RelFileNode bm_node;
+ Oid bm_lov_heapId; /* the relation id for the heap */
+ Oid bm_lov_indexId; /* the relation id for the index */
+ /* the block number for the last VMI page. */
+ BlockNumber bm_last_vmi_page;
+} xl_bm_metapage;
+
+/* public routines */
+extern Datum bmbuild(PG_FUNCTION_ARGS);
+extern Datum bmbuildempty(PG_FUNCTION_ARGS);
+extern Datum bminsert(PG_FUNCTION_ARGS);
+extern Datum bmbeginscan(PG_FUNCTION_ARGS);
+extern Datum bmgettuple(PG_FUNCTION_ARGS);
+extern Datum bmgetbitmap(PG_FUNCTION_ARGS);
+extern Datum bmrescan(PG_FUNCTION_ARGS);
+extern Datum bmendscan(PG_FUNCTION_ARGS);
+extern Datum bmmarkpos(PG_FUNCTION_ARGS);
+extern Datum bmrestrpos(PG_FUNCTION_ARGS);
+extern Datum bmbulkdelete(PG_FUNCTION_ARGS);
+extern Datum bmvacuumcleanup(PG_FUNCTION_ARGS);
+extern Datum bmoptions(PG_FUNCTION_ARGS);
+
+/* bitmappages.c */
+extern Buffer _bitmap_getbuf(Relation rel, BlockNumber blkno, int access);
+extern void _bitmap_wrtbuf(Buffer buf);
+extern void _bitmap_relbuf(Buffer buf);
+extern void _bitmap_wrtnorelbuf(Buffer buf);
+extern void _bitmap_init_vmipage(Buffer buf);
+extern void _bitmap_init_bitmappage(Buffer buf);
+extern void _bitmap_init_buildstate(Relation index, BMBuildState *bmstate,
+ IndexInfo *indexInfo);
+extern void _bitmap_cleanup_buildstate(Relation index, BMBuildState *bmstate,
+ IndexInfo *indexInfo);
+extern void _bitmap_init(Relation index, bool use_wal);
+
+/* bitmapinsert.c */
+extern void _bitmap_buildinsert(Relation index, ItemPointerData ht_ctid,
+ Datum *attdata, bool *nulls,
+ BMBuildState *state);
+extern void _bitmap_doinsert(Relation rel, ItemPointerData ht_ctid,
+ Datum *attdata, bool *nulls);
+extern void _bitmap_write_alltids(Relation rel, BMTidBuildBuf *tids,
+ bool use_wal);
+extern uint64 _bitmap_write_bitmapwords(Buffer bitmapBuffer,
+ BMTIDBuffer* buf);
+extern void _bitmap_write_new_bitmapwords(Relation rel, Buffer vmiBuffer,
+ OffsetNumber vmiOffset,
+ BMTIDBuffer *buf, bool use_wal);
+extern uint16 _bitmap_free_tidbuf(BMTIDBuffer *buf);
+extern void build_inserttuple_flush(Relation rel, BMBuildState *state);
+
+/* bitmaputil.c */
+extern BMVectorMetaItem _bitmap_formitem(uint64 currTidNumber);
+extern void _bitmap_init_batchwords(BMBatchWords *words,
+ uint32 maxNumOfWords,
+ MemoryContext mcxt);
+extern void _bitmap_copy_batchwords(BMBatchWords *words, BMBatchWords *copyWords);
+extern void _bitmap_reset_batchwords(BMBatchWords *words);
+extern void _bitmap_cleanup_batchwords(BMBatchWords *words);
+extern void _bitmap_cleanup_scanpos(BMVector bmScanPos,
+ uint32 numBitmapVectors);
+extern uint64 _bitmap_findnexttid(BMBatchWords *words,
+ BMIterateResult *result);
+extern void _bitmap_findprevtid(BMIterateResult *result);
+extern void _bitmap_findnexttids(BMBatchWords *words,
+ BMIterateResult *result, uint32 maxTids);
+#ifdef NOT_USED /* we might use this later */
+extern void _bitmap_intersect(BMBatchWords **batches, uint32 numBatches,
+ BMBatchWords *result);
+#endif
+extern void _bitmap_union(BMBatchWords **batches, uint32 numBatches,
+ BMBatchWords *result);
+extern void _bitmap_begin_iterate(BMBatchWords *words, BMIterateResult *result);
+extern void _bitmap_log_newpage(Relation rel, uint8 info, Buffer buf);
+extern void _bitmap_log_metapage(Relation rel, Page page);
+extern void _bitmap_log_bitmap_lastwords(Relation rel, Buffer vmiBuffer,
+ OffsetNumber vmiOffset, BMVectorMetaItem vmi);
+extern void _bitmap_log_vmi(Relation rel, Buffer vmiBuffer,
+ OffsetNumber offset, BMVectorMetaItem vmi,
+ Buffer metabuf, bool is_new_vmi_blkno);
+extern void _bitmap_log_bitmapwords(Relation rel, Buffer bitmapBuffer,
+ Buffer vmiBuffer,
+ OffsetNumber vmiOffset, BMTIDBuffer* buf,
+ uint64 words_written, uint64 tidnum,
+ BlockNumber nextBlkno,
+ bool isLast, bool isFirst);
+extern void _bitmap_log_updatewords(Relation rel, Buffer vmiBuffer,
+ OffsetNumber vmiOffset, Buffer firstBuffer,
+ Buffer secondBuffer, bool new_lastpage);
+extern void _bitmap_log_updateword(Relation rel, Buffer bitmapBuffer, int word_no);
+
+/* bitmapsearch.c */
+extern bool _bitmap_first(IndexScanDesc scan, ScanDirection dir);
+extern bool _bitmap_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bitmap_firstbatchwords(IndexScanDesc scan, ScanDirection dir);
+extern bool _bitmap_nextbatchwords(IndexScanDesc scan, ScanDirection dir);
+extern void _bitmap_findbitmaps(IndexScanDesc scan, ScanDirection dir);
+extern void _bitmap_initscanpos(IndexScanDesc scan, BMVector bmScanPos,
+ BlockNumber vmiBlock, OffsetNumber vmiOffset);
+extern void _bitmap_get_null_vmiid(Relation index, BMVMIID *vmiid);
+
+
+/* bitmapattutil.c */
+extern void _bitmap_create_lov_heapandindex(Relation rel, Oid *heapId,
+ Oid *indexId);
+extern void _bitmap_open_lov_heapandindex(BMMetaPage metapage,
+ Relation *lovHeapP, Relation *lovIndexP,
+ LOCKMODE lockMode);
+extern void _bitmap_insert_lov(Relation lovHeap, Relation lovIndex,
+ Datum *datum, bool *nulls, bool use_wal,
+ bool skip_index_insert);
+extern void _bitmap_close_lov_heapandindex(Relation lovHeap,
+ Relation lovIndex, LOCKMODE lockMode);
+extern bool _bitmap_findvalue(Relation lovHeap, Relation lovIndex,
+ ScanKey scanKey, IndexScanDesc scanDesc,
+ BMVMIID *vmiid);
+extern void _bitmap_vacuum(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+
+/*
+ * prototypes for functions in bitmapxlog.c
+ */
+extern void bitmap_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void bitmap_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern void bitmap_xlog_startup(void);
+extern void bitmap_xlog_cleanup(void);
+extern bool bitmap_safe_restartpoint(void);
+
+#endif
diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h
index 5a4664b..eaa8e03 100644
--- a/src/include/access/reloptions.h
+++ b/src/include/access/reloptions.h
@@ -41,10 +41,11 @@ typedef enum relopt_kind
RELOPT_KIND_HASH = (1 << 3),
RELOPT_KIND_GIN = (1 << 4),
RELOPT_KIND_GIST = (1 << 5),
- RELOPT_KIND_ATTRIBUTE = (1 << 6),
- RELOPT_KIND_TABLESPACE = (1 << 7),
- RELOPT_KIND_SPGIST = (1 << 8),
- RELOPT_KIND_VIEW = (1 << 9),
+ RELOPT_KIND_BITMAP = (1 << 6),
+ RELOPT_KIND_ATTRIBUTE = (1 << 7),
+ RELOPT_KIND_TABLESPACE = (1 << 8),
+ RELOPT_KIND_SPGIST = (1 << 9),
+ RELOPT_KIND_VIEW = (1 << 10),
/* if you add a new kind, make sure you update "last_default" too */
RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_VIEW,
/* some compilers treat enums as signed ints, so we can't use 1 << 31 */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index 7ad71b3..b3b96eb 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -42,3 +42,4 @@ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, NULL, NULL, NULL)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_xlog_startup, spg_xlog_cleanup, NULL)
+PG_RMGR(RM_BITMAP_ID, "Bitmap", bitmap_redo, bitmap_desc, bitmap_xlog_startup, bitmap_xlog_cleanup, bitmap_safe_restartpoint)
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h
index 5b04dda..b743aaa 100644
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -132,5 +132,8 @@ DESCR("GIN index access method");
DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
DESCR("SP-GiST index access method");
#define SPGIST_AM_OID 4000
+DATA(insert OID = 5013 ( bitmap 5 1 f f f f t t f f f f f 0 bminsert bmbeginscan bmgettuple bmgetbitmap bmrescan bmendscan bmmarkpos bmrestrpos bmbuild bmbuildempty bmbulkdelete bmvacuumcleanup - bmcostestimate bmoptions ));
+DESCR("bitmap index access method");
+#define BITMAP_AM_OID 5013
#endif /* PG_AM_H */
diff --git a/src/include/catalog/pg_amop.h b/src/include/catalog/pg_amop.h
index d200348..2db9d9a 100644
--- a/src/include/catalog/pg_amop.h
+++ b/src/include/catalog/pg_amop.h
@@ -781,4 +781,393 @@ DATA(insert ( 3474 3831 3831 8 s 3892 4000 0 ));
DATA(insert ( 3474 3831 2283 16 s 3889 4000 0 ));
DATA(insert ( 3474 3831 3831 18 s 3882 4000 0 ));
+/*
+ * on-disk bitmap index operators
+ */
+
+/*
+ * bitmap integer_ops
+ */
+
+/* default operators int2 */
+DATA(insert ( 5026 21 21 1 s 95 5013 0 ));
+DATA(insert ( 5026 21 21 2 s 522 5013 0 ));
+DATA(insert ( 5026 21 21 3 s 94 5013 0 ));
+DATA(insert ( 5026 21 21 4 s 524 5013 0 ));
+DATA(insert ( 5026 21 21 5 s 520 5013 0 ));
+/* crosstype operators int24 */
+DATA(insert ( 5026 21 23 1 s 534 5013 0 ));
+DATA(insert ( 5026 21 23 2 s 540 5013 0 ));
+DATA(insert ( 5026 21 23 3 s 532 5013 0 ));
+DATA(insert ( 5026 21 23 4 s 542 5013 0 ));
+DATA(insert ( 5026 21 23 5 s 536 5013 0 ));
+/* crosstype operators int28 */
+DATA(insert ( 5026 21 20 1 s 1864 5013 0 ));
+DATA(insert ( 5026 21 20 2 s 1866 5013 0 ));
+DATA(insert ( 5026 21 20 3 s 1862 5013 0 ));
+DATA(insert ( 5026 21 20 4 s 1867 5013 0 ));
+DATA(insert ( 5026 21 20 5 s 1865 5013 0 ));
+/* default operators int4 */
+DATA(insert ( 5026 23 23 1 s 97 5013 0 ));
+DATA(insert ( 5026 23 23 2 s 523 5013 0 ));
+DATA(insert ( 5026 23 23 3 s 96 5013 0 ));
+DATA(insert ( 5026 23 23 4 s 525 5013 0 ));
+DATA(insert ( 5026 23 23 5 s 521 5013 0 ));
+/* crosstype operators int42 */
+DATA(insert ( 5026 23 21 1 s 535 5013 0 ));
+DATA(insert ( 5026 23 21 2 s 541 5013 0 ));
+DATA(insert ( 5026 23 21 3 s 533 5013 0 ));
+DATA(insert ( 5026 23 21 4 s 543 5013 0 ));
+DATA(insert ( 5026 23 21 5 s 537 5013 0 ));
+/* crosstype operators int48 */
+DATA(insert ( 5026 23 20 1 s 37 5013 0 ));
+DATA(insert ( 5026 23 20 2 s 80 5013 0 ));
+DATA(insert ( 5026 23 20 3 s 15 5013 0 ));
+DATA(insert ( 5026 23 20 4 s 82 5013 0 ));
+DATA(insert ( 5026 23 20 5 s 76 5013 0 ));
+/* default operators int8 */
+DATA(insert ( 5026 20 20 1 s 412 5013 0 ));
+DATA(insert ( 5026 20 20 2 s 414 5013 0 ));
+DATA(insert ( 5026 20 20 3 s 410 5013 0 ));
+DATA(insert ( 5026 20 20 4 s 415 5013 0 ));
+DATA(insert ( 5026 20 20 5 s 413 5013 0 ));
+/* crosstype operators int82 */
+DATA(insert ( 5026 20 21 1 s 1870 5013 0 ));
+DATA(insert ( 5026 20 21 2 s 1872 5013 0 ));
+DATA(insert ( 5026 20 21 3 s 1868 5013 0 ));
+DATA(insert ( 5026 20 21 4 s 1873 5013 0 ));
+DATA(insert ( 5026 20 21 5 s 1871 5013 0 ));
+/* crosstype operators int84 */
+DATA(insert ( 5026 20 23 1 s 418 5013 0 ));
+DATA(insert ( 5026 20 23 2 s 420 5013 0 ));
+DATA(insert ( 5026 20 23 3 s 416 5013 0 ));
+DATA(insert ( 5026 20 23 4 s 430 5013 0 ));
+DATA(insert ( 5026 20 23 5 s 419 5013 0 ));
+
+/*
+ * bitmap oid_ops
+ */
+
+DATA(insert ( 5033 26 26 1 s 609 5013 0 ));
+DATA(insert ( 5033 26 26 2 s 611 5013 0 ));
+DATA(insert ( 5033 26 26 3 s 607 5013 0 ));
+DATA(insert ( 5033 26 26 4 s 612 5013 0 ));
+DATA(insert ( 5033 26 26 5 s 610 5013 0 ));
+
+/*
+ * bitmap oidvector_ops
+ */
+
+DATA(insert ( 5034 30 30 1 s 645 5013 0 ));
+DATA(insert ( 5034 30 30 2 s 647 5013 0 ));
+DATA(insert ( 5034 30 30 3 s 649 5013 0 ));
+DATA(insert ( 5034 30 30 4 s 648 5013 0 ));
+DATA(insert ( 5034 30 30 5 s 646 5013 0 ));
+
+/*
+ * bitmap float_ops
+ */
+
+/* default operators float4 */
+DATA(insert ( 5023 700 700 1 s 622 5013 0 ));
+DATA(insert ( 5023 700 700 2 s 624 5013 0 ));
+DATA(insert ( 5023 700 700 3 s 620 5013 0 ));
+DATA(insert ( 5023 700 700 4 s 625 5013 0 ));
+DATA(insert ( 5023 700 700 5 s 623 5013 0 ));
+/* crosstype operators float48 */
+DATA(insert ( 5023 700 701 1 s 1122 5013 0 ));
+DATA(insert ( 5023 700 701 2 s 1124 5013 0 ));
+DATA(insert ( 5023 700 701 3 s 1120 5013 0 ));
+DATA(insert ( 5023 700 701 4 s 1125 5013 0 ));
+DATA(insert ( 5023 700 701 5 s 1123 5013 0 ));
+/* default operators float8 */
+DATA(insert ( 5023 701 701 1 s 672 5013 0 ));
+DATA(insert ( 5023 701 701 2 s 673 5013 0 ));
+DATA(insert ( 5023 701 701 3 s 670 5013 0 ));
+DATA(insert ( 5023 701 701 4 s 675 5013 0 ));
+DATA(insert ( 5023 701 701 5 s 674 5013 0 ));
+/* crosstype operators float84 */
+DATA(insert ( 5023 701 700 1 s 1132 5013 0 ));
+DATA(insert ( 5023 701 700 2 s 1134 5013 0 ));
+DATA(insert ( 5023 701 700 3 s 1130 5013 0 ));
+DATA(insert ( 5023 701 700 4 s 1135 5013 0 ));
+DATA(insert ( 5023 701 700 5 s 1133 5013 0 ));
+
+/*
+ * bitmap char_ops
+ */
+
+DATA(insert ( 5020 18 18 1 s 631 5013 0 ));
+DATA(insert ( 5020 18 18 2 s 632 5013 0 ));
+DATA(insert ( 5020 18 18 3 s 92 5013 0 ));
+DATA(insert ( 5020 18 18 4 s 634 5013 0 ));
+DATA(insert ( 5020 18 18 5 s 633 5013 0 ));
+
+/*
+ * bitmap name_ops
+ */
+
+DATA(insert ( 5031 19 19 1 s 660 5013 0 ));
+DATA(insert ( 5031 19 19 2 s 661 5013 0 ));
+DATA(insert ( 5031 19 19 3 s 93 5013 0 ));
+DATA(insert ( 5031 19 19 4 s 663 5013 0 ));
+DATA(insert ( 5031 19 19 5 s 662 5013 0 ));
+
+/*
+ * bitmap text_ops
+ */
+
+DATA(insert ( 5035 25 25 1 s 664 5013 0 ));
+DATA(insert ( 5035 25 25 2 s 665 5013 0 ));
+DATA(insert ( 5035 25 25 3 s 98 5013 0 ));
+DATA(insert ( 5035 25 25 4 s 667 5013 0 ));
+DATA(insert ( 5035 25 25 5 s 666 5013 0 ));
+
+/*
+ * bitmap bpchar_ops
+ */
+
+DATA(insert ( 5018 1042 1042 1 s 1058 5013 0 ));
+DATA(insert ( 5018 1042 1042 2 s 1059 5013 0 ));
+DATA(insert ( 5018 1042 1042 3 s 1054 5013 0 ));
+DATA(insert ( 5018 1042 1042 4 s 1061 5013 0 ));
+DATA(insert ( 5018 1042 1042 5 s 1060 5013 0 ));
+
+/*
+ * bitmap bytea_ops
+ */
+
+DATA(insert ( 5019 17 17 1 s 1957 5013 0 ));
+DATA(insert ( 5019 17 17 2 s 1958 5013 0 ));
+DATA(insert ( 5019 17 17 3 s 1955 5013 0 ));
+DATA(insert ( 5019 17 17 4 s 1960 5013 0 ));
+DATA(insert ( 5019 17 17 5 s 1959 5013 0 ));
+
+/*
+ * bitmap abstime_ops
+ */
+
+DATA(insert ( 5014 702 702 1 s 562 5013 0 ));
+DATA(insert ( 5014 702 702 2 s 564 5013 0 ));
+DATA(insert ( 5014 702 702 3 s 560 5013 0 ));
+DATA(insert ( 5014 702 702 4 s 565 5013 0 ));
+DATA(insert ( 5014 702 702 5 s 563 5013 0 ));
+
+/*
+ * bitmap datetime_ops
+ */
+
+/* default operators date */
+DATA(insert ( 5037 1082 1082 1 s 1095 5013 0 ));
+DATA(insert ( 5037 1082 1082 2 s 1096 5013 0 ));
+DATA(insert ( 5037 1082 1082 3 s 1093 5013 0 ));
+DATA(insert ( 5037 1082 1082 4 s 1098 5013 0 ));
+DATA(insert ( 5037 1082 1082 5 s 1097 5013 0 ));
+/* crosstype operators vs timestamp */
+DATA(insert ( 5037 1082 1114 1 s 2345 5013 0 ));
+DATA(insert ( 5037 1082 1114 2 s 2346 5013 0 ));
+DATA(insert ( 5037 1082 1114 3 s 2347 5013 0 ));
+DATA(insert ( 5037 1082 1114 4 s 2348 5013 0 ));
+DATA(insert ( 5037 1082 1114 5 s 2349 5013 0 ));
+/* crosstype operators vs timestamptz */
+DATA(insert ( 5037 1082 1184 1 s 2358 5013 0 ));
+DATA(insert ( 5037 1082 1184 2 s 2359 5013 0 ));
+DATA(insert ( 5037 1082 1184 3 s 2360 5013 0 ));
+DATA(insert ( 5037 1082 1184 4 s 2361 5013 0 ));
+DATA(insert ( 5037 1082 1184 5 s 2362 5013 0 ));
+/* default operators timestamp */
+DATA(insert ( 5037 1114 1114 1 s 2062 5013 0 ));
+DATA(insert ( 5037 1114 1114 2 s 2063 5013 0 ));
+DATA(insert ( 5037 1114 1114 3 s 2060 5013 0 ));
+DATA(insert ( 5037 1114 1114 4 s 2065 5013 0 ));
+DATA(insert ( 5037 1114 1114 5 s 2064 5013 0 ));
+/* crosstype operators vs date */
+DATA(insert ( 5037 1114 1082 1 s 2371 5013 0 ));
+DATA(insert ( 5037 1114 1082 2 s 2372 5013 0 ));
+DATA(insert ( 5037 1114 1082 3 s 2373 5013 0 ));
+DATA(insert ( 5037 1114 1082 4 s 2374 5013 0 ));
+DATA(insert ( 5037 1114 1082 5 s 2375 5013 0 ));
+/* crosstype operators vs timestamptz */
+DATA(insert ( 5037 1114 1184 1 s 2534 5013 0 ));
+DATA(insert ( 5037 1114 1184 2 s 2535 5013 0 ));
+DATA(insert ( 5037 1114 1184 3 s 2536 5013 0 ));
+DATA(insert ( 5037 1114 1184 4 s 2537 5013 0 ));
+DATA(insert ( 5037 1114 1184 5 s 2538 5013 0 ));
+/* default operators timestamptz */
+DATA(insert ( 5037 1184 1184 1 s 1322 5013 0 ));
+DATA(insert ( 5037 1184 1184 2 s 1323 5013 0 ));
+DATA(insert ( 5037 1184 1184 3 s 1320 5013 0 ));
+DATA(insert ( 5037 1184 1184 4 s 1325 5013 0 ));
+DATA(insert ( 5037 1184 1184 5 s 1324 5013 0 ));
+/* crosstype operators vs date */
+DATA(insert ( 5037 1184 1082 1 s 2384 5013 0 ));
+DATA(insert ( 5037 1184 1082 2 s 2385 5013 0 ));
+DATA(insert ( 5037 1184 1082 3 s 2386 5013 0 ));
+DATA(insert ( 5037 1184 1082 4 s 2387 5013 0 ));
+DATA(insert ( 5037 1184 1082 5 s 2388 5013 0 ));
+/* crosstype operators vs timestamp */
+DATA(insert ( 5037 1184 1114 1 s 2540 5013 0 ));
+DATA(insert ( 5037 1184 1114 2 s 2541 5013 0 ));
+DATA(insert ( 5037 1184 1114 3 s 2542 5013 0 ));
+DATA(insert ( 5037 1184 1114 4 s 2543 5013 0 ));
+DATA(insert ( 5037 1184 1114 5 s 2544 5013 0 ));
+
+/*
+ * bitmap time_ops
+ */
+
+DATA(insert ( 5036 1083 1083 1 s 1110 5013 0 ));
+DATA(insert ( 5036 1083 1083 2 s 1111 5013 0 ));
+DATA(insert ( 5036 1083 1083 3 s 1108 5013 0 ));
+DATA(insert ( 5036 1083 1083 4 s 1113 5013 0 ));
+DATA(insert ( 5036 1083 1083 5 s 1112 5013 0 ));
+
+/*
+ * bitmap timetz_ops
+ */
+
+DATA(insert ( 5038 1266 1266 1 s 1552 5013 0 ));
+DATA(insert ( 5038 1266 1266 2 s 1553 5013 0 ));
+DATA(insert ( 5038 1266 1266 3 s 1550 5013 0 ));
+DATA(insert ( 5038 1266 1266 4 s 1555 5013 0 ));
+DATA(insert ( 5038 1266 1266 5 s 1554 5013 0 ));
+
+/*
+ * bitmap interval_ops
+ */
+
+DATA(insert ( 5029 1186 1186 1 s 1332 5013 0 ));
+DATA(insert ( 5029 1186 1186 2 s 1333 5013 0 ));
+DATA(insert ( 5029 1186 1186 3 s 1330 5013 0 ));
+DATA(insert ( 5029 1186 1186 4 s 1335 5013 0 ));
+DATA(insert ( 5029 1186 1186 5 s 1334 5013 0 ));
+
+/*
+ * bitmap macaddr
+ */
+
+DATA(insert ( 5030 829 829 1 s 1222 5013 0 ));
+DATA(insert ( 5030 829 829 2 s 1223 5013 0 ));
+DATA(insert ( 5030 829 829 3 s 1220 5013 0 ));
+DATA(insert ( 5030 829 829 4 s 1225 5013 0 ));
+DATA(insert ( 5030 829 829 5 s 1224 5013 0 ));
+
+/*
+ * bitmap network
+ */
+
+DATA(insert ( 5021 869 869 1 s 1203 5013 0 ));
+DATA(insert ( 5021 869 869 2 s 1204 5013 0 ));
+DATA(insert ( 5021 869 869 3 s 1201 5013 0 ));
+DATA(insert ( 5021 869 869 4 s 1206 5013 0 ));
+DATA(insert ( 5021 869 869 5 s 1205 5013 0 ));
+
+/*
+ * bitmap numeric
+ */
+
+DATA(insert ( 5032 1700 1700 1 s 1754 5013 0 ));
+DATA(insert ( 5032 1700 1700 2 s 1755 5013 0 ));
+DATA(insert ( 5032 1700 1700 3 s 1752 5013 0 ));
+DATA(insert ( 5032 1700 1700 4 s 1757 5013 0 ));
+DATA(insert ( 5032 1700 1700 5 s 1756 5013 0 ));
+
+/*
+ * bitmap bool
+ */
+
+DATA(insert ( 5017 16 16 1 s 58 5013 0 ));
+DATA(insert ( 5017 16 16 2 s 1694 5013 0 ));
+DATA(insert ( 5017 16 16 3 s 91 5013 0 ));
+DATA(insert ( 5017 16 16 4 s 1695 5013 0 ));
+DATA(insert ( 5017 16 16 5 s 59 5013 0 ));
+
+/*
+ * bitmap bit
+ */
+
+DATA(insert ( 5016 1560 1560 1 s 1786 5013 0 ));
+DATA(insert ( 5016 1560 1560 2 s 1788 5013 0 ));
+DATA(insert ( 5016 1560 1560 3 s 1784 5013 0 ));
+DATA(insert ( 5016 1560 1560 4 s 1789 5013 0 ));
+DATA(insert ( 5016 1560 1560 5 s 1787 5013 0 ));
+
+/*
+ * bitmap varbit
+ */
+
+DATA(insert ( 5039 1562 1562 1 s 1806 5013 0 ));
+DATA(insert ( 5039 1562 1562 2 s 1808 5013 0 ));
+DATA(insert ( 5039 1562 1562 3 s 1804 5013 0 ));
+DATA(insert ( 5039 1562 1562 4 s 1809 5013 0 ));
+DATA(insert ( 5039 1562 1562 5 s 1807 5013 0 ));
+
+/*
+ * bitmap text pattern
+ *
+ * Like every other bitmap opfamily in this file, the five strategies are
+ * <, <=, =, >=, > in that order. There is no pattern-specific equality
+ * operator, so strategy 3 uses the ordinary text "=" (98), the same
+ * operator text_ops uses for strategy 3.
+ */
+
+DATA(insert ( 5042 25 25 1 s 2314 5013 0 ));
+DATA(insert ( 5042 25 25 2 s 2315 5013 0 ));
+DATA(insert ( 5042 25 25 3 s 98 5013 0 ));
+DATA(insert ( 5042 25 25 4 s 2317 5013 0 ));
+DATA(insert ( 5042 25 25 5 s 2318 5013 0 ));
+
+/*
+ * bitmap bpchar pattern
+ *
+ * Like every other bitmap opfamily in this file, the five strategies are
+ * <, <=, =, >=, > in that order. There is no pattern-specific equality
+ * operator, so strategy 3 uses the ordinary bpchar "=" (1054), the same
+ * operator bpchar_ops uses for strategy 3.
+ */
+
+DATA(insert ( 5044 1042 1042 1 s 2326 5013 0 ));
+DATA(insert ( 5044 1042 1042 2 s 2327 5013 0 ));
+DATA(insert ( 5044 1042 1042 3 s 1054 5013 0 ));
+DATA(insert ( 5044 1042 1042 4 s 2329 5013 0 ));
+DATA(insert ( 5044 1042 1042 5 s 2330 5013 0 ));
+
+/*
+ * bitmap money_ops
+ */
+
+DATA(insert ( 5046 790 790 1 s 902 5013 0 ));
+DATA(insert ( 5046 790 790 2 s 904 5013 0 ));
+DATA(insert ( 5046 790 790 3 s 900 5013 0 ));
+DATA(insert ( 5046 790 790 4 s 905 5013 0 ));
+DATA(insert ( 5046 790 790 5 s 903 5013 0 ));
+
+/*
+ * bitmap reltime_ops
+ */
+
+DATA(insert ( 5047 703 703 1 s 568 5013 0 ));
+DATA(insert ( 5047 703 703 2 s 570 5013 0 ));
+DATA(insert ( 5047 703 703 3 s 566 5013 0 ));
+DATA(insert ( 5047 703 703 4 s 571 5013 0 ));
+DATA(insert ( 5047 703 703 5 s 569 5013 0 ));
+
+/*
+ * bitmap tinterval_ops
+ */
+
+DATA(insert ( 5048 704 704 1 s 813 5013 0 ));
+DATA(insert ( 5048 704 704 2 s 815 5013 0 ));
+DATA(insert ( 5048 704 704 3 s 811 5013 0 ));
+DATA(insert ( 5048 704 704 4 s 816 5013 0 ));
+DATA(insert ( 5048 704 704 5 s 814 5013 0 ));
+
+/*
+ * bitmap array_ops
+ */
+
+DATA(insert ( 5015 2277 2277 1 s 1072 5013 0 ));
+DATA(insert ( 5015 2277 2277 2 s 1074 5013 0 ));
+DATA(insert ( 5015 2277 2277 3 s 1070 5013 0 ));
+DATA(insert ( 5015 2277 2277 4 s 1075 5013 0 ));
+DATA(insert ( 5015 2277 2277 5 s 1073 5013 0 ));
+
+/*
+ * bitmap uuid_ops
+ */
+
+DATA(insert ( 5024 2950 2950 1 s 2974 5013 0 ));
+DATA(insert ( 5024 2950 2950 2 s 2976 5013 0 ));
+DATA(insert ( 5024 2950 2950 3 s 2972 5013 0 ));
+DATA(insert ( 5024 2950 2950 4 s 2977 5013 0 ));
+DATA(insert ( 5024 2950 2950 5 s 2975 5013 0 ));
+
#endif /* PG_AMOP_H */
diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h
index 7155cb2..5ea5b77 100644
--- a/src/include/catalog/pg_amproc.h
+++ b/src/include/catalog/pg_amproc.h
@@ -379,4 +379,53 @@ DATA(insert ( 3474 3831 3831 3 3471 ));
DATA(insert ( 3474 3831 3831 4 3472 ));
DATA(insert ( 3474 3831 3831 5 3473 ));
+/* bitmap */
+DATA(insert ( 5015 2277 2277 1 382 ));
+DATA(insert ( 5014 702 702 1 357 ));
+DATA(insert ( 5016 1560 1560 1 1596 ));
+DATA(insert ( 5017 16 16 1 1693 ));
+DATA(insert ( 5018 1042 1042 1 1078 ));
+DATA(insert ( 5019 17 17 1 1954 ));
+DATA(insert ( 5020 18 18 1 358 ));
+DATA(insert ( 5037 1082 1082 1 1092 ));
+DATA(insert ( 5037 1082 1114 1 2344 ));
+DATA(insert ( 5037 1082 1184 1 2357 ));
+DATA(insert ( 5037 1114 1114 1 2045 ));
+DATA(insert ( 5037 1114 1082 1 2370 ));
+DATA(insert ( 5037 1114 1184 1 2526 ));
+DATA(insert ( 5037 1184 1184 1 1314 ));
+DATA(insert ( 5037 1184 1082 1 2383 ));
+DATA(insert ( 5037 1184 1114 1 2533 ));
+DATA(insert ( 5023 700 700 1 354 ));
+DATA(insert ( 5023 700 701 1 2194 ));
+DATA(insert ( 5023 701 701 1 355 ));
+DATA(insert ( 5023 701 700 1 2195 ));
+DATA(insert ( 5021 869 869 1 926 ));
+DATA(insert ( 5026 21 21 1 350 ));
+DATA(insert ( 5026 21 23 1 2190 ));
+DATA(insert ( 5026 21 20 1 2192 ));
+DATA(insert ( 5026 23 23 1 351 ));
+DATA(insert ( 5026 23 20 1 2188 ));
+DATA(insert ( 5026 23 21 1 2191 ));
+DATA(insert ( 5026 20 20 1 842 ));
+DATA(insert ( 5026 20 23 1 2189 ));
+DATA(insert ( 5026 20 21 1 2193 ));
+DATA(insert ( 5029 1186 1186 1 1315 ));
+DATA(insert ( 5030 829 829 1 836 ));
+DATA(insert ( 5031 19 19 1 359 ));
+DATA(insert ( 5032 1700 1700 1 1769 ));
+DATA(insert ( 5033 26 26 1 356 ));
+DATA(insert ( 5034 30 30 1 404 ));
+DATA(insert ( 5035 25 25 1 360 ));
+DATA(insert ( 5036 1083 1083 1 1107 ));
+DATA(insert ( 5038 1266 1266 1 1358 ));
+DATA(insert ( 5039 1562 1562 1 1672 ));
+DATA(insert ( 5042 25 25 1 2166 ));
+DATA(insert ( 5044 1042 1042 1 2180 ));
+DATA(insert ( 5045 19 19 1 2187 ));
+DATA(insert ( 5046 790 790 1 377 ));
+DATA(insert ( 5047 703 703 1 380 ));
+/* tinterval (704) support proc belongs to opfamily 5048, tinterval_ops */
+DATA(insert ( 5048 704 704 1 381 ));
+DATA(insert ( 5024 2950 2950 1 2960 ));
+
#endif /* PG_AMPROC_H */
diff --git a/src/include/catalog/pg_namespace.h b/src/include/catalog/pg_namespace.h
index b259a25..52dbe73 100644
--- a/src/include/catalog/pg_namespace.h
+++ b/src/include/catalog/pg_namespace.h
@@ -72,6 +72,9 @@ DESCR("system catalog schema");
DATA(insert OID = 99 ( "pg_toast" PGUID _null_ ));
DESCR("reserved schema for TOAST tables");
#define PG_TOAST_NAMESPACE 99
+DATA(insert OID = 5012 ( "pg_bitmapindex" PGUID _null_ ));
+DESCR("Reserved schema for internal relations of bitmap indexes");
+#define PG_BITMAPINDEX_NAMESPACE 5012
DATA(insert OID = 2200 ( "public" PGUID _null_ ));
DESCR("standard public schema");
#define PG_PUBLIC_NAMESPACE 2200
diff --git a/src/include/catalog/pg_opclass.h b/src/include/catalog/pg_opclass.h
index f714db5..bc5f592 100644
--- a/src/include/catalog/pg_opclass.h
+++ b/src/include/catalog/pg_opclass.h
@@ -228,4 +228,41 @@ DATA(insert ( 4000 quad_point_ops PGNSP PGUID 4015 600 t 0 ));
DATA(insert ( 4000 kd_point_ops PGNSP PGUID 4016 600 f 0 ));
DATA(insert ( 4000 text_ops PGNSP PGUID 4017 25 t 0 ));
+/* On-disk bitmap indexes opclass */
+DATA(insert ( 5013 abstime_ops PGNSP PGUID 5014 702 t 0 ));
+DATA(insert ( 5013 array_ops PGNSP PGUID 5015 2277 t 0 ));
+DATA(insert ( 5013 bit_ops PGNSP PGUID 5016 1560 t 0 ));
+DATA(insert ( 5013 bool_ops PGNSP PGUID 5017 16 t 0 ));
+DATA(insert ( 5013 bpchar_ops PGNSP PGUID 5018 1042 t 0 ));
+DATA(insert ( 5013 bytea_ops PGNSP PGUID 5019 17 t 0 ));
+DATA(insert ( 5013 char_ops PGNSP PGUID 5020 18 t 0 ));
+DATA(insert ( 5013 cidr_ops PGNSP PGUID 5021 650 t 0 ));
+DATA(insert ( 5013 date_ops PGNSP PGUID 5037 1082 t 0 ));
+DATA(insert ( 5013 float4_ops PGNSP PGUID 5023 700 t 0 ));
+DATA(insert ( 5013 float8_ops PGNSP PGUID 5023 701 t 0 ));
+DATA(insert ( 5013 inet_ops PGNSP PGUID 5021 869 t 0 ));
+DATA(insert ( 5013 int2_ops PGNSP PGUID 5026 21 t 0 ));
+DATA(insert ( 5013 int4_ops PGNSP PGUID 5026 23 t 0 ));
+DATA(insert ( 5013 int8_ops PGNSP PGUID 5026 20 t 0 ));
+DATA(insert ( 5013 interval_ops PGNSP PGUID 5029 1186 t 0 ));
+DATA(insert ( 5013 macaddr_ops PGNSP PGUID 5030 829 t 0 ));
+DATA(insert ( 5013 name_ops PGNSP PGUID 5031 19 t 0 ));
+DATA(insert ( 5013 numeric_ops PGNSP PGUID 5032 1700 t 0 ));
+DATA(insert ( 5013 oid_ops PGNSP PGUID 5033 26 t 0 ));
+DATA(insert ( 5013 oidvector_ops PGNSP PGUID 5034 30 t 0 ));
+DATA(insert ( 5013 text_ops PGNSP PGUID 5035 25 t 0 ));
+DATA(insert ( 5013 time_ops PGNSP PGUID 5036 1083 t 0 ));
+DATA(insert ( 5013 timestamptz_ops PGNSP PGUID 5037 1184 t 0 ));
+DATA(insert ( 5013 timetz_ops PGNSP PGUID 5038 1266 t 0 ));
+DATA(insert ( 5013 varbit_ops PGNSP PGUID 5039 1562 t 0 ));
+DATA(insert ( 5013 varchar_ops PGNSP PGUID 5035 25 f 0 ));
+DATA(insert ( 5013 timestamp_ops PGNSP PGUID 5037 1114 t 0 ));
+DATA(insert ( 5013 text_pattern_ops PGNSP PGUID 5042 25 f 0 ));
+DATA(insert ( 5013 varchar_pattern_ops PGNSP PGUID 5042 25 f 0 ));
+DATA(insert ( 5013 bpchar_pattern_ops PGNSP PGUID 5044 1042 f 0 ));
+/* name_pattern_ops lives in bitmap opfamily 5045 */
+DATA(insert ( 5013 name_pattern_ops PGNSP PGUID 5045 19 f 0 ));
+DATA(insert ( 5013 money_ops PGNSP PGUID 5046 790 t 0 ));
+DATA(insert ( 5013 reltime_ops PGNSP PGUID 5047 703 t 0 ));
+DATA(insert ( 5013 tinterval_ops PGNSP PGUID 5048 704 t 0 ));
+/* opfamily 5024 (uuid_ops) has operators and a support proc, so give it an opclass */
+DATA(insert ( 5013 uuid_ops PGNSP PGUID 5024 2950 t 0 ));
+
#endif /* PG_OPCLASS_H */
diff --git a/src/include/catalog/pg_opfamily.h b/src/include/catalog/pg_opfamily.h
index 832f194..b583834 100644
--- a/src/include/catalog/pg_opfamily.h
+++ b/src/include/catalog/pg_opfamily.h
@@ -148,4 +148,34 @@ DATA(insert OID = 4016 ( 4000 kd_point_ops PGNSP PGUID ));
DATA(insert OID = 4017 ( 4000 text_ops PGNSP PGUID ));
#define TEXT_SPGIST_FAM_OID 4017
+/* On-disk bitmap indexes opfamily entries */
+DATA(insert OID = 5014 ( 5013 abstime_ops PGNSP PGUID ));
+DATA(insert OID = 5015 ( 5013 array_ops PGNSP PGUID ));
+DATA(insert OID = 5016 ( 5013 bit_ops PGNSP PGUID ));
+DATA(insert OID = 5017 ( 5013 bool_ops PGNSP PGUID ));
+DATA(insert OID = 5018 ( 5013 bpchar_ops PGNSP PGUID ));
+DATA(insert OID = 5019 ( 5013 bytea_ops PGNSP PGUID ));
+DATA(insert OID = 5020 ( 5013 char_ops PGNSP PGUID ));
+DATA(insert OID = 5023 ( 5013 float_ops PGNSP PGUID ));
+DATA(insert OID = 5021 ( 5013 network_ops PGNSP PGUID ));
+DATA(insert OID = 5026 ( 5013 integer_ops PGNSP PGUID ));
+DATA(insert OID = 5029 ( 5013 interval_ops PGNSP PGUID ));
+DATA(insert OID = 5030 ( 5013 macaddr_ops PGNSP PGUID ));
+DATA(insert OID = 5031 ( 5013 name_ops PGNSP PGUID ));
+DATA(insert OID = 5032 ( 5013 numeric_ops PGNSP PGUID ));
+DATA(insert OID = 5033 ( 5013 oid_ops PGNSP PGUID ));
+DATA(insert OID = 5034 ( 5013 oidvector_ops PGNSP PGUID ));
+DATA(insert OID = 5035 ( 5013 text_ops PGNSP PGUID ));
+DATA(insert OID = 5036 ( 5013 time_ops PGNSP PGUID ));
+DATA(insert OID = 5037 ( 5013 datetime_ops PGNSP PGUID ));
+DATA(insert OID = 5038 ( 5013 timetz_ops PGNSP PGUID ));
+DATA(insert OID = 5039 ( 5013 varbit_ops PGNSP PGUID ));
+DATA(insert OID = 5042 ( 5013 text_pattern_ops PGNSP PGUID ));
+DATA(insert OID = 5044 ( 5013 bpchar_pattern_ops PGNSP PGUID ));
+DATA(insert OID = 5045 ( 5013 name_pattern_ops PGNSP PGUID ));
+DATA(insert OID = 5046 ( 5013 money_ops PGNSP PGUID ));
+DATA(insert OID = 5047 ( 5013 reltime_ops PGNSP PGUID ));
+DATA(insert OID = 5048 ( 5013 tinterval_ops PGNSP PGUID ));
+DATA(insert OID = 5024 ( 5013 uuid_ops PGNSP PGUID ));
+
#endif /* PG_OPFAMILY_H */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index f03dd0b..4dbc4db 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4171,6 +4171,35 @@ DESCR("I/O");
DATA(insert OID = 2963 ( uuid_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 23 "2950" _null_ _null_ _null_ _null_ uuid_hash _null_ _null_ _null_ ));
DESCR("hash");
+/* the bitmap index access method routines */
+DATA(insert OID = 5050 ( bmgettuple PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ bmgettuple _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5051 ( bmgetbitmap PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ bmgetbitmap _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5001 ( bminsert PGNSP PGUID 12 1 0 0 0 f f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ bminsert _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5002 ( bmbeginscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ bmbeginscan _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5003 ( bmrescan PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ bmrescan _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5004 ( bmendscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ bmendscan _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5005 ( bmmarkpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ bmmarkpos _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5006 ( bmrestrpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ bmrestrpos _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5007 ( bmbuild PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ bmbuild _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5052 ( bmbuildempty PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ bmbuildempty _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5008 ( bmbulkdelete PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ bmbulkdelete _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5009 ( bmvacuumcleanup PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ bmvacuumcleanup _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5010 ( bmcostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ bmcostestimate _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+DATA(insert OID = 5011 ( bmoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ bmoptions _null_ _null_ _null_ ));
+DESCR("bitmap(internal)");
+
/* enum related procs */
DATA(insert OID = 3504 ( anyenum_in PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 3500 "2275" _null_ _null_ _null_ _null_ anyenum_in _null_ _null_ _null_ ));
DESCR("I/O");
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 0c2cd34..3459510 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -195,6 +195,7 @@ extern Datum hashcostestimate(PG_FUNCTION_ARGS);
extern Datum gistcostestimate(PG_FUNCTION_ARGS);
extern Datum spgcostestimate(PG_FUNCTION_ARGS);
extern Datum gincostestimate(PG_FUNCTION_ARGS);
+extern Datum bmcostestimate(PG_FUNCTION_ARGS);
/* Functions in array_selfuncs.c */
Hi Abhijit,
On 2013-09-14 23:44:24 +0530, Abhijit Menon-Sen wrote:
I've been working on this patch for a while, and have made some progress
towards (a) general fixing, and (b) a working VACUUM implementation (the
major remaining piece). Unfortunately, I've been busy moving house, and
the latter is not complete (and not in this patch).
I will continue working on the code, and I'll post updates. I expect to
have more to show in just a few days.
Nevertheless, I'm posting it for review now as I keep working. Given the
size and age of the patch, I would appreciate any comments, no matter
how nitpicky.
It'd be nice if you could quickly sketch out the plan for vacuum, that
will make reviewing more useful and easier.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sat, Sep 14, 2013 at 1:14 PM, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:
Hi.
This is a cleaned-up and rebased version of the bitmap index patch from
Gavin Sherry, later revised by Gianni Ciolli and Gabriele Bartolini, and
others including Daniel Bausch.
Hi Abhijit,
Please, in the next update, consider these messages I'm getting when
compiling with your patch.
"""
bitmapxlog.c: In function ‘bitmap_xlog_cleanup’:
bitmapxlog.c:658:32: warning: ‘reln’ may be used uninitialized in this
function [-Wuninitialized]
selfuncs.c: In function ‘bmcostestimate’:
selfuncs.c:7327:13: warning: unused variable ‘indexCorrelation’
[-Wunused-variable]
selfuncs.c:7326:15: warning: unused variable ‘indexSelectivity’
[-Wunused-variable]
selfuncs.c:7325:11: warning: unused variable ‘indexTotalCost’
[-Wunused-variable]
selfuncs.c:7324:11: warning: unused variable ‘indexStartupCost’
[-Wunused-variable]
"""
Also, there are 2 regression tests failing (attached regression.diffs)
And this error, when trying to generate docs
"""
openjade:bitmap.sgml:123:85:X: reference to non-existent ID
"SQL-CREATEINDEX-TITLE"
"""
And finally, I was exercising the feature in some ways and got a
crash when creating an index concurrently (attached
index_failure.txt). It wasn't just a crash: I couldn't start up the
server again after it.
--
Jaime Casanova www.2ndQuadrant.com
Professional PostgreSQL: Soporte 24x7 y capacitación
Phone: +593 4 5107566 Cell: +593 987171157
Attachments:
regression.diffsapplication/octet-stream; name=regression.diffsDownload
*** /home/jcasanov/Documentos/2ndQuadrant/postgresql/src/test/regress/expected/oidjoins.out 2013-09-15 21:57:27.517958770 -0500
--- /home/jcasanov/Documentos/2ndQuadrant/postgresql/src/test/regress/results/oidjoins.out 2013-09-15 22:52:23.346301897 -0500
***************
*** 237,245 ****
FROM pg_catalog.pg_amproc fk
WHERE amproc != 0 AND
NOT EXISTS(SELECT 1 FROM pg_catalog.pg_proc pk WHERE pk.oid = fk.amproc);
! ctid | amproc
! ------+--------
! (0 rows)
SELECT ctid, adrelid
FROM pg_catalog.pg_attrdef fk
--- 237,246 ----
FROM pg_catalog.pg_amproc fk
WHERE amproc != 0 AND
NOT EXISTS(SELECT 1 FROM pg_catalog.pg_proc pk WHERE pk.oid = fk.amproc);
! ctid | amproc
! --------+--------
! (2,62) | 2187
! (1 row)
SELECT ctid, adrelid
FROM pg_catalog.pg_attrdef fk
***************
*** 605,613 ****
FROM pg_catalog.pg_opclass fk
WHERE opcfamily != 0 AND
NOT EXISTS(SELECT 1 FROM pg_catalog.pg_opfamily pk WHERE pk.oid = fk.opcfamily);
! ctid | opcfamily
! ------+-----------
! (0 rows)
SELECT ctid, opcintype
FROM pg_catalog.pg_opclass fk
--- 606,615 ----
FROM pg_catalog.pg_opclass fk
WHERE opcfamily != 0 AND
NOT EXISTS(SELECT 1 FROM pg_catalog.pg_opfamily pk WHERE pk.oid = fk.opcfamily);
! ctid | opcfamily
! --------+-----------
! (2,30) | 3045
! (1 row)
SELECT ctid, opcintype
FROM pg_catalog.pg_opclass fk
======================================================================
*** /home/jcasanov/Documentos/2ndQuadrant/postgresql/src/test/regress/expected/opr_sanity.out 2013-09-15 21:57:59.922119454 -0500
--- /home/jcasanov/Documentos/2ndQuadrant/postgresql/src/test/regress/results/opr_sanity.out 2013-09-15 22:52:23.710303709 -0500
***************
*** 345,353 ****
FROM pg_proc as p1 LEFT JOIN pg_description as d
ON p1.tableoid = d.classoid and p1.oid = d.objoid and d.objsubid = 0
WHERE d.classoid IS NULL AND p1.oid <= 9999;
! oid | proname
! -----+---------
! (0 rows)
-- **************** pg_cast ****************
-- Catch bogus values in pg_cast columns (other than cases detected by
--- 345,354 ----
FROM pg_proc as p1 LEFT JOIN pg_description as d
ON p1.tableoid = d.classoid and p1.oid = d.objoid and d.objsubid = 0
WHERE d.classoid IS NULL AND p1.oid <= 9999;
! oid | proname
! ------+-----------------
! 5009 | bmvacuumcleanup
! (1 row)
-- **************** pg_cast ****************
-- Catch bogus values in pg_cast columns (other than cases detected by
***************
*** 988,994 ****
p3.amoppurpose = 's');
amname | amoplefttype | amoprighttype
--------+--------------+---------------
! (0 rows)
-- Currently, none of the AMs with fixed strategy sets support ordering ops.
SELECT p1.amname, p2.amopfamily, p2.amopstrategy
--- 989,1003 ----
p3.amoppurpose = 's');
amname | amoplefttype | amoprighttype
--------+--------------+---------------
! bitmap | 25 | 25
! bitmap | 25 | 25
! bitmap | 25 | 25
! bitmap | 25 | 25
! bitmap | 1042 | 1042
! bitmap | 1042 | 1042
! bitmap | 1042 | 1042
! bitmap | 1042 | 1042
! (8 rows)
-- Currently, none of the AMs with fixed strategy sets support ordering ops.
SELECT p1.amname, p2.amopfamily, p2.amopstrategy
***************
*** 1098,1104 ****
4000 | 15 | >
4000 | 16 | @>
4000 | 18 | =
! (62 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing
--- 1107,1122 ----
4000 | 15 | >
4000 | 16 | @>
4000 | 18 | =
! 5013 | 1 | <
! 5013 | 1 | ~<~
! 5013 | 2 | <=
! 5013 | 2 | ~<=~
! 5013 | 3 | =
! 5013 | 3 | ~>=~
! 5013 | 4 | >=
! 5013 | 4 | ~>~
! 5013 | 5 | >
! (71 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing
***************
*** 1118,1126 ****
WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS p2
WHERE p2.amopfamily = p1.opcfamily
AND binary_coercible(p1.opcintype, p2.amoplefttype));
! opcname | opcfamily
! ---------+-----------
! (0 rows)
-- Check that each operator listed in pg_amop has an associated opclass,
-- that is one whose opcintype matches oprleft (possibly by coercion).
--- 1136,1145 ----
WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS p2
WHERE p2.amopfamily = p1.opcfamily
AND binary_coercible(p1.opcintype, p2.amoplefttype));
! opcname | opcfamily
! ------------------+-----------
! name_pattern_ops | 3045
! (1 row)
-- Check that each operator listed in pg_amop has an associated opclass,
-- that is one whose opcintype matches oprleft (possibly by coercion).
***************
*** 1135,1141 ****
AND binary_coercible(p2.opcintype, p1.amoplefttype));
amopfamily | amopstrategy | amopopr
------------+--------------+---------
! (0 rows)
-- Operators that are primary members of opclasses must be immutable (else
-- it suggests that the index ordering isn't fixed). Operators that are
--- 1154,1165 ----
AND binary_coercible(p2.opcintype, p1.amoplefttype));
amopfamily | amopstrategy | amopopr
------------+--------------+---------
! 5024 | 1 | 2974
! 5024 | 2 | 2976
! 5024 | 3 | 2972
! 5024 | 4 | 2977
! 5024 | 5 | 2975
! (5 rows)
-- Operators that are primary members of opclasses must be immutable (else
-- it suggests that the index ordering isn't fixed). Operators that are
***************
*** 1272,1280 ****
WHERE am.amname <> 'btree' AND am.amname <> 'gist' AND am.amname <> 'gin'
GROUP BY amname, amsupport, opcname, amprocfamily
HAVING count(*) != amsupport OR amprocfamily IS NULL;
! amname | opcname | count
! --------+---------+-------
! (0 rows)
SELECT amname, opcname, count(*)
FROM pg_am am JOIN pg_opclass op ON opcmethod = am.oid
--- 1296,1307 ----
WHERE am.amname <> 'btree' AND am.amname <> 'gist' AND am.amname <> 'gin'
GROUP BY amname, amsupport, opcname, amprocfamily
HAVING count(*) != amsupport OR amprocfamily IS NULL;
! amname | opcname | count
! --------+------------------+-------
! bitmap | cidr_ops | 1
! bitmap | tinterval_ops | 1
! bitmap | name_pattern_ops | 1
! (3 rows)
SELECT amname, opcname, count(*)
FROM pg_am am JOIN pg_opclass op ON opcmethod = am.oid
======================================================================
Hi Jaime.
At 2013-09-15 23:32:11 -0500, jaime@2ndquadrant.com wrote:
bitmapxlog.c:658:32: warning: ‘reln’ may be used uninitialized in this
function [-Wuninitialized]
I added an XXX comment about this one, will investigate and fix.
Will look into the other errors as well, many thanks for the report.
-- Abhijit
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sat, Sep 14, 2013 at 11:14 AM, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:
Hi.
This is a cleaned-up and rebased version of the bitmap index patch from
Gavin Sherry, later revised by Gianni Ciolli and Gabriele Bartolini, and
others including Daniel Bausch.
I've been working on this patch for a while, and have made some progress
towards (a) general fixing, and (b) a working VACUUM implementation (the
major remaining piece). Unfortunately, I've been busy moving house, and
the latter is not complete (and not in this patch).
I will continue working on the code, and I'll post updates. I expect to
have more to show in just a few days.
Nevertheless, I'm posting it for review now as I keep working. Given the
size and age of the patch, I would appreciate any comments, no matter
how nitpicky.
Hi Abhijit,
I get wrong answers from this index sometimes. It seems to occur when the
position of the column within the index is not the same as its position
within the table. So I think that what is happening is somewhere the
offset into the list of table columns is misused to offset into the list of
index columns.
I didn't see any XXX notes that indicate this is a known problem.
create table foo as select
floor(random()*10) as a,
floor(random()*10) as b,
floor(random()*10) as c,
d
from generate_series(1,10000000) d;
vacuum ANALYZE;
create index on foo using bitmap (a);
create index on foo using bitmap (b);
select count(*) from foo where a=4;
1000173
select count(*) from foo where a+0=4;
1000173
select count(*) from foo where b=4;
0
select count(*) from foo where b+0=4;
999750
Cheers,
Jeff
At 2013-09-24 09:51:00 -0700, jeff.janes@gmail.com wrote:
I get wrong answers from this index sometimes.
Thanks for the report and the test case. I'll investigate.
-- Abhijit
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi,
Here are some quick items while skimming this patch. I am looking at
commit 6448de29d from your github repo, branch bmi.
What's with the pg_bitmapindex stuff in pg_namespace.h? It doesn't seem
to be used anywhere.
This led me to research how these indexes are stored. I note that what
we're doing here is to create another regular table and a btree index on
top of it, and those are the ones that actually store the index data.
This seems grotty and completely unlike the way we do things elsewhere
(compare GIN indexes which have rbtrees inside them). I think this
should be stored in separate forks, or separate kinds of pages
intermixed in the index's main fork.
I don't like that files are named bitmap.c/.h. We already have bitmap
scans, so having the generic concept (bitmap) show up as a file name is
confusing. To cite an example, see the name of the bmgetbitmap function
(which returns a TID bitmap from a bitmap index. How is that not
confusing?). I think I would be more comfortable with the files being
called "bmi" or maybe "bitmapidx", or something like that. (For sure do
not change the AM's name. I mean "CREATE INDEX .. USING bmi" would
suck.)
Not happy with (the contents of) src/include/access/bitmap.h. There's
way too much stuff in a single file. I think a three-file split would
work nicely: one for the AM routine declarations (bitmap.h), one for
xlog stuff (bitmap_xlog.h), one for internal routines and struct
declarations (bitmap_priv.h, bitmap_internals.h, or something like
that). Also, I think macros and structs should be defined in as narrow a
scope as possible; for example, macros such as HEADER_SET_FILL_BIT_ON
should be defined in bitmaputil.c, not bitmap.h (that macro is missing
parens BTW). For macros that are defined in headers, it would be better
to have prefixes that scope them to bitmaps; for example IS_FILL_WORD
should maybe have a BM_ prefix or something similar.
I don't think it's necessary to renumber relopt_kind. Just stash the
new one at the end of the enum.
bmoptions's DESCR entry in pg_proc.h says "btree".
contrib/bmfuncs.c defines CHECK_PAGE_OFFSET_RANGE but doesn't seem to
use it anywhere.
same file defines CHECK_RELATION_BLOCK_RANGE using a bare { } block. Our
style is to have these multi-statement macros use do {} while (false).
#include lines in source files are normally alphabetically sorted. The
new code fails to meet this expectation in many places.
First four lines of _bitmap_init seem gratuitous ..
All those #ifdef DEBUG_BMI lines sprinkled all over the place look
pretty bad; they interrupt the indentation flow. I would suggest to
define a macro or some macros to emit debugging messages, which are
enabled or disabled in a single place depending on DEBUG_BMI. Something
simple such as DO_DB() in fd.c would suffice.
I don't like this comment one bit: /* misteriously, MemSet segfaults... :( */
I think that's probably a bug that should be investigated rather than
papered over.
I don't understand the cur_bmbuild thingy .. I think that stuff should
be passed down as arguments to whatever does the index build, instead of
being a magical global var that also signals failure to find hash
functions for some datatypes.
Above definition of BMBuildHashData there's a comment referring us to
execGrouping.c but I cannot understand what it refers to.
"block unfound"? In general I think it's poor style to spell out the
function name in error messages. I mean, ereport() already reports the
function name. Also, please don't split long error messages across
multiple lines; better to leave the line to run off the screen.
I'm unsure about distinguishing errors in the recovery routines that
raise ERROR from those that PANIC. I mean, they would both cause the
server to PANIC.
The bitmap_xlog_cleanup routine talks about an "uninitialized reln".
That's a large point about xlog recovery -- you don't have relations.
You only have relfilenodes. I think you need to shuffle some routines
so that they can work with only a relfilenode.
Generally, the xlog stuff needs a lot of comments.
A pgindent run would be beneficial.
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 09/26/2013 12:20 AM, Alvaro Herrera wrote:
This led me to research how these indexes are stored. I note that what
we're doing here is to create another regular table and a btree index on
top of it, and those are the ones that actually store the index data.
This seems grotty and completely unlike the way we do things elsewhere
(compare GIN indexes which have rbtrees inside them).
Perhaps you meant that GIN has B-tree inside. RBTree is in fact used by
GiST, but only as in-memory structure during the search - to get the
tuples sorted by distance.
// Antonin Houska (Tony)
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
At 2013-09-25 19:20:17 -0300, alvherre@2ndquadrant.com wrote:
Here are some quick items while skimming this patch.
Great, that gives me plenty to work on.
At this point, I think it's appropriate to mark this patch as returned
with feedback (which I will do), since the changes needed seem fairly
major. I'll submit a revised patch for the next commitfest.
Many thanks to everyone who tried the patch and commented.
-- Abhijit
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Wed, Sep 25, 2013 at 3:20 PM, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
Hi,
Here are some quick items while skimming this patch. I am looking at
commit 6448de29d from your github repo, branch bmi.
What's with the pg_bitmapindex stuff in pg_namespace.h? It doesn't seem
to be used anywhere.
This led me to research how these indexes are stored. I note that what
we're doing here is to create another regular table and a btree index on
top of it, and those are the ones that actually store the index data.
This seems grotty and completely unlike the way we do things elsewhere
(compare GIN indexes which have rbtrees inside them).
+1 on that. I don't know about grottiness, but it certainly seems like it
would deadlock like crazy. Which another product's bitmap indexes are
infamous for, but since we don't need to store visibility information in
our indexes, hopefully we can do better.
Cheers,
Jeff
At 2013-09-26 08:39:05 -0700, jeff.janes@gmail.com wrote:
I don't know about grottiness, but it certainly seems like it would
deadlock like crazy.
Hi Jeff.
I don't understand the deadlock scenario you're thinking of. Could you
explain, please?
-- Abhijit
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers