Yet another fast GiST build
Hi!
In many cases GiST index can be build fast using z-order sorting.
I've looked into proof of concept by Nikita Glukhov [0]https://github.com/postgres/postgres/compare/master...glukhovn:gist_btree_build and it looks very interesting.
So, I've implemented yet another version of B-tree-like GiST build.
It's main use case and benefits can be summarized with small example:
postgres=# create table x as select point (random(),random()) from generate_series(1,3000000,1);
SELECT 3000000
Time: 5061,967 ms (00:05,062)
postgres=# create index ON x using gist (point ) with (fast_build_sort_function=gist_point_sortsupport);
CREATE INDEX
Time: 6140,227 ms (00:06,140)
postgres=# create index ON x using gist (point );
CREATE INDEX
Time: 32061,200 ms (00:32,061)
As you can see, Z-order build is on order of magnitude faster. Select performance is roughly the same. Also, index is significantly smaller.
Nikita's PoC is faster because it uses parallel build, but it intervenes into B-tree code a lot (for reuse). This patchset is GiST-isolated.
My biggest concern is that passing function to relation option seems a bit hacky. You can pass there any function matching sort support signature.
Embedding this function into opclass makes no sense: it does not affect scan anyhow.
In current version, docs and tests are not implemented. I want to discuss overall design. Do we really want yet another GiST build, if it is 3-10 times faster?
Thanks!
Best regards, Andrey Borodin.
[0]: https://github.com/postgres/postgres/compare/master...glukhovn:gist_btree_build
Attachments:
0001-function-relopt-for-gist-build.patchapplication/octet-stream; name=0001-function-relopt-for-gist-build.patch; x-unix-mode=0644Download
From 6e1a942432f7aa43555c58e882dc404d44554e51 Mon Sep 17 00:00:00 2001
From: Andrey <amborodin@acm.org>
Date: Mon, 10 Jun 2019 11:54:59 +0500
Subject: [PATCH 1/3] function relopt for gist build
---
src/backend/access/common/reloptions.c | 12 +++++++++++
src/backend/access/gist/gistbuild.c | 30 ++++++++++++++++++++++++++
src/backend/access/gist/gistutil.c | 3 ++-
src/include/access/gist_private.h | 2 ++
4 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 4d0d24be0b..cd9aa97437 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -450,6 +450,18 @@ static relopt_string stringRelOpts[] =
gistValidateBufferingOption,
"auto"
},
+ {
+ {
+ "fast_build_sort_function",
+ "Function for presorting data instead of usual penalty\\split inserts",
+ RELOPT_KIND_GIST,
+ AccessExclusiveLock
+ },
+ 0,
+ true,
+ gistValidateBuildFuncOption,
+ NULL
+ },
{
{
"check_option",
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index ecef0ff072..e7112590b8 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -22,12 +22,15 @@
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
+#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
+#include "parser/parse_func.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/regproc.h"
/* Step of index tuples for check whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256
@@ -74,6 +77,7 @@ typedef struct
HTAB *parentMap;
GistBufferingMode bufferingMode;
+ FmgrInfo sortFn;
} GISTBuildState;
/* prototypes for private functions */
@@ -125,6 +129,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
buildstate.indexrel = index;
buildstate.heaprel = heap;
+ buildstate.sortFn.fn_oid = InvalidOid;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -138,6 +143,16 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
else
buildstate.bufferingMode = GIST_BUFFERING_AUTO;
+ if (options->vl_len_ > offsetof(GiSTOptions, buildSortFunctionOffset) &&
+ options->bufferingModeOffset > 0)
+ {
+ char *sortFuncName = (char *) options + options->buildSortFunctionOffset;
+ Oid argList[1] = {INTERNALOID};
+ List *namelist = stringToQualifiedNameList(sortFuncName);
+ Oid sortFuncOid = LookupFuncName(namelist, 1, argList, false);
+ fmgr_info(sortFuncOid, &buildstate.sortFn);
+ }
+
fillfactor = options->fillfactor;
}
else
@@ -255,6 +270,21 @@ gistValidateBufferingOption(const char *value)
}
}
+/*
+ * Validator for "fast_build_sort_function" reloption on GiST indexes. Allows function name
+ */
+void
+gistValidateBuildFuncOption(const char *value)
+{
+ Oid argList[1] = {INTERNALOID};
+ List *namelist;
+ if (value == NULL)
+ return;
+ namelist = stringToQualifiedNameList(value);
+ /* LookupFuncName will fail if function is not existent */
+ LookupFuncName(namelist, 1, argList, false);
+}
+
/*
* Attempt to switch to buffering mode.
*
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 49df05653b..6e2ff95b5c 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -895,7 +895,8 @@ gistoptions(Datum reloptions, bool validate)
int numoptions;
static const relopt_parse_elt tab[] = {
{"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
- {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
+ {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)},
+ {"fast_build_sort_function", RELOPT_TYPE_STRING, offsetof(GiSTOptions, buildSortFunctionOffset)}
};
options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index f80694bf9a..b4b6883f6d 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -386,6 +386,7 @@ typedef struct GiSTOptions
int32 vl_len_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
int bufferingModeOffset; /* use buffering build? */
+ int buildSortFunctionOffset; /* used buffering sort function */
} GiSTOptions;
/* gist.c */
@@ -531,6 +532,7 @@ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
extern IndexBuildResult *gistbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo);
extern void gistValidateBufferingOption(const char *value);
+extern void gistValidateBuildFuncOption(const char *value);
/* gistbuildbuffers.c */
extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
--
2.20.1
0003-Implement-GiST-build-using-sort-support.patchapplication/octet-stream; name=0003-Implement-GiST-build-using-sort-support.patch; x-unix-mode=0644Download
From c39f3ef547d03e02f2dfca6d011ca65aa7e6c8c3 Mon Sep 17 00:00:00 2001
From: Andrey <amborodin@acm.org>
Date: Sun, 25 Aug 2019 14:18:31 +0500
Subject: [PATCH 3/3] Implement GiST build using sort support
---
src/backend/access/gist/gistbuild.c | 204 +++++++++++++++++++++++++++-
src/backend/access/gist/gistutil.c | 30 +++-
src/backend/utils/sort/tuplesort.c | 82 +++++++++++
src/include/access/gist_private.h | 3 +
src/include/utils/tuplesort.h | 6 +
5 files changed, 312 insertions(+), 13 deletions(-)
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index e7112590b8..b49875e731 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -31,6 +31,7 @@
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/regproc.h"
+#include "utils/tuplesort.h"
/* Step of index tuples for check whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256
@@ -56,6 +57,15 @@ typedef enum
GIST_BUFFERING_ACTIVE /* in buffering build mode */
} GistBufferingMode;
+/*
+ * Status record for spooling/sorting phase.
+ */
+typedef struct
+{
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation index;
+} GSpool;
+
/* Working state for gistbuild and its callback */
typedef struct
{
@@ -77,7 +87,7 @@ typedef struct
HTAB *parentMap;
GistBufferingMode bufferingMode;
- FmgrInfo sortFn;
+ GSpool *spool;
} GISTBuildState;
/* prototypes for private functions */
@@ -111,6 +121,173 @@ static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
+static GSpool *gist_spoolinit(Relation heap, Relation index, SortSupport ssup);
+static void gist_spool(GSpool *gspool, ItemPointer self, Datum *values, bool *isnull);
+static void gist_indexsortbuild(GISTBuildState *state);
+static void gist_indexsortbuild_flush(Relation rel, Page page, BlockNumber blockno, bool isNew);
+
+/*
+ * create and initialize a spool structure to sort tuples
+ */
+static GSpool *
+gist_spoolinit(Relation heap, Relation index, SortSupport ssup)
+{
+ GSpool *gspool = (GSpool *) palloc0(sizeof(GSpool));
+
+ gspool->index = index;
+
+ /*
+ * We size the sort area as maintenance_work_mem rather than work_mem to
+ * speed index creation. This should be OK since a single backend can't
+ * run multiple index creations in parallel.
+ */
+ gspool->sortstate = tuplesort_begin_index_gist(heap,
+ index,
+ ssup,
+ maintenance_work_mem,
+ NULL,
+ false);
+
+ return gspool;
+}
+
+/*
+ * Flush page contents to actual page at blockno
+ * We have expected block number, because GiST build relies on that pages
+ * will be allocated in continous segments. This simplifies allocation
+ * logic.
+ */
+static void
+gist_indexsortbuild_flush(Relation rel, Page page, BlockNumber blockno, bool isNew)
+{
+ OffsetNumber i,
+ maxoff;
+ Page newpage;
+
+ Buffer buffer = ReadBuffer(rel, isNew ? P_NEW : blockno);
+ GISTInitBuffer(buffer, GistPageIsLeaf(page) ? F_LEAF : 0);
+ /* If the page is new - check that it was allocated correctly */
+ Assert(BufferGetBlockNumber(buffer) == blockno);
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ newpage = BufferGetPage(buffer);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ ItemId itemid = PageGetItemId(page, i);
+ IndexTuple tuple = (IndexTuple) PageGetItem(page, itemid);
+ gistfillbuffer(newpage, &tuple,
+ 1, InvalidOffsetNumber);
+ }
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * given a spool loaded by successive calls to _h_spool,
+ * create an entire index.
+ */
+static void
+gist_indexsortbuild(GISTBuildState *state)
+{
+ GSpool *gspool = state->spool;
+ /*
+ * Build constructs GiST by levels. Level is always allocated
+ * sequentially from level_start until level_end.
+ */
+ BlockNumber level_start = GIST_ROOT_BLKNO + 1;
+ BlockNumber level_end = level_start;
+ BlockNumber prev_level_start;
+ IndexTuple itup;
+ /*
+ * We keep one page in memory for the special case
+ * When layer will have only on page - we will place it
+ * to ROOT_BLOCK_NO
+ */
+ Page page = palloc(BLCKSZ);
+ gistinitpage(page, F_LEAF);
+
+ tuplesort_performsort(gspool->sortstate);
+
+ /* Create a first layer of leaf pages */
+ while ((itup = tuplesort_getindextuple(gspool->sortstate, true)) != NULL)
+ {
+ if (PageGetFreeSpace(page) >= IndexTupleSize(itup) + sizeof(ItemIdData))
+ {
+ gistfillbuffer(page, &itup, 1, InvalidOffsetNumber);
+ }
+ else
+ {
+ gist_indexsortbuild_flush(state->indexrel, page, level_end, true);
+ level_end++;
+ gistinitpage(page, F_LEAF);
+ gistfillbuffer(page, &itup, 1, InvalidOffsetNumber);
+ }
+ }
+
+ /* Construct internal levels */
+ do
+ {
+ /* If previous level had only one page - that page is a root */
+ if (level_start == level_end)
+ {
+ gist_indexsortbuild_flush(state->indexrel, page, GIST_ROOT_BLKNO,
+ false);
+ return;
+ }
+
+ gist_indexsortbuild_flush(state->indexrel, page, level_end, true);
+ level_end++;
+ gistinitpage(page, 0);
+
+ prev_level_start = level_start;
+ level_start = level_end;
+
+ for (BlockNumber i = prev_level_start; i < level_start; i++)
+ {
+ /* For each page on previous level we form one tuple */
+ Buffer lower_buffer = ReadBuffer(state->indexrel, i);
+ Page lower_page;
+ IndexTuple union_tuple;
+ MemoryContext oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
+ int vect_len;
+ LockBuffer(lower_buffer, GIST_SHARE);
+ lower_page = BufferGetPage(lower_buffer);
+
+ IndexTuple *itvec = gistextractpage(lower_page, &vect_len);
+ union_tuple = gistunion(state->indexrel, itvec, vect_len,
+ state->giststate);
+ ItemPointerSetBlockNumber(&(union_tuple->t_tid), i);
+
+ if (PageGetFreeSpace(page) >= IndexTupleSize(union_tuple) + sizeof(ItemIdData))
+ {
+ gistfillbuffer(page, &union_tuple, 1, InvalidOffsetNumber);
+ }
+ else
+ {
+ gist_indexsortbuild_flush(state->indexrel, page, level_end, true);
+ level_end++;
+ gistinitpage(page, 0);
+ gistfillbuffer(page, &union_tuple, 1, InvalidOffsetNumber);
+ }
+
+ UnlockReleaseBuffer(lower_buffer);
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(state->giststate->tempCxt);
+ }
+ } while (true);
+}
+
+/*
+ * spool an index entry into the sort file.
+ */
+static void
+gist_spool(GSpool *gspool, ItemPointer self, Datum *values, bool *isnull)
+{
+ tuplesort_putindextuplevalues(gspool->sortstate, gspool->index,
+ self, values, isnull);
+}
+
/*
* Main entry point to GiST index build. Initially calls insert over and over,
* but switches to more efficient buffering build algorithm after a certain
@@ -129,7 +306,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
buildstate.indexrel = index;
buildstate.heaprel = heap;
- buildstate.sortFn.fn_oid = InvalidOid;
+ buildstate.spool = NULL;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -144,13 +321,16 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
buildstate.bufferingMode = GIST_BUFFERING_AUTO;
if (options->vl_len_ > offsetof(GiSTOptions, buildSortFunctionOffset) &&
- options->bufferingModeOffset > 0)
+ options->buildSortFunctionOffset > 0)
{
char *sortFuncName = (char *) options + options->buildSortFunctionOffset;
Oid argList[1] = {INTERNALOID};
List *namelist = stringToQualifiedNameList(sortFuncName);
Oid sortFuncOid = LookupFuncName(namelist, 1, argList, false);
- fmgr_info(sortFuncOid, &buildstate.sortFn);
+ SortSupport sort = palloc0(sizeof(SortSupportData));
+ OidFunctionCall1(sortFuncOid, PointerGetDatum(sort));
+ buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
+ buildstate.spool = gist_spoolinit(heap, index, sort);
}
fillfactor = options->fillfactor;
@@ -223,6 +403,12 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
gistFreeBuildBuffers(buildstate.gfbb);
}
+ if (buildstate.spool)
+ {
+ gist_indexsortbuild(&buildstate);
+ tuplesort_end(buildstate.spool->sortstate);
+ }
+
/* okay, all heap tuples are indexed */
MemoryContextSwitchTo(oldcxt);
MemoryContextDelete(buildstate.giststate->tempCxt);
@@ -498,14 +684,20 @@ gistBuildCallback(Relation index,
GISTBuildState *buildstate = (GISTBuildState *) state;
IndexTuple itup;
MemoryContext oldCtx;
+ Datum compressed_values[INDEX_MAX_KEYS];
oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
/* form an index tuple and point it at the heap tuple */
- itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
+ itup = gistCompressValuesAndFormTuple(buildstate->giststate, index, values, isnull, true, compressed_values);
itup->t_tid = htup->t_self;
- if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
+ if (buildstate->spool)
+ {
+ if (tupleIsAlive)
+ gist_spool(buildstate->spool, (ItemPointer)itup, compressed_values, isnull);
+ }
+ else if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
{
/* We have buffers, so use them. */
gistBufferingBuildInsert(buildstate, itup);
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 6e2ff95b5c..330ef3a2af 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -174,6 +174,7 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
datum = index_getattr(itvec[j], i + 1, giststate->leafTupdesc,
&IsNull);
+
if (IsNull)
continue;
@@ -576,6 +577,13 @@ gistFormTuple(GISTSTATE *giststate, Relation r,
Datum attdata[], bool isnull[], bool isleaf)
{
Datum compatt[INDEX_MAX_KEYS];
+ return gistCompressValuesAndFormTuple(giststate, r, attdata, isnull, isleaf, compatt);
+}
+
+IndexTuple
+gistCompressValuesAndFormTuple(GISTSTATE *giststate, Relation r,
+ Datum attdata[], bool isnull[], bool isleaf, Datum compatt[])
+{
int i;
IndexTuple res;
@@ -746,14 +754,10 @@ gistpenalty(GISTSTATE *giststate, int attno,
* Initialize a new index page
*/
void
-GISTInitBuffer(Buffer b, uint32 f)
+gistinitpage(Page page, uint32 f)
{
- GISTPageOpaque opaque;
- Page page;
- Size pageSize;
-
- pageSize = BufferGetPageSize(b);
- page = BufferGetPage(b);
+ GISTPageOpaque opaque;
+ Size pageSize = BLCKSZ;
PageInit(page, pageSize, sizeof(GISTPageOpaqueData));
opaque = GistPageGetOpaque(page);
@@ -764,6 +768,18 @@ GISTInitBuffer(Buffer b, uint32 f)
opaque->gist_page_id = GIST_PAGE_ID;
}
+/*
+ * Initialize a new index buffer
+ */
+void
+GISTInitBuffer(Buffer b, uint32 f)
+{
+ Page page;
+
+ page = BufferGetPage(b);
+ gistinitpage(page, f);
+}
+
/*
* Verify that a freshly-read page looks sane.
*/
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 7b8e67899e..3249f3eb69 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -630,6 +630,8 @@ static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
+static int comparetup_index_gist(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
static void writetup_index(Tuplesortstate *state, int tapenum,
SortTuple *stup);
@@ -1095,6 +1097,43 @@ tuplesort_begin_index_hash(Relation heapRel,
return state;
}
+Tuplesortstate *
+tuplesort_begin_index_gist(Relation heapRel,
+ Relation indexRel,
+ SortSupport ssup,
+ int workMem,
+ SortCoordinate coordinate,
+ bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
+ randomAccess);
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(state->sortcontext);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "begin index sort: workMem = %d, randomAccess = %c",
+ workMem, randomAccess ? 't' : 'f');
+#endif
+
+ state->nKeys = 1; /* Only one sort column, the gist attribute */
+
+ state->comparetup = comparetup_index_gist;
+ state->copytup = copytup_index;
+ state->writetup = writetup_index;
+ state->readtup = readtup_index;
+ state->sortKeys = ssup;
+
+ state->heapRel = heapRel;
+ state->indexRel = indexRel;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return state;
+}
+
Tuplesortstate *
tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
bool nullsFirstFlag, int workMem,
@@ -4138,6 +4177,49 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
return 0;
}
+static int
+comparetup_index_gist(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state)
+{
+ SortSupport sortKey = state->sortKeys;
+ IndexTuple tuple1;
+ IndexTuple tuple2;
+ int32 compare;
+
+
+ /* Compare the leading sort key */
+ compare = ApplySortComparator(a->datum1, a->isnull1,
+ b->datum1, b->isnull1,
+ sortKey);
+ if (compare != 0)
+ return compare;
+
+ tuple1 = (IndexTuple) a->tuple;
+ tuple2 = (IndexTuple) b->tuple;
+ /*
+ * If key values are equal, we sort on ItemPointer.
+ */
+ {
+ BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
+ BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
+
+ if (blk1 != blk2)
+ return (blk1 < blk2) ? -1 : 1;
+ }
+ {
+ OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
+ OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
+
+ if (pos1 != pos2)
+ return (pos1 < pos2) ? -1 : 1;
+ }
+
+ /* ItemPointer values should never be equal */
+ Assert(false);
+
+ return 0;
+}
+
static void
copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
{
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index b4b6883f6d..ef9afdf1ca 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -487,12 +487,15 @@ extern IndexTuple gistgetadjusted(Relation r,
GISTSTATE *giststate);
extern IndexTuple gistFormTuple(GISTSTATE *giststate,
Relation r, Datum *attdata, bool *isnull, bool isleaf);
+extern IndexTuple gistCompressValuesAndFormTuple(GISTSTATE *giststate, Relation r,
+ Datum attdata[], bool isnull[], bool isleaf, Datum compatt[]);
extern OffsetNumber gistchoose(Relation r, Page p,
IndexTuple it,
GISTSTATE *giststate);
extern void GISTInitBuffer(Buffer b, uint32 f);
+extern void gistinitpage(Page page, uint32 f);
extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
Datum k, Relation r, Page pg, OffsetNumber o,
bool l, bool isNull);
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 4521de18e1..3f1ceac3d2 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -26,6 +26,7 @@
#include "fmgr.h"
#include "storage/dsm.h"
#include "utils/relcache.h"
+#include "utils/sortsupport.h"
/*
@@ -209,6 +210,11 @@ extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
uint32 max_buckets,
int workMem, SortCoordinate coordinate,
bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_index_gist(Relation heapRel,
+ Relation indexRel,
+ SortSupport ssup,
+ int workMem, SortCoordinate coordinate,
+ bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
Oid sortOperator, Oid sortCollation,
bool nullsFirstFlag,
--
2.20.1
0002-Add-sort-support-for-point-gist_point_sortsupport.patchapplication/octet-stream; name=0002-Add-sort-support-for-point-gist_point_sortsupport.patch; x-unix-mode=0644Download
From ac8bb2980444efcfb99b9b4e22630e9becf46c4f Mon Sep 17 00:00:00 2001
From: Andrey <amborodin@acm.org>
Date: Sun, 25 Aug 2019 12:42:36 +0500
Subject: [PATCH 2/3] Add sort support for point gist_point_sortsupport
---
src/backend/access/gist/gistproc.c | 57 ++++++++++++++++++++++++++++++
src/include/catalog/pg_proc.dat | 3 ++
2 files changed, 60 insertions(+)
diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c
index 1826b51bbb..e2702af280 100644
--- a/src/backend/access/gist/gistproc.c
+++ b/src/backend/access/gist/gistproc.c
@@ -24,6 +24,7 @@
#include "utils/builtins.h"
#include "utils/float.h"
#include "utils/geo_decls.h"
+#include "utils/sortsupport.h"
static bool gist_box_leaf_consistent(BOX *key, BOX *query,
@@ -1530,3 +1531,59 @@ gist_poly_distance(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(distance);
}
+
+static int64
+part_bits32_by2(uint32 x)
+{
+ uint64 n = x;
+
+ n = (n | (n << 16)) & 0x0000FFFF0000FFFF;
+ n = (n | (n << 8)) & 0x00FF00FF00FF00FF;
+ n = (n | (n << 4)) & 0x0F0F0F0F0F0F0F0F;
+ n = (n | (n << 2)) & 0x3333333333333333;
+ n = (n | (n << 1)) & 0x5555555555555555;
+
+ return n;
+}
+
+static int64
+interleave_bits32(uint32 x, uint32 y)
+{
+ return part_bits32_by2(x) | (part_bits32_by2(y) << 1);
+}
+
+static inline uint64
+point_zorder_internal(Point *p)
+{
+ union {
+ float f;
+ uint32 i;
+ } a,b;
+ a.f = p->x;
+ b.f = p->y;
+ return interleave_bits32(a.i, b.i);
+}
+
+static int
+gist_point_fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ Point *p1 = DatumGetPointP(x);
+ Point *p2 = DatumGetPointP(y);
+ uint64 z1 = point_zorder_internal(p1);
+ uint64 z2 = point_zorder_internal(p2);
+
+ return z1 == z2 ? 0 : z1 > z2 ? 1 : -1;
+}
+
+Datum
+gist_point_sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = gist_point_fastcmp;
+ PG_RETURN_VOID();
+}
+
+
+//create table x as select point (random(),random()) from generate_series(1,1000,1);
+//create index ON x using gist (point ) with (fast_build_sort_function=gist_point_sortsupport);
\ No newline at end of file
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 87335248a0..de3cb2f1e3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7849,6 +7849,9 @@
proname => 'gist_poly_distance', prorettype => 'float8',
proargtypes => 'internal polygon int2 oid internal',
prosrc => 'gist_poly_distance' },
+{ oid => '3429', descr => 'sort support',
+ proname => 'gist_point_sortsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'gist_point_sortsupport' },
# GIN array support
{ oid => '2743', descr => 'GIN array support',
--
2.20.1
Hello,
This is very interesting. In my pipeline currently GiST index rebuild is
the biggest time consuming step.
I believe introducing optional concept of order in the GiST opclass will be
beneficial not only for fast build, but for other tasks later:
- CLUSTER can order the table using that notion, in parallel way.
- btree_gist can be even closer to btree by getting the tuples sorted
inside page.
- tree descend on insertion in future can traverse the list in more
opportunistic way, calculating penalty for siblings-by-order first.
I believe everywhere the idea of ordering is needed it's provided by giving
a btree opclass.
How about giving a link to btree opclass inside a gist opclass?
On Mon, Aug 26, 2019 at 10:59 AM Andrey Borodin <x4mmm@yandex-team.ru>
wrote:
Hi!
In many cases GiST index can be build fast using z-order sorting.
I've looked into proof of concept by Nikita Glukhov [0] and it looks very
interesting.
So, I've implemented yet another version of B-tree-like GiST build.
It's main use case and benefits can be summarized with small example:postgres=# create table x as select point (random(),random()) from
generate_series(1,3000000,1);
SELECT 3000000
Time: 5061,967 ms (00:05,062)
postgres=# create index ON x using gist (point ) with
(fast_build_sort_function=gist_point_sortsupport);
CREATE INDEX
Time: 6140,227 ms (00:06,140)
postgres=# create index ON x using gist (point );
CREATE INDEX
Time: 32061,200 ms (00:32,061)As you can see, Z-order build is on order of magnitude faster. Select
performance is roughly the same. Also, index is significantly smaller.Nikita's PoC is faster because it uses parallel build, but it intervenes
into B-tree code a lot (for reuse). This patchset is GiST-isolated.
My biggest concern is that passing function to relation option seems a bit
hacky. You can pass there any function matching sort support signature.
Embedding this function into opclass makes no sense: it does not affect
scan anyhow.In current version, docs and tests are not implemented. I want to discuss
overall design. Do we really want yet another GiST build, if it is 3-10
times faster?Thanks!
Best regards, Andrey Borodin.
[0]
https://github.com/postgres/postgres/compare/master...glukhovn:gist_btree_build
--
Darafei Praliaskouski
Support me: http://patreon.com/komzpa
On 26/08/2019 10:59, Andrey Borodin wrote:
Hi!
In many cases GiST index can be build fast using z-order sorting.
I've looked into proof of concept by Nikita Glukhov [0] and it looks very interesting.
So, I've implemented yet another version of B-tree-like GiST build.
Cool!
My biggest concern is that passing function to relation option seems
a bit hacky. You can pass there any function matching sort support
signature. Embedding this function into opclass makes no sense: it
does not affect scan anyhow.
I think it should be a new, optional, GiST "AM support function", in
pg_amproc. That's how the sort support functions are defined for B-tree
opfamilies, too.
- Heikki
On Mon, Aug 26, 2019 at 10:59 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
In many cases GiST index can be build fast using z-order sorting.
I've looked into proof of concept by Nikita Glukhov [0] and it looks very interesting.
So, I've implemented yet another version of B-tree-like GiST build.
It's main use case and benefits can be summarized with small example:postgres=# create table x as select point (random(),random()) from generate_series(1,3000000,1);
SELECT 3000000
Time: 5061,967 ms (00:05,062)
postgres=# create index ON x using gist (point ) with (fast_build_sort_function=gist_point_sortsupport);
CREATE INDEX
Time: 6140,227 ms (00:06,140)
postgres=# create index ON x using gist (point );
CREATE INDEX
Time: 32061,200 ms (00:32,061)As you can see, Z-order build is on order of magnitude faster. Select performance is roughly the same. Also, index is significantly smaller.
Cool! These experiments bring me to following thoughts. Can we not
only build, but also maintain GiST indexes in B-tree-like manner? If
we put Z-value together with MBR to the non-leaf keys, that should be
possible. Maintaining it in B-tree-like manner would have a lot of
advantages.
1) Binary search in non-leaf pages instead of probing each key is much faster.
2) Split algorithm is also simpler and faster.
3) We can refind existing index tuple in predictable time of O(log N).
And this doesn't depend on MBR overlapping degree. This is valuable
for future table AMs, which would have notion of deleting individual
index tuples (for instance, zheap promises to eventually support
delete-marking indexes).
Eventually, we may come to the idea of B-tree indexes with
user-defined additional keys in non-leaf tuples, which could be used
for scanning.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Thu, Aug 29, 2019 at 3:48 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
As you can see, Z-order build is on order of magnitude faster. Select performance is roughly the same. Also, index is significantly smaller.
Cool! These experiments bring me to following thoughts. Can we not
only build, but also maintain GiST indexes in B-tree-like manner? If
we put Z-value together with MBR to the non-leaf keys, that should be
possible. Maintaining it in B-tree-like manner would have a lot of
advantages.
I'm not an expert on GiST, but that seems like it would have a lot of
advantages in the long term. It is certainly theoretically appealing.
Could this make it easier to use merge join with containment
operators? I'm thinking of things like geospatial joins, which can
generally only be performed as nested loop joins at the moment. This
is often wildly inefficient.
--
Peter Geoghegan
On Fri, Aug 30, 2019 at 2:34 AM Peter Geoghegan <pg@bowt.ie> wrote:
On Thu, Aug 29, 2019 at 3:48 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:As you can see, Z-order build is on order of magnitude faster. Select performance is roughly the same. Also, index is significantly smaller.
Cool! These experiments bring me to following thoughts. Can we not
only build, but also maintain GiST indexes in B-tree-like manner? If
we put Z-value together with MBR to the non-leaf keys, that should be
possible. Maintaining it in B-tree-like manner would have a lot of
advantages.I'm not an expert on GiST, but that seems like it would have a lot of
advantages in the long term. It is certainly theoretically appealing.Could this make it easier to use merge join with containment
operators? I'm thinking of things like geospatial joins, which can
generally only be performed as nested loop joins at the moment. This
is often wildly inefficient.
AFAICS, spatial joins aren't going to be as easy as just merge joins
on Z-value. When searching for close points in two datasets (closer
than given threshold) we can scan some ranges of Z-value in one
dataset while iterating on another. But dealing with prolonged
spatial objects in not that easy. In order to determine if there are
matching values in given Z-range you also need to be aware on size of
objects inside that Z-range. So, before merge-like join you need to
not only sort, but build some index-like structure.
Alternatively you can encode size in Z-value. But this increases
dimensionality of space and decreases efficiency of join. Also,
spatial join can be made using two indexes, even just current GiST
without Z-values. We've prototyped that, see [1].
Links
1. https://github.com/pgsphere/pgsphere/blob/crossmatch_cnode/crossmatch.c
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Thu, Aug 29, 2019 at 8:22 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:
Alternatively you can encode size in Z-value. But this increases
dimensionality of space and decreases efficiency of join. Also,
spatial join can be made using two indexes, even just current GiST
without Z-values. We've prototyped that, see [1].
I'm pretty sure that spatial joins generally need two spatial indexes
(usually R-Trees). There seems to have been quite a lot of research in
it in the 1990s.
--
Peter Geoghegan
30 авг. 2019 г., в 3:47, Alexander Korotkov <a.korotkov@postgrespro.ru> написал(а):
1) Binary search in non-leaf pages instead of probing each key is much faster.
That's a neat idea, but key union breaks ordering, even for z-order.
for two sets of tuples X and Y
if for any i,o from N, Xi < Yo
does not guaranty union(X) < union (Y)
For example consider this z-ordered keyspace (picture attached)
union(5, 9) is z-order-smaller than union(4,4)
I'm not even sure we can use sorted search for choosing subtree for insertion.
How do you think, should I supply GiST-build patch with docs and tests and add it to CF? Or do we need more design discussion before?
Best regards, Andrey Borodin.
Attachments:
PastedGraphic-2.pngimage/png; name=PastedGraphic-2.pngDownload
�PNG
IHDR � � �]� sRGB ��� @ IDATx���[���K ����*T��{g������o��/�w���d�I�[�-��U)J���B� ���ke9'�,�k������7�?������n�;444<<B���t�����N��j�n|u�/������z���J%��Q2jQ��jw�Gr�,@1��� z�� �J���nwd����P���D�#9���T+rH�,L}�"������d�����-�
�n'>�n*�GM����@��F�������S��v�=��
��h�4X%�����e�����>�z����^�t�R�#� B��.�
�r��<�}���d����.%���!>HG�f�X�%��`�4����7n�(��^�|yaa�\.Ga*���O$=��9XHr
e��r;P� [�w�������W��=\��FrDZ]B ���������<~�Go///S&�����<�`Y������B�E�����,fn&+�� k�$�*�6�d��"��"�s0e�s �Gj"�~��\�:V5����@=2�$�+3 Z����D����(&��
y���'p�E��u�R�R�l�M��\
��3��:A�G�C!DK���Gp %Q4r2^TZHC��@p@K�&e��{Ai��D"��#��|���:��9X �
F)�d���x�m����-�31�B����7��Z��&
[�/�5� Y�b=KYpZ��
4f���3C>$�ab�p� �������qF�E������ib����ok���iwN -�y�@;�N�������0h�`� �Sn�F���j��'M�����3�B\Oh��o@#N�>X}����3x��o� #�2��68�<�ri��V�P����@��S���*���`���~�2�#!j��>9A��0��0fweWz�D<cH=��_�����JJ_�L
��P,�-���t����MJ����8����O��&�
n�NI����6D�`hD+���f}1C��l��=i[N�%�&�$8�J��b�RRK
CP�v#�<)%iU�vu���)��uH�����
���Q�m�G�#���+:��)~
��-q6�5���i�*�;��dG)dg,K��@:r��`~�EM�o2Z�&��4;��������H���D�5�+mf�/��N�1bQ�DB2q��+�#��h�X�>q��,���XO
�6���PYF��������Hc�=-`2�uFp��$ ��0��X�T-".�;XI �rP "[r�w1J:c�������9=pRN"��� k���PVK�E@:�L�"Y8�b���cL%���<�q>�y��Q�SZ�qG$���{��#L �e�\��|�}n���$���6��%:a�\�M$�dab�lJ����$� I[
��"U8E��*7��`��@���<� 2 ��m4�SI���."������+�-�HHqO����bTN��KXR�D+��I�8��D��TIg�����d���@�X,#�����",+� �@�����
�A��+w&�X�#�Adc�0�����zj�DC�QTU����/ARu���H�"�L����u
&`+����4��f��aKM�`QE$��l�A3�N��'�vo6�0�E~���yl;L���� ����`W�6���6!���P��xaD��DC�%��1�����@��R}Mb<���}�-�G�0F���V��b�d#�`W/��d9�"9���0!��%�PN������P���&�VcOnY4����]`��V���H�o�=�1`��D��EI!�������(zdPP���8>@c@�h���l\�`6����A�������~*��{���p�[n�4�4�na�1���
$��=;a���S��'f]�)��B�pO���K|"�����J$0K�~���gr*���N�������'�0Hz&<�8(d���"�u�*��U��DD�P0��Q�*+�����n�C����9?g*G��Y��6�����>