gist vacuum gist access
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Attachments:
gist-vacuum-more-seaq-access.patchtext/x-diff; name=gist-vacuum-more-seaq-access.patchDownload
*** a/src/backend/access/gist/gistvacuum.c
--- b/src/backend/access/gist/gistvacuum.c
***************
*** 101,134 **** gistvacuumcleanup(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(stats);
}
- typedef struct GistBDItem
- {
- GistNSN parentlsn;
- BlockNumber blkno;
- struct GistBDItem *next;
- } GistBDItem;
-
- static void
- pushStackIfSplited(Page page, GistBDItem *stack)
- {
- GISTPageOpaque opaque = GistPageGetOpaque(page);
-
- if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
- (GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page)) &&
- opaque->rightlink != InvalidBlockNumber /* sanity check */ )
- {
- /* split page detected, install right link to the stack */
-
- GistBDItem *ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
-
- ptr->blkno = opaque->rightlink;
- ptr->parentlsn = stack->parentlsn;
- ptr->next = stack->next;
- stack->next = ptr;
- }
- }
-
-
/*
* Bulk deletion of all index entries pointing to a set of heap tuples and
* check invalid tuples left after upgrade.
--- 101,106 ----
***************
*** 145,283 **** gistbulkdelete(PG_FUNCTION_ARGS)
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
void *callback_state = (void *) PG_GETARG_POINTER(3);
Relation rel = info->index;
! GistBDItem *stack,
! *ptr;
!
! /* first time through? */
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
- /* we'll re-count the tuples each time */
stats->estimated_count = false;
stats->num_index_tuples = 0;
! stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
! stack->blkno = GIST_ROOT_BLKNO;
!
! while (stack)
! {
! Buffer buffer;
! Page page;
! OffsetNumber i,
! maxoff;
! IndexTuple idxtuple;
! ItemId iid;
!
! buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
! RBM_NORMAL, info->strategy);
! LockBuffer(buffer, GIST_SHARE);
! gistcheckpage(rel, buffer);
! page = (Page) BufferGetPage(buffer);
!
! if (GistPageIsLeaf(page))
{
! OffsetNumber todelete[MaxOffsetNumber];
! int ntodelete = 0;
!
! LockBuffer(buffer, GIST_UNLOCK);
! LockBuffer(buffer, GIST_EXCLUSIVE);
page = (Page) BufferGetPage(buffer);
- if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
- {
- /* only the root can become non-leaf during relock */
- UnlockReleaseBuffer(buffer);
- /* one more check */
- continue;
- }
-
- /*
- * check for split proceeded after look at parent, we should check
- * it after relock
- */
- pushStackIfSplited(page, stack);
! /*
! * Remove deletable tuples from page
! */
!
! maxoff = PageGetMaxOffsetNumber(page);
!
! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
! iid = PageGetItemId(page, i);
! idxtuple = (IndexTuple) PageGetItem(page, iid);
!
! if (callback(&(idxtuple->t_tid), callback_state))
{
! todelete[ntodelete] = i - ntodelete;
! ntodelete++;
! stats->tuples_removed += 1;
}
- else
- stats->num_index_tuples += 1;
- }
-
- if (ntodelete)
- {
- START_CRIT_SECTION();
-
- MarkBufferDirty(buffer);
-
- for (i = 0; i < ntodelete; i++)
- PageIndexTupleDelete(page, todelete[i]);
- GistMarkTuplesDeleted(page);
! if (RelationNeedsWAL(rel))
{
! XLogRecPtr recptr;
! recptr = gistXLogUpdate(rel->rd_node, buffer,
! todelete, ntodelete,
! NULL, 0, InvalidBuffer);
! PageSetLSN(page, recptr);
! }
! else
! PageSetLSN(page, gistGetFakeLSN(rel));
!
! END_CRIT_SECTION();
! }
! }
! else
! {
! /* check for split proceeded after look at parent */
! pushStackIfSplited(page, stack);
!
! maxoff = PageGetMaxOffsetNumber(page);
! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
! {
! iid = PageGetItemId(page, i);
! idxtuple = (IndexTuple) PageGetItem(page, iid);
! ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
! ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
! ptr->parentlsn = PageGetLSN(page);
! ptr->next = stack->next;
! stack->next = ptr;
! if (GistTupleIsInvalid(idxtuple))
! ereport(LOG,
! (errmsg("index \"%s\" contains an inner tuple marked as invalid",
! RelationGetRelationName(rel)),
! errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
! errhint("Please REINDEX it.")));
}
- }
-
- UnlockReleaseBuffer(buffer);
! ptr = stack->next;
! pfree(stack);
! stack = ptr;
!
! vacuum_delay_point();
}
PG_RETURN_POINTER(stats);
}
--- 117,208 ----
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
void *callback_state = (void *) PG_GETARG_POINTER(3);
Relation rel = info->index;
! BlockNumber blkno = GIST_ROOT_BLKNO + 1;
! bool needLock = !RELATION_IS_LOCAL(rel);
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
stats->estimated_count = false;
stats->num_index_tuples = 0;
! for(;;) {
! BlockNumber npages;
! if (needLock)
! LockRelationForExtension(rel, ExclusiveLock);
! npages = RelationGetNumberOfBlocks(rel);
! if (needLock)
! UnlockRelationForExtension(rel, ExclusiveLock);
! if (blkno >= npages)
! break;
! for ( ; blkno < npages; blkno++)
{
! Buffer buffer;
! Page page;
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, info->strategy);
+ LockBuffer(buffer, GIST_SHARE);
+ gistcheckpage(rel, buffer);
page = (Page) BufferGetPage(buffer);
! if (GistPageIsLeaf(page))
{
! OffsetNumber todelete[MaxOffsetNumber];
! int ntodelete = 0;
! OffsetNumber i,
! maxoff;
! IndexTuple idxtuple;
! ItemId iid;
! LockBuffer(buffer, GIST_UNLOCK);
! LockBuffer(buffer, GIST_EXCLUSIVE);
!
! page = (Page) BufferGetPage(buffer);
! maxoff = PageGetMaxOffsetNumber(page);
!
! for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
! iid = PageGetItemId(page, i);
! idxtuple = (IndexTuple) PageGetItem(page, iid);
!
! if (callback(&(idxtuple->t_tid), callback_state))
! {
! todelete[ntodelete] = i - ntodelete;
! ntodelete++;
! stats->tuples_removed += 1;
! }
! else
! stats->num_index_tuples += 1;
}
! if (ntodelete)
{
! START_CRIT_SECTION();
! MarkBufferDirty(buffer);
! for (i = 0; i < ntodelete; i++)
! PageIndexTupleDelete(page, todelete[i]);
! GistMarkTuplesDeleted(page);
! if (RelationNeedsWAL(rel))
! {
! XLogRecPtr recptr;
! recptr = gistXLogUpdate(rel->rd_node, buffer,
! todelete, ntodelete,
! NULL, 0, InvalidBuffer);
! PageSetLSN(page, recptr);
! }
! else
! PageSetLSN(page, gistGetFakeLSN(rel));
! END_CRIT_SECTION();
! }
}
! UnlockReleaseBuffer(buffer);
! }
}
+
PG_RETURN_POINTER(stats);
}
On 09/07/2014 05:11 PM, Костя Кузнецов wrote:
hello.
i recode vacuum for gist index.
all tests is ok.
also i test vacuum on table size 2 million rows. all is ok.
on my machine old vacuum work about 9 second. this version work about 6-7 sec .
review please.
If I'm reading this correctly, the patch changes gistbulkdelete to scan
the index in physical order, while the old code starts from the root and
scans the index from left to right, in logical order.
Scanning the index in physical order is wrong, if any index pages are
split while vacuum runs. A page split could move some tuples to a
lower-numbered page, so that the vacuum will not scan those tuples.
In the b-tree code, we solved that problem back in 2006, so it can be
done but requires a bit more code. In b-tree, we solved it with a
"vacuum cycle ID" number that's set on the page halves when a page is
split. That allows VACUUM to identify pages that have been split
concurrently, before it sees them, and "jump back" to vacuum them too. See commit
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=5749f6ef0cc1c67ef9c9ad2108b3d97b82555c80.
It should be possible to do something similar in GiST, and in fact you
might be able to reuse the NSN field that's already set on the page
halves on split, instead of adding a new "vacuum cycle ID".
- Heikki
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <hlinnakangas@vmware.com
wrote:
On 09/07/2014 05:11 PM, Костя Кузнецов wrote:
hello.
i recode vacuum for gist index.
all tests is ok.
also i test vacuum on table size 2 million rows. all is ok.
on my machine old vaccum work about 9 second. this version work about 6-7
sec .
review please.If I'm reading this correctly, the patch changes gistbulkdelete to scan
the index in physical order, while the old code starts from the root and
scans the index from left to right, in logical order.Scanning the index in physical order is wrong, if any index pages are
split while vacuum runs. A page split could move some tuples to a
lower-numbered page, so that the vacuum will not scan those tuples.In the b-tree code, we solved that problem back in 2006, so it can be done
but requires a bit more code. In b-tree, we solved it with a "vacuum cycle
ID" number that's set on the page halves when a page is split. That allows
VACUUM to identify pages that have been split concurrently sees them, and
"jump back" to vacuum them too. See commit http://git.postgresql.org/
gitweb/?p=postgresql.git;a=commit;h=5749f6ef0cc1c67ef9c9ad2108b3d9
7b82555c80. It should be possible to do something similar in GiST, and in
fact you might be able to reuse the NSN field that's already set on the
page halves on split, instead of adding a new "vacuum cycle ID".
Idea is right. But in fact, does GiST ever recycle any page? It has
F_DELETED flag, but ISTM this flag is never set. So, I think it's possible
that this patch is working correctly. However, probably GiST sometimes
leaves new page unused due to server crash.
Anyway, I'm not fan of committing patch in this shape. We need to let GiST
recycle pages first, then implement VACUUM similar to b-tree.
------
With best regards,
Alexander Korotkov.
On Mon, Sep 8, 2014 at 12:08 PM, Alexander Korotkov <aekorotkov@gmail.com>
wrote:
On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <
hlinnakangas@vmware.com> wrote:On 09/07/2014 05:11 PM, Костя Кузнецов wrote:
hello.
i recode vacuum for gist index.
all tests is ok.
also i test vacuum on table size 2 million rows. all is ok.
on my machine old vaccum work about 9 second. this version work about
6-7 sec .
review please.If I'm reading this correctly, the patch changes gistbulkdelete to scan
the index in physical order, while the old code starts from the root and
scans the index from left to right, in logical order.Scanning the index in physical order is wrong, if any index pages are
split while vacuum runs. A page split could move some tuples to a
lower-numbered page, so that the vacuum will not scan those tuples.In the b-tree code, we solved that problem back in 2006, so it can be
done but requires a bit more code. In b-tree, we solved it with a "vacuum
cycle ID" number that's set on the page halves when a page is split. That
allows VACUUM to identify pages that have been split concurrently sees
them, and "jump back" to vacuum them too. See commit
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=
5749f6ef0cc1c67ef9c9ad2108b3d97b82555c80. It should be possible to do
something similar in GiST, and in fact you might be able to reuse the NSN
field that's already set on the page halves on split, instead of adding a
new "vacuum cycle ID".Idea is right. But in fact, does GiST ever recycle any page? It has
F_DELETED flag, but ISTM this flag is never set. So, I think it's possible
that this patch is working correctly. However, probably GiST sometimes
leaves new page unused due to server crash.
Anyway, I'm not fan of committing patch in this shape. We need to let GiST
recycle pages first, then implement VACUUM similar to b-tree.
Another note. Assuming we have NSN which can play the role of "vacuum cycle
ID", can we implement sequential (with possible "jump back") index scan for
GiST?
------
With best regards,
Alexander Korotkov.
On 09/08/2014 11:08 AM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <hlinnakangas@vmware.com
wrote:
On 09/07/2014 05:11 PM, Костя Кузнецов wrote:
hello.
i recode vacuum for gist index.
all tests is ok.
also i test vacuum on table size 2 million rows. all is ok.
on my machine old vaccum work about 9 second. this version work about 6-7
sec .
review please.If I'm reading this correctly, the patch changes gistbulkdelete to scan
the index in physical order, while the old code starts from the root and
scans the index from left to right, in logical order.Scanning the index in physical order is wrong, if any index pages are
split while vacuum runs. A page split could move some tuples to a
lower-numbered page, so that the vacuum will not scan those tuples.In the b-tree code, we solved that problem back in 2006, so it can be done
but requires a bit more code. In b-tree, we solved it with a "vacuum cycle
ID" number that's set on the page halves when a page is split. That allows
VACUUM to identify pages that have been split concurrently sees them, and
"jump back" to vacuum them too. See commit http://git.postgresql.org/
gitweb/?p=postgresql.git;a=commit;h=5749f6ef0cc1c67ef9c9ad2108b3d9
7b82555c80. It should be possible to do something similar in GiST, and in
fact you might be able to reuse the NSN field that's already set on the
page halves on split, instead of adding a new "vacuum cycle ID".Idea is right. But in fact, does GiST ever recycle any page? It has
F_DELETED flag, but ISTM this flag is never set. So, I think it's possible
that this patch is working correctly. However, probably GiST sometimes
leaves new page unused due to server crash.
Hmm. Its correctness depends on the fact that when a page is split, the
new right page is always put on a higher-numbered page than the old
page, which hinges on the fact that we never recycle pages.
We used to delete pages in VACUUM FULL, but that got removed when
old-style VACUUM FULL was changed to just rewrite the whole heap. So if
you have pg_upgraded from an old index, it's possible that you still
have some F_DELETED pages in the tree. And then there's the case of
unused pages after crash that you mentioned. So no, you can't really
rely on that. We could of course remove the code that checks the FSM
altogether, and always extend the relation on page split. But of course
the long-term solution is to allow recycling pages.
Anyway, I'm not fan of committing patch in this shape. We need to let GiST
recycle pages first, then implement VACUUM similar to b-tree.
+1. Although I guess we could implement the b-tree style strategy first,
and implement page recycling later. It's just a bit hard to test that
it's correct, when there is no easy way to get deleted pages in the
index to begin with.
- Heikki
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 09/08/2014 11:19 AM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 12:08 PM, Alexander Korotkov <aekorotkov@gmail.com>
wrote:On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <
hlinnakangas@vmware.com> wrote:In the b-tree code, we solved that problem back in 2006, so it can be
done but requires a bit more code. In b-tree, we solved it with a "vacuum
cycle ID" number that's set on the page halves when a page is split. That
allows VACUUM to identify pages that have been split concurrently sees
them, and "jump back" to vacuum them too. See commit
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=
5749f6ef0cc1c67ef9c9ad2108b3d97b82555c80. It should be possible to do
something similar in GiST, and in fact you might be able to reuse the NSN
field that's already set on the page halves on split, instead of adding a
new "vacuum cycle ID"....
Another note. Assuming we have NSN which can play the role of "vacuum cycle
ID", can we implement sequential (with possible "jump back") index scan for
GiST?
Yeah, I think it would work. It's pretty straightforward, the page split
code already sets the NSN, just when we need it. Vacuum needs to
memorize the current NSN when it begins, and whenever it sees a page
with a higher NSN (or the FOLLOW_RIGHT flag is set), follow the
right-link if it points to lower-numbered page.
- Heikki
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Sep 8, 2014 at 12:51 PM, Heikki Linnakangas <hlinnakangas@vmware.com
wrote:
On 09/08/2014 11:19 AM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 12:08 PM, Alexander Korotkov <aekorotkov@gmail.com
wrote:
On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <
hlinnakangas@vmware.com> wrote:
In the b-tree code, we solved that problem back in 2006, so it can be
done but requires a bit more code. In b-tree, we solved it with a
"vacuum
cycle ID" number that's set on the page halves when a page is split.
That
allows VACUUM to identify pages that have been split concurrently sees
them, and "jump back" to vacuum them too. See commit
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=
5749f6ef0cc1c67ef9c9ad2108b3d97b82555c80. It should be possible to do
something similar in GiST, and in fact you might be able to reuse the
NSN
field that's already set on the page halves on split, instead of adding
a
new "vacuum cycle ID"....
Another note. Assuming we have NSN which can play the role of "vacuum
cycle
ID", can we implement sequential (with possible "jump back") index scan
for
GiST?Yeah, I think it would work. It's pretty straightforward, the page split
code already sets the NSN, just when we need it. Vacuum needs to memorize
the current NSN when it begins, and whenever it sees a page with a higher
NSN (or the FOLLOW_RIGHT flag is set), follow the right-link if it points
to lower-numbered page.
I mean "full index scan" feature for SELECT queries might be implemented as
well as sequential VACUUM.
------
With best regards,
Alexander Korotkov.
On 09/08/2014 03:26 PM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 12:51 PM, Heikki Linnakangas <hlinnakangas@vmware.com
wrote:
On 09/08/2014 11:19 AM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 12:08 PM, Alexander Korotkov <aekorotkov@gmail.com
wrote:
On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <
hlinnakangas@vmware.com> wrote:
In the b-tree code, we solved that problem back in 2006, so it can be
done but requires a bit more code. In b-tree, we solved it with a
"vacuum
cycle ID" number that's set on the page halves when a page is split.
That
allows VACUUM to identify pages that have been split concurrently sees
them, and "jump back" to vacuum them too. See commit
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=
5749f6ef0cc1c67ef9c9ad2108b3d97b82555c80. It should be possible to do
something similar in GiST, and in fact you might be able to reuse the
NSN
field that's already set on the page halves on split, instead of adding
a
new "vacuum cycle ID"....
Another note. Assuming we have NSN which can play the role of "vacuum
cycle
ID", can we implement sequential (with possible "jump back") index scan
for
GiST?Yeah, I think it would work. It's pretty straightforward, the page split
code already sets the NSN, just when we need it. Vacuum needs to memorize
the current NSN when it begins, and whenever it sees a page with a higher
NSN (or the FOLLOW_RIGHT flag is set), follow the right-link if it points
to lower-numbered page.I mean "full index scan" feature for SELECT queries might be implemented as
well as sequential VACUUM.
Oh, sorry, I missed that. If you implement a full-index scan like that,
you might visit some tuples twice, so you'd have to somehow deal with
the duplicates. For a bitmap index scan it would be fine.
- Heikki
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Tue, Sep 9, 2014 at 12:47 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:
On 09/08/2014 03:26 PM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 12:51 PM, Heikki Linnakangas
<hlinnakangas@vmware.comwrote:
On 09/08/2014 11:19 AM, Alexander Korotkov wrote:
On Mon, Sep 8, 2014 at 12:08 PM, Alexander Korotkov
<aekorotkov@gmail.comwrote:
On Mon, Sep 8, 2014 at 11:13 AM, Heikki Linnakangas <
hlinnakangas@vmware.com> wrote:
In the b-tree code, we solved that problem back in 2006, so it can be
done but requires a bit more code. In b-tree, we solved it with a
"vacuum
cycle ID" number that's set on the page halves when a page is split.
That
allows VACUUM to identify pages that have been split concurrently sees
them, and "jump back" to vacuum them too. See commit
http://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=
5749f6ef0cc1c67ef9c9ad2108b3d97b82555c80. It should be possible to do
something similar in GiST, and in fact you might be able to reuse the
NSN
field that's already set on the page halves on split, instead of
adding
a
new "vacuum cycle ID"....
Another note. Assuming we have NSN which can play the role of "vacuum
cycle
ID", can we implement sequential (with possible "jump back") index scan
for
GiST?Yeah, I think it would work. It's pretty straightforward, the page split
code already sets the NSN, just when we need it. Vacuum needs to memorize
the current NSN when it begins, and whenever it sees a page with a higher
NSN (or the FOLLOW_RIGHT flag is set), follow the right-link if it points
to lower-numbered page.I mean "full index scan" feature for SELECT queries might be implemented
as
well as sequential VACUUM.Oh, sorry, I missed that. If you implement a full-index scan like that, you
might visit some tuples twice, so you'd have to somehow deal with the
duplicates. For a bitmap index scan it would be fine.
This patch has been in a "Wait on Author" state for quite a long time
and Heikki has provided comments on it. Switching it to "Returned with
feedback".
--
Michael
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers