PATCH: optimized DROP of multiple tables within a transaction
Hi,
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.
Our system creates a lot of "working tables" (even 100,000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).
This is not likely to improve usual performance, but for systems like
ours, this patch is a significant improvement.
kind regards
Tomas
Attachments:
drop-in-transaction.patchtext/plain; charset=UTF-8; name=drop-in-transaction.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 993bc49..593c360 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -335,6 +335,9 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0, i = 0;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -358,14 +361,25 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ srels = repalloc(srels, sizeof(SMgrRelation) * (nrels+1));
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0) {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+
+ pfree(srels);
+ }
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a3bf9a4..8238f49 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -108,6 +108,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2130,6 +2131,53 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
}
/* ---------------------------------------------------------------------
+ * DropRelFileNodeAllBuffersList
+ *
+ * This function removes from the buffer pool all the pages of all
+ * forks of the specified relations. It's equivalent to calling
+ * DropRelFileNodeBuffers once per fork with firstDelBlock = 0 for
+ * each of the relations.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeAllBuffersList(RelFileNodeBackend * rnodes, int nnodes)
+{
+ int i;
+ int j;
+
+ /* sort the list of rnodes */
+ pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
+
+ /* If it's a local relation, it's localbuf.c's problem. */
+ for (j = 0; j < nnodes; j++) {
+ if (rnodes[j].backend != InvalidBackendId)
+ {
+ if (rnodes[j].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[j].node);
+ }
+ }
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ volatile BufferDesc *bufHdr = &BufferDescriptors[i];
+
+ RelFileNodeBackend * rnode = bsearch((const void *)&(bufHdr->tag.rnode), rnodes,
+ nnodes, sizeof(RelFileNodeBackend),
+ rnode_comparator);
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
+ continue;
+
+ LockBufHdr(bufHdr);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr);
+ }
+}
+
+/* ---------------------------------------------------------------------
* DropDatabaseBuffers
*
* This function removes all the buffers in the buffer cache for a
@@ -2957,3 +3005,30 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/* Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]),
+ * so that it's suitable for bsearch. */
+static int
+rnode_comparator(const void * p1, const void * p2) {
+
+ RelFileNodeBackend n1 = *(RelFileNodeBackend *)p1;
+ RelFileNodeBackend n2 = *(RelFileNodeBackend *)p2;
+
+ if (n1.node.relNode < n2.node.relNode)
+ return -1;
+ else if (n1.node.relNode > n2.node.relNode)
+ return 1;
+
+ if (n1.node.dbNode < n2.node.dbNode)
+ return -1;
+ else if (n1.node.dbNode > n2.node.dbNode)
+ return 1;
+
+ if (n1.node.spcNode < n2.node.spcNode)
+ return -1;
+ else if (n1.node.spcNode > n2.node.spcNode)
+ return 1;
+ else
+ return 0;
+
+}
\ No newline at end of file
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 0cec147..4219958 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -384,6 +384,63 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend rnodes[nrels];
+ ForkNumber forknum;
+
+ for (i = 0; i < nrels; i++) {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffersList(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++) {
+ CacheInvalidateSmgr(rnodes[i]);
+ }
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++) {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..a29441a 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,7 @@ extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffersList(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 3560d53..600ef15 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -81,6 +81,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
On Fri, Aug 24, 2012 at 6:36 PM, Tomas Vondra <tv@fuzzy.cz> wrote:
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers. Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up). This is not likely to improve usual performance, but for systems like
ours, this patch is a significant improvement.
Seems pretty reasonable. But instead of duplicating so much code,
couldn't we find a way to replace DropRelFileNodeAllBuffers with
DropRelFileNodeAllBuffersList? Surely anyone who was planning to call
the first one could instead call the second one with a count of one
and a pointer to the address of the data they were planning to pass.
I'd probably swap the order of arguments to
DropRelFileNodeAllBuffersList as well. We could do something similar
with smgrdounlink/smgrdounlinkall so that, again, only one copy of the
code is needed.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 30 Srpen 2012, 17:53, Robert Haas wrote:
On Fri, Aug 24, 2012 at 6:36 PM, Tomas Vondra <tv@fuzzy.cz> wrote:
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).This is not likely to improve usual performance, but for systems like
ours, this patch is a significant improvement. Seems pretty reasonable. But instead of duplicating so much code,
couldn't we find a way to use replace DropRelFileNodeAllBuffers with
DropRelFileNodeAllBuffersList? Surely anyone who was planning to call
the first one could instead call the second one with a count of one
and a pointer to the address of the data they were planning to pass.
I'd probably swap the order of arguments to
DropRelFileNodeAllBuffersList as well. We could do something similar
with smgrdounlink/smgrdounlinkall so that, again, only one copy of the
code is needed.
Yeah, I was thinking about that too, but I simply wasn't sure which is the
best choice so I've sent the raw patch. OTOH these functions are called on
a very limited number of places, so a refactoring like this seems fine.
Tomas
On Thu, Aug 30, 2012 at 3:17 PM, Tomas Vondra <tv@fuzzy.cz> wrote:
On 30 Srpen 2012, 17:53, Robert Haas wrote:
On Fri, Aug 24, 2012 at 6:36 PM, Tomas Vondra <tv@fuzzy.cz> wrote:
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).This is not likely to improve usual performance, but for systems like
ours, this patch is a significant improvement.Seems pretty reasonable. But instead of duplicating so much code,
couldn't we find a way to use replace DropRelFileNodeAllBuffers with
DropRelFileNodeAllBuffersList? Surely anyone who was planning to call
the first one could instead call the second one with a count of one
and a pointer to the address of the data they were planning to pass.
I'd probably swap the order of arguments to
DropRelFileNodeAllBuffersList as well. We could do something similar
with smgrdounlink/smgrdounlinkall so that, again, only one copy of the
code is needed.Yeah, I was thinking about that too, but I simply wasn't sure which is the
best choice so I've sent the raw patch. OTOH these functions are called on
a very limited number of places, so a refactoring like this seems fine.
If there are enough call sites then DropRelFileNodeAllBuffers could
become a one-line function that simply calls
DropRelFileNodeAllBuffersList(1, &arg). But we should avoid
duplicating the code.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi Tomas,
Sorry to be late.
On Sat, Aug 25, 2012 at 7:36 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.
Here are my review comments.
Submission
==========
The patch is in unified diff format, and can be applied to the head of
master. It doesn't include any test nor document, but it seems ok
because the patch doesn't change visible behavior.
Usability
=========
The patch intends to improve performance of bulk DROP TABLEs which are
in a transaction. Such an improvement would be useful in cases such as 1)
dropping set of partitioned tables as periodic maintenance work, and 2)
dropping lot of work tables. The patch doesn't change any SQL syntax or
built-in objects, but just change internal behavior.
Feature test
============
The patch doesn't provide any visible functionality, and existing
regression tests passed.
Performance test
================
I tested 1000 tables case (each is copy of pgbench_branches with 100000
rows) on 1GB shared_buffers server. Please note that I tested on
MacBook air, i.e. storage is not HDD but SSD. Here is the test procedure:
1) loop 1000 times
1-1) create copy table of pgbench_accounts as accounts$i
1-2) load 100000 rows
1-3) add primary key
1-4) select all rows to cache pages in shared buffer
2) BEGIN
3) loop 1000 times
3-1) DROP TABLE accounts$i
4) COMMIT
The numbers below are average of 5 trials.
---------+----------+-------------
Build | DROP * | COMMIT
---------+----------+-------------
Master | 0.239 ms | 1220.421 ms
Patched | 0.240 ms | 432.209 ms
---------+----------+-------------
* time elapsed for one DROP TABLE statement
IIUC the patch's target is improving COMMIT performance by avoiding
repeated buffer search loop, so this results show that the patch
obtained its goal.
Coding review
=============
I have some comments about coding.
* Some cosmetic changes are necessary.
* Variable j in DropRelFileNodeAllBuffersList seems redundant.
* RelFileNodeBackendIsTemp() macro is provided for checking whether the
relation is local, so using it would be better.
Please see attached patch for changes above.
* As Robert commented, this patch adds DropRelFileNodeAllBuffersList by
copying code from DropRelFileNodeAllBuffers. Please refactor it to
avoid code duplication.
* In smgrDoPendingDeletes, you free srels explicitly. Can't we leave
them to memory context stuff? Even it is required, at least pfree must
be called in the case nrels == 0 too.
* In smgrDoPendingDeletes, the buffer srels is expanded in every
iteration. This seems a bit inefficient. How about doubling the
capacity when used it up? This requires additional variable, but
reduces repalloc call considerably.
* Just an idea, but if we can remove entries for local relations from
rnodes array before buffer loop in DropRelFileNodeAllBuffersList,
following bsearch might be more efficient, though dropping many
temporary tables might be rare.
Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).
Hm, my environment seems very different from yours. Could you show the
setting of shared_buffers in your environment? I'd like to make my test
environment as similar as possible to yours.
This is not likely to improve usual performance, but for systems like
ours, this patch is a significant improvement.
I'll test the performance of bare DROP TABLEs (not surrounded by BEGIN
and COMMIT) tomorrow to confirm that the patch doesn't introduce
performance degradation.
Regards,
--
Shigeru HANADA
Attachments:
drop-in-transaction-v2.patchtext/plain; charset=Shift_JIS; name=drop-in-transaction-v2.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 993bc49..86bca04 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -335,6 +335,10 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -358,14 +362,26 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ srels = repalloc(srels, sizeof(SMgrRelation) * (nrels + 1));
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+
+ pfree(srels);
+ }
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 56095b3..09c6085 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -108,6 +108,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2132,6 +2133,53 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
}
/* ---------------------------------------------------------------------
+ * DropRelFileNodeAllBuffersList
+ *
+ * This function removes from the buffer pool all the pages of all
+ * forks of the specified relations. It's equivalent to calling
+ * DropRelFileNodeBuffers once per fork with firstDelBlock = 0 for
+ * each of the relations.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeAllBuffersList(RelFileNodeBackend * rnodes, int nnodes)
+{
+ int i;
+
+ /* sort the list of rnodes */
+ pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
+
+ /* If it's a local relation, it's localbuf.c's problem. */
+ for (i = 0; i < nnodes; i++)
+ {
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
+ }
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ volatile BufferDesc *bufHdr = &BufferDescriptors[i];
+ RelFileNodeBackend *rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ rnodes,
+ nnodes, sizeof(RelFileNodeBackend),
+ rnode_comparator);
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
+ continue;
+
+ LockBufHdr(bufHdr);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr);
+ }
+}
+
+/* ---------------------------------------------------------------------
* DropDatabaseBuffers
*
* This function removes all the buffers in the buffer cache for a
@@ -2959,3 +3007,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+ RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1;
+ RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2;
+
+ if (n1.node.relNode < n2.node.relNode)
+ return -1;
+ else if (n1.node.relNode > n2.node.relNode)
+ return 1;
+
+ if (n1.node.dbNode < n2.node.dbNode)
+ return -1;
+ else if (n1.node.dbNode > n2.node.dbNode)
+ return 1;
+
+ if (n1.node.spcNode < n2.node.spcNode)
+ return -1;
+ else if (n1.node.spcNode > n2.node.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 0cec147..e0b77aa 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -384,6 +384,64 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend rnodes[nrels];
+ ForkNumber forknum;
+
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffersList(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++) {
+ CacheInvalidateSmgr(rnodes[i]);
+ }
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++) {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..a29441a 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,7 @@ extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffersList(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 3560d53..600ef15 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -81,6 +81,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
Hi,
thanks for the review. I'll look into that in ~2 weeks, once the
pgconf.eu
is over. A few comments in the text below.
Dne 17.10.2012 12:34, Shigeru HANADA napsal:
Performance test
================
I tested 1000 tables case (each is copy of pgbench_branches with
100000
rows) on 1GB shared_buffers server. Please note that I tested on
MacBook air, i.e. storage is not HDD but SSD. Here is the test
procedure:1) loop 1000 times
1-1) create copy table of pgbench_accounts as accounts$i
1-2) load 100000 rows
1-3) add primary key
1-4) select all rows to cache pages in shared buffer
2) BEGIN
3) loop 1000 times
3-1) DROP TABLE accounts$i
4) COMMIT
I don't think the 'load rows' and 'select all rows' is really
necessary.
And AFAIK sequential scans use small circular buffer not to pollute
shared
buffers, so I'd guess the pages are not cached in shared buffers
anyway.
Have you verified that, e.g. by pg_buffercache?
The numbers below are average of 5 trials.
---------+----------+-------------
Build | DROP * | COMMIT
---------+----------+-------------
Master | 0.239 ms | 1220.421 ms
Patched | 0.240 ms | 432.209 ms
---------+----------+-------------
* time elapsed for one DROP TABLE statement. IIUC the patch's target is improving COMMIT performance by avoiding
repeated buffer search loop, so this results show that the patch
obtained its goal. Coding review
=============
I have some comments about coding.* Some cosmetic changes are necessary.
* Variable j in DropRelFileNodeAllBuffersList seems redundant.
* RelFileNodeBackendIsTemp() macro is provided for checking whether
the
relation is local, so using it would be better.Please see attached patch for changes above.
* As Robert commented, this patch adds DropRelFileNodeAllBuffersList
by
copying code from DropRelFileNodeAllBuffers. Please refactor it to
avoid code duplication.
* In smgrDoPendingDeletes, you free srels explicitly. Can't we leave
them to memory context stuff? Even it is required, at least pfree
must
be called in the case nrels == 0 too.
* In smgrDoPendingDeletes, the buffer srels is expanded in every
iteration. This seems a bit inefficient. How about doubling the
capacity when used it up? This requires additional variable, but
reduces repalloc call considerably.
* Just an idea, but if we can remove entries for local relations from
rnodes array before buffer loop in DropRelFileNodeAllBuffersList,
following bsearch might be more efficient, though dropping many
temporary tables might be rare.
Yes, I plan to do all of this.
Our system creates a lot of "working tables" (even 100.000) and we
need
to perform garbage collection (dropping obsolete tables) regularly.
This
often took ~ 1 hour, because we're using big AWS instances with lots
of
RAM (which tends to be slower than RAM on bare hw). After applying
this
patch and dropping tables in groups of 100, the gc runs in less than
4
minutes (i.e. a 15x speed-up).Hm, my environment seems very different from yours. Could you show
the
setting of shared_buffers in your environment? I'd like to make my
test
environment as similar as possible to yours.
We're using m2.4xlarge instances (70G of RAM) with 10GB shared buffers.
Tomas
Hi Tomas,
On 2012/10/17, at 20:45, Tomas Vondra <tv@fuzzy.cz> wrote:
Dne 17.10.2012 12:34, Shigeru HANADA napsal:
Performance test
================
I tested 1000 tables case (each is copy of pgbench_branches with 100000
rows) on 1GB shared_buffers server. Please note that I tested on
MacBook air, i.e. storage is not HDD but SSD. Here is the test procedure:1) loop 1000 times
1-1) create copy table of pgbench_accounts as accounts$i
1-2) load 100000 rows
1-3) add primary key
1-4) select all rows to cache pages in shared buffer
2) BEGIN
3) loop 1000 times
3-1) DROP TABLE accounts$i
4) COMMITI don't think the 'load rows' and 'select all rows' is really necessary.
And AFAIK sequential scans use small circular buffer not to pollute shared
buffers, so I'd guess the pages are not cached in shared buffers anyway.
Have you verified that, e.g. by pg_buffercache?
Oops, you're right. I omitted 1-3 and 1-4 in actual measurement, but IMO loading data is necessary to fill the shared buffer up, because # of buffers which are deleted during COMMIT is major factor of this patch. And, yes, I verified that all shared buffers are used after all loading have been finished.
Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).Hm, my environment seems very different from yours. Could you show the
setting of shared_buffers in your environment? I'd like to make my test
environment as similar as possible to yours.We're using m2.4xlarge instances (70G of RAM) with 10GB shared buffers.
Thank you, it's more huge than I expected. I'm not sure whether my boss allows me to use such rich environment... :(
Here are results of additional measurements on my MBA.
* stats of 1000 bare DROP TABLE statements
90%ile of patched PG is just 2% slower than Master, so it would be acceptable.
| Patched | Master
---------+------------+------------
Average | 1.595 ms | 1.634 ms
Median | 1.791 ms | 1.900 ms
90%ile | 2.517 ms | 2.477 ms
Max | 37.526 ms | 24.553 ms
* Total time to complete 1000 DROP TABLEs and COMMIT
| Patched | Master
-------+---------+---------
Bare | 1595 ms | 1634 ms
In TX | 672 ms | 1459 ms
Regards,
--
Shigeru HANADA
shigeru.hanada@gmail.com
Tomas Vondra wrote:
Hi,
thanks for the review. I'll look into that in ~2 weeks, once the
pgconf.eu
is over.
Excellent. Please submit the updated version to the upcoming commitfest
when you have it. I'm marking this patch Returned with Feedback.
Many thanks to Shigeru Hanada for the review and benchmark.
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Hi,
I've prepared a slightly updated patch, based on the previous review.
See it attached.
On 18.10.2012 04:28, 花田 茂 wrote:> Hi Tomas,
On 2012/10/17, at 20:45, Tomas Vondra <tv@fuzzy.cz> wrote:
Dne 17.10.2012 12:34, Shigeru HANADA napsal:
Performance test
================
I tested 1000 tables case (each is copy of pgbench_branches with
100000 rows) on 1GB shared_buffers server. Please note that I
tested on MacBook air, i.e. storage is not HDD but SSD. Here is
the test procedure:1) loop 1000 times
1-1) create copy table of pgbench_accounts as accounts$i
1-2) load 100000 rows
1-3) add primary key
1-4) select all rows to cache pages in shared buffer
2) BEGIN
3) loop 1000 times
3-1) DROP TABLE accounts$i
4) COMMITI don't think the 'load rows' and 'select all rows' is really
necessary. And AFAIK sequential scans use small circular buffer
not to pollute sharedbuffers, so I'd guess the pages are not cached
in shared buffers anyway. Have you verified that, e.g. by
pg_buffercache?Oops, you're right. I omitted 1-3 and 1-4 in actual measurement, but
IMO loading data is necessary to fill the shared buffer up, because
# of buffers which are deleted during COMMIT is major factor of this
patch. And, yes, I verified that all shared buffers are used after
all loading have been finished.
I don't think it's an important factor, as the original code does this:
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
...
}
in the DropRelFileNodeAllBuffers. That loops through all shared buffers
no matter what, so IMHO the performance in this case depends on the
total size of the shared buffers. But maybe I'm missing something important.
Our system creates a lot of "working tables" (even 100.000)
and we need to perform garbage collection (dropping obsolete
tables) regularly. Thisoften took ~ 1 hour, because we're using
big AWS instances with lots of RAM (which tends to be slower
than RAM on bare hw). After applying this patch and dropping
tables in groups of 100, the gc runs in less than 4 minutes
(i.e. a 15x speed-up).Hm, my environment seems very different from yours. Could you
show the setting of shared_buffers in your environment? I'd like
to make my test environment as similar as possible to yours.We're using m2.4xlarge instances (70G of RAM) with 10GB shared
buffers.Thank you, it's more huge than I expected. I'm not sure whether my
boss allows me to use such rich environment... :(
I've done a simple benchmark on my laptop with 2GB shared buffers, it's
attached in the drop-test.py (it's a bit messy, but it works).
It does four things:
1) creates N tables
2) drops them one by one
3) creates N tables
4) drops them in batches (batch = one transaction)
To run the script, simply specify number of tables you want to work with
(say 10000), size of the batch (e.g. 100) and connection string (e.g.
'host=localhost dbname=mydb').
With those parameters, I got these numbers on the laptop:
creating 10000 tables
all tables created in 3.298694 seconds
dropping 10000 tables one by one ...
all tables dropped in 32.692478 seconds
creating 10000 tables
all tables created in 3.458178 seconds
dropping 10000 tables in batches by 100...
all tables dropped in 3.28268 seconds
So it's 33 seconds vs. 3.3 seconds, i.e. 10x speedup. On AWS we usually
get larger differences, as we use larger shared buffers and the memory
is significantly slower there.
Regarding the other comments:
* As Robert commented, this patch adds DropRelFileNodeAllBuffersList
by copying code from DropRelFileNodeAllBuffers. Please refactor it
to avoid code duplication.
Yes, I've merged the two functions, i.e. I've removed the original
DropRelFileNodeAllBuffers and used the name for the new one (taking
array instead of single relnode). Then I've modified the existing calls
to to use
DropRelFileNodeAllBuffers(&relnode, 1)
instead of the original one
DropRelFileNodeAllBuffers(relnode)
Maybe this should be done for smgrdounlink/smgrdounlinkall too.
* In smgrDoPendingDeletes, you free srels explicitly. Can't we leave
them to memory context stuff? Even it is required, at least pfree
must be called in the case nrels == 0 too.
Hmmm, right. Not sure which choice is better, so for now I've moved the
pfree out of the 'if' block, so it's executed for 'nrels==0' too.
* In smgrDoPendingDeletes, the buffer srels is expanded in every
iteration. This seems a bit inefficient. How about doubling the
capacity when used it up? This requires additional variable, but
reduces repalloc call considerably.
Done, although I haven't seen any significant speed improvement.
* Just an idea, but if we can remove entries for local relations from
rnodes array before buffer loop in DropRelFileNodeAllBuffersList,
following bsearch might be more efficient, though dropping many
temporary tables might be rare.
My reasoning, exactly. But maybe it should be done to keep the code
clean, i.e. not letting temp tables to code paths where they are not
expected.
Tomas
Attachments:
drop-in-transaction-v3.patchtext/plain; charset=UTF-8; name=drop-in-transaction-v3.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 993bc49..fcd9387 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -335,6 +335,11 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 1;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -358,14 +363,31 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels) {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index bdcbe47..aa934a8 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -108,6 +108,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,31 +2095,37 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes)
{
- int i;
+ int i;
+
+ /* sort the list of rnodes */
+ pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
- return;
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
}
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
+ RelFileNodeBackend *rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ rnodes,
+ nnodes, sizeof(RelFileNodeBackend),
+ rnode_comparator);
- /*
- * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
- * and saves some cycles.
- */
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
@@ -2953,3 +2960,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+ RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1;
+ RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2;
+
+ if (n1.node.relNode < n2.node.relNode)
+ return -1;
+ else if (n1.node.relNode > n2.node.relNode)
+ return 1;
+
+ if (n1.node.dbNode < n2.node.dbNode)
+ return -1;
+ else if (n1.node.dbNode > n2.node.dbNode)
+ return 1;
+
+ if (n1.node.spcNode < n2.node.spcNode)
+ return -1;
+ else if (n1.node.spcNode > n2.node.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..40a9d0b 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,64 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend rnodes[nrels];
+ ForkNumber forknum;
+
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++) {
+ CacheInvalidateSmgr(rnodes[i]);
+ }
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++) {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
On Mon, Nov 12, 2012 at 4:36 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
Hi,
I've prepared a slightly updated patch, based on the previous review.
See it attached.
All changes in v3 patch seem good, however I found some places which requires
cosmetic changes. Please see attached v3.1 patch for those changes.
Oops, you're right. I omitted 1-3 and 1-4 in actual measurement, but
IMO loading data is necessary to fill the shared buffer up, because
# of buffers which are deleted during COMMIT is major factor of this
patch. And, yes, I verified that all shared buffers are used after
all loading has been finished. I don't think it's an important factor, as the original code does this:
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
...
} in the DropRelFileNodeAllBuffers. That loops through all shared buffers
no matter what, so IMHO the performance in this case depends on the
total size of the shared buffers. But maybe I'm missing something important.
I worry the effect of "continue" in the loop to the performance. If most of
shared buffers are not used at the moment of DROP, the load of DROP doesn't
contain the overhead of LockBufHdr and subsequent few lines.
Do you expect such situation in your use cases?
I've done a simple benchmark on my laptop with 2GB shared buffers, it's
attached in the drop-test.py (it's a bit messy, but it works).
[snip]
With those parameters, I got these numbers on the laptop:
creating 10000 tables
all tables created in 3.298694 seconds
dropping 10000 tables one by one ...
all tables dropped in 32.692478 seconds
creating 10000 tables
all tables created in 3.458178 seconds
dropping 10000 tables in batches by 100...
all tables dropped in 3.28268 seconds
So it's 33 seconds vs. 3.3 seconds, i.e. 10x speedup. On AWS we usually
get larger differences, as we use larger shared buffers and the memory
is significantly slower there.
Do you have performance numbers of same test on not-patched PG?
This patch aims to improve performance of bulk DROP, so 4th number in
the result above should be compared to 4th number of not-patched PG?
--
Shigeru HANADA
Attachments:
drop-in-transaction-v3.1.patchapplication/octet-stream; name=drop-in-transaction-v3.1.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 2446282..9e92230 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,10 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+ SMgrRelation *srels = (SMgrRelation *) palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 1;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +339,33 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels)
+ {
+ maxrels *= 2;
+ srels = (SMgrRelation *)
+ repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..85d7a67 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -107,7 +107,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
bool *foundPtr);
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
-
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,31 +2094,37 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
- int i;
+ int i;
+
+ /* sort the list of rnodes */
+ pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
- return;
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
}
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
+ RelFileNodeBackend *rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ rnodes,
+ nnodes, sizeof(RelFileNodeBackend),
+ rnode_comparator);
- /*
- * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
- * and saves some cycles.
- */
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
@@ -2953,3 +2959,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+ RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1;
+ RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2;
+
+ if (n1.node.relNode < n2.node.relNode)
+ return -1;
+ else if (n1.node.relNode > n2.node.relNode)
+ return 1;
+
+ if (n1.node.dbNode < n2.node.dbNode)
+ return -1;
+ else if (n1.node.dbNode > n2.node.dbNode)
+ return 1;
+
+ if (n1.node.spcNode < n2.node.spcNode)
+ return -1;
+ else if (n1.node.spcNode > n2.node.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..40a9d0b 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,64 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend rnodes[nrels];
+ ForkNumber forknum;
+
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++) {
+ CacheInvalidateSmgr(rnodes[i]);
+ }
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++) {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
On 6.12.2012 05:47, Shigeru Hanada wrote:
On Mon, Nov 12, 2012 at 4:36 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
Hi,
I've prepared a slightly updated patch, based on the previous review.
See it attached. All changes in the v3 patch seem good; however, I found some places which require
cosmetic changes. Please see attached v3.1 patch for those changes.
OK, thanks!
Oops, you're right. I omitted 1-3 and 1-4 in actual measurement, but
IMO loading data is necessary to fill the shared buffer up, because
# of buffers which are deleted during COMMIT is major factor of this
patch. And, yes, I verified that all shared buffers are used after
all loading has been finished. I don't think it's an important factor, as the original code does this:
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
...
}in the DropRelFileNodeAllBuffers. That loops through all shared buffers
no matter what, so IMHO the performance in this case depends on the
total size of the shared buffers. But maybe I'm missing something important. I worry about the effect of "continue" in the loop on the performance. If most of
shared buffers are not used at the moment of DROP, the load of DROP doesn't
contain the overhead of LockBufHdr and subsequent few lines.
Do you expect such situation in your use cases?
I still fail to see the issue here - can you give an example of when
this would be a problem?
The code was like this before the patch, I only replaced the simple
comparison to a binary search ... Or do you think that this check means
"if the buffer is empty"?
/* buffer does not belong to any of the relations */
if (rnode == NULL)
continue;
Because it does not - notice that the 'rnode' is a result of the binary
search, not a direct check of the buffer.
So the following few lines (lock and recheck) will be skipped either for
unused buffers and buffers belonging to relations that are not being
dropped.
Maybe we could do a simple 'is the buffer empty' check before the
bsearch call, but I don't expect that to happen very often in real world
(the cache tends to be used).
I've done a simple benchmark on my laptop with 2GB shared buffers, it's
attached in the drop-test.py (it's a bit messy, but it works).[snip]
With those parameters, I got these numbers on the laptop:
creating 10000 tables
all tables created in 3.298694 seconds
dropping 10000 tables one by one ...
all tables dropped in 32.692478 seconds
creating 10000 tables
all tables created in 3.458178 seconds
dropping 10000 tables in batches by 100...
all tables dropped in 3.28268 seconds
So it's 33 seconds vs. 3.3 seconds, i.e. 10x speedup. On AWS we usually
get larger differences, as we use larger shared buffers and the memory
is significantly slower there.
Do you have performance numbers of the same test on not-patched PG?
This patch aims to improve performance of bulk DROP, so 4th number in
the result above should be compared to 4th number of not-patched PG?
I've re-run the tests with the current patch on my home workstation, and
the results are these (again 10k tables, dropped either one-by-one or in
batches of 100).
1) unpatched
dropping one-by-one: 15.5 seconds
dropping in batches of 100: 12.3 sec
2) patched (v3.1)
dropping one-by-one: 32.8 seconds
dropping in batches of 100: 3.0 sec
The problem here is that when dropping the tables one-by-one, the
bsearch overhead is tremendous and significantly increases the runtime.
I've done a simple check (if dropping a single table, use the original
simple comparison) and I got this:
3) patched (v3.2)
dropping one-by-one: 16.0 seconds
dropping in batches of 100: 3.3 sec
i.e. the best of both. But it seems like an unnecessary complexity to me
- if you need to drop a lot of tables you'll probably do that in a
transaction anyway.
Tomas
Attachments:
drop-in-transaction-v3.2.patchtext/plain; charset=UTF-8; name=drop-in-transaction-v3.2.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 2446282..9e92230 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,10 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+ SMgrRelation *srels = (SMgrRelation *) palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 1;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +339,33 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels)
+ {
+ maxrels *= 2;
+ srels = (SMgrRelation *)
+ repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..716d795 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -107,7 +107,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
bool *foundPtr);
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
-
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,31 +2094,52 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
- int i;
+ int i;
+
+ /* sort the list of rnodes */
+ pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
- return;
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
}
for (i = 0; i < NBuffers; i++)
{
+
+ RelFileNodeBackend *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
-
- /*
- * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
- * and saves some cycles.
- */
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
- continue;
+
+ /* don't use the bsearch for a single-table drop */
+ if (nnodes == 1)
+ {
+ if (!RelFileNodeEquals(bufHdr->tag.rnode, rnodes->node))
+ continue;
+
+ rnode = rnodes;
+
+ } else {
+
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ rnodes,
+ nnodes, sizeof(RelFileNodeBackend),
+ rnode_comparator);
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
+ continue;
+
+ }
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
@@ -2953,3 +2974,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+ RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1;
+ RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2;
+
+ if (n1.node.relNode < n2.node.relNode)
+ return -1;
+ else if (n1.node.relNode > n2.node.relNode)
+ return 1;
+
+ if (n1.node.dbNode < n2.node.dbNode)
+ return -1;
+ else if (n1.node.dbNode > n2.node.dbNode)
+ return 1;
+
+ if (n1.node.spcNode < n2.node.spcNode)
+ return -1;
+ else if (n1.node.spcNode > n2.node.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..40a9d0b 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,64 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend rnodes[nrels];
+ ForkNumber forknum;
+
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++) {
+ CacheInvalidateSmgr(rnodes[i]);
+ }
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++) {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
On 2012-12-06 23:38:59 +0100, Tomas Vondra wrote:
On 6.12.2012 05:47, Shigeru Hanada wrote:
I've done a simple benchmark on my laptop with 2GB shared buffers, it's
attached in the drop-test.py (it's a bit messy, but it works).[snip]
With those parameters, I got these numbers on the laptop:
creating 10000 tables
all tables created in 3.298694 seconds
dropping 10000 tables one by one ...
all tables dropped in 32.692478 seconds
creating 10000 tables
all tables created in 3.458178 seconds
dropping 10000 tables in batches by 100...
all tables dropped in 3.28268 seconds

So it's 33 seconds vs. 3.3 seconds, i.e. 10x speedup. On AWS we usually
get larger differences, as we use larger shared buffers and the memory
is significantly slower there.Do you have performance numbers of same test on not-patched PG?
This patch aims to improve performance of bulk DROP, so 4th number in
the result above should be compared to 4th number of not-patched PG?I've re-run the tests with the current patch on my home workstation, and
the results are these (again 10k tables, dropped either one-by-one or in
batches of 100).1) unpatched
dropping one-by-one: 15.5 seconds
dropping in batches of 100: 12.3 sec2) patched (v3.1)
dropping one-by-one: 32.8 seconds
dropping in batches of 100: 3.0 secThe problem here is that when dropping the tables one-by-one, the
bsearch overhead is tremendous and significantly increases the runtime.
I've done a simple check (if dropping a single table, use the original
simple comparison) and I got this:3) patched (v3.2)
dropping one-by-one: 16.0 seconds
dropping in batches of 100: 3.3 seci.e. the best of both. But it seems like an unnecessary complexity to me
- if you need to drop a lot of tables you'll probably do that in a
transaction anyway.
Imo that's still a pretty bad performance difference. And your
single-table optimization will probably fall short as soon as the table
has indexes and/or a toast table...
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+	RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1;
+	RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2;
+
+	if (n1.node.relNode < n2.node.relNode)
+		return -1;
+	else if (n1.node.relNode > n2.node.relNode)
+		return 1;
+
+	if (n1.node.dbNode < n2.node.dbNode)
+		return -1;
+	else if (n1.node.dbNode > n2.node.dbNode)
+		return 1;
+
+	if (n1.node.spcNode < n2.node.spcNode)
+		return -1;
+	else if (n1.node.spcNode > n2.node.spcNode)
+		return 1;
+	else
+		return 0;
+}
ISTM that this whole comparator could be replaced by memcmp(). That
could quite possibly lower the overhead of the bsearch() in simple
cases. We already rely on the fact that RelFileNode's have no padding
atm (c.f. buf_internal.h), so a memcmp() seems fine to me.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 8.12.2012 15:26, Andres Freund wrote:
On 2012-12-06 23:38:59 +0100, Tomas Vondra wrote:
I've re-run the tests with the current patch on my home workstation, and
the results are these (again 10k tables, dropped either one-by-one or in
batches of 100).1) unpatched
dropping one-by-one: 15.5 seconds
dropping in batches of 100: 12.3 sec2) patched (v3.1)
dropping one-by-one: 32.8 seconds
dropping in batches of 100: 3.0 secThe problem here is that when dropping the tables one-by-one, the
bsearch overhead is tremendous and significantly increases the runtime.
I've done a simple check (if dropping a single table, use the original
simple comparison) and I got this:3) patched (v3.2)
dropping one-by-one: 16.0 seconds
dropping in batches of 100: 3.3 seci.e. the best of both. But it seems like an unnecessary complexity to me
- if you need to drop a lot of tables you'll probably do that in a
transaction anyway.Imo that's still a pretty bad performance difference. And your
single-table optimization will probably fall short as soon as the table
has indexes and/or a toast table...
Why? I haven't checked the code but either those objects are dropped
one-by-one (which seems unlikely) or they're added to the pending list
and then the new code will kick in, making it actually faster.
Yes, the original code might be faster even for 2 or 3 objects, but I
can't think of a simple solution to fix this, given that it really
depends on CPU/RAM speed and shared_buffers size.
+ +/* + * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so + * that it's suitable for bsearch. + */ +static int +rnode_comparator(const void * p1, const void * p2) +{ + RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1; + RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2; + + if (n1.node.relNode < n2.node.relNode) + return -1; + else if (n1.node.relNode > n2.node.relNode) + return 1; + + if (n1.node.dbNode < n2.node.dbNode) + return -1; + else if (n1.node.dbNode > n2.node.dbNode) + return 1; + + if (n1.node.spcNode < n2.node.spcNode) + return -1; + else if (n1.node.spcNode > n2.node.spcNode) + return 1; + else + return 0; +}ISTM that this whole comparator could be replaced by memcmp(). That
could quite possibly lower the overhead of the bsearch() in simple
cases. We already rely on the fact that RelFileNode's have no padding
atm (c.f. buf_internal.h), so a memcmp() seems fine to me.
Good point, I'll try that. The original code used a macro that simply
compared the fields and I copied that logic, but if we can remove the
code ...
Thanks for the review!
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 8.12.2012 15:49, Tomas Vondra wrote:
On 8.12.2012 15:26, Andres Freund wrote:
On 2012-12-06 23:38:59 +0100, Tomas Vondra wrote:
I've re-run the tests with the current patch on my home workstation, and
the results are these (again 10k tables, dropped either one-by-one or in
batches of 100).1) unpatched
dropping one-by-one: 15.5 seconds
dropping in batches of 100: 12.3 sec2) patched (v3.1)
dropping one-by-one: 32.8 seconds
dropping in batches of 100: 3.0 secThe problem here is that when dropping the tables one-by-one, the
bsearch overhead is tremendous and significantly increases the runtime.
I've done a simple check (if dropping a single table, use the original
simple comparison) and I got this:3) patched (v3.2)
dropping one-by-one: 16.0 seconds
dropping in batches of 100: 3.3 seci.e. the best of both. But it seems like an unnecessary complexity to me
- if you need to drop a lot of tables you'll probably do that in a
transaction anyway.Imo that's still a pretty bad performance difference. And your
single-table optimization will probably fall short as soon as the table
has indexes and/or a toast table...Why? I haven't checked the code but either those objects are droppped
one-by-one (which seems unlikely) or they're added to the pending list
and then the new code will kick in, making it actually faster.Yes, the original code might be faster even for 2 or 3 objects, but I
can't think of a simple solution to fix this, given that it really
depends on CPU/RAM speed and shared_buffers size.
I've done some test and yes - once there are other objects the
optimization falls short. For example for tables with one index, it
looks like this:
1) unpatched
one by one: 28.9 s
100 batches: 23.9 s
2) patched
one by one: 44.1 s
100 batches: 4.7 s
So the patched code is by about 50% slower, but this difference quickly
disappears with the number of indexes / toast tables etc.
I see this as an argument AGAINST such special-case optimization. My
reasoning is this:
* This difference is significant only if you're dropping a table with
low number of indexes / toast tables. In reality this is not going to
be very frequent.
* If you're dropping a single table, it really does not matter - the
difference will be like 100 ms vs. 200 ms or something like that.
* If you're dropping a lot of tables, you should do than in a
transaction anyway, or you should be aware that doing that in a
transaction will be faster (we can add this info into DROP TABLE
docs).
IMHO this is similar to doing a lot of INSERTs - doing all of them in a
single transaction is faster. The possibility that someone needs to drop
a lot of tables in separate transactions exists in theory, but I don't
know of a realistic real-world usage.
So I'd vote to ditch that special case optimization.
+ +/* + * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so + * that it's suitable for bsearch. + */ +static int +rnode_comparator(const void * p1, const void * p2) +{ + RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1; + RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2; + + if (n1.node.relNode < n2.node.relNode) + return -1; + else if (n1.node.relNode > n2.node.relNode) + return 1; + + if (n1.node.dbNode < n2.node.dbNode) + return -1; + else if (n1.node.dbNode > n2.node.dbNode) + return 1; + + if (n1.node.spcNode < n2.node.spcNode) + return -1; + else if (n1.node.spcNode > n2.node.spcNode) + return 1; + else + return 0; +}ISTM that this whole comparator could be replaced by memcmp(). That
could quite possibly lower the overhead of the bsearch() in simple
cases. We already rely on the fact that RelFileNode's have no padding
atm (c.f. buf_internal.h), so a memcmp() seems fine to me.Good point, I'll try that. The original code used a macro that simply
compared the fields and I copied that logic, but if we can remove the
code ...
Hmmm, I've replaced the comparator with this one:
static int
rnode_comparator(const void * p1, const void * p2)
{
return memcmp(p1, p2, sizeof(RelFileNode));
}
and while it's not significantly faster (actually it's often a bit
slower than the original comparator), it's a much simpler code.
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sun, Dec 9, 2012 at 1:07 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
* If you're dropping a single table, it really does not matter - the
difference will be like 100 ms vs. 200 ms or something like that.
Did you try dropping 10,000 tables in 100 batches, as same as previous post?
If so the overhead introduced by the patch is only 1.52ms per table.
It seems acceptable to me.
So I'd vote to ditch that special case optimization.
+1. It seems very difficult to determine reasonable threshold of such
kind of optimization. As described above, the overhead seems small
enough, so using bsearch in any case seems fine.
Besides, v3.2 patch needs some more fixes, for minor issues.
* Opening curly bracket of if/for/while/etc. should be in the next
line, like this:
if (foo == bar) /* { <= not here */
{
...
}
* Use hard-tab instead of white-spaces to indent variable name in declarations.
For these two issues, please see the page below:
http://www.postgresql.org/docs/9.2/static/source-format.html
* PostgreSQL allows only C89 features, so we can't use variable length arrays.
* Contains unintentional blank lines?
Please see attached patch for details of fixes above.
--
Shigeru HANADA
Attachments:
drop-in-tx.diffapplication/octet-stream; name=drop-in-tx.diffDownload
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 716d795..3dc7dae 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2113,7 +2113,6 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
for (i = 0; i < NBuffers; i++)
{
-
RelFileNodeBackend *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
@@ -2125,8 +2124,9 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
rnode = rnodes;
- } else {
-
+ }
+ else
+ {
rnode = bsearch((const void *) &(bufHdr->tag.rnode),
rnodes,
nnodes, sizeof(RelFileNodeBackend),
@@ -2135,7 +2135,6 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
/* buffer does not belong to any of the relations */
if (rnode == NULL)
continue;
-
}
LockBufHdr(bufHdr);
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 40a9d0b..fb9f8ab 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -422,9 +422,11 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
{
int i = 0;
- RelFileNodeBackend rnodes[nrels];
+ RelFileNodeBackend *rnodes;
ForkNumber forknum;
+ /* initialize an array which contains all relations to be dropped */
+ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
for (i = 0; i < nrels; i++)
{
RelFileNodeBackend rnode = rels[i]->smgr_rnode;
@@ -458,7 +460,8 @@ void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
* this backend, too, and thereby provide a backstop that we closed our
* own smgr rel.
*/
- for (i = 0; i < nrels; i++) {
+ for (i = 0; i < nrels; i++)
+ {
CacheInvalidateSmgr(rnodes[i]);
}
@@ -470,11 +473,14 @@ void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
* xact.
*/
- for (i = 0; i < nrels; i++) {
- int which = rels[i]->smgr_which;
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
(*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
}
+
+ pfree(rnodes);
}
/*
On 2012-12-08 17:07:38 +0100, Tomas Vondra wrote:
On 8.12.2012 15:49, Tomas Vondra wrote:
On 8.12.2012 15:26, Andres Freund wrote:
On 2012-12-06 23:38:59 +0100, Tomas Vondra wrote:
I've re-run the tests with the current patch on my home workstation, and
the results are these (again 10k tables, dropped either one-by-one or in
batches of 100).1) unpatched
dropping one-by-one: 15.5 seconds
dropping in batches of 100: 12.3 sec2) patched (v3.1)
dropping one-by-one: 32.8 seconds
dropping in batches of 100: 3.0 secThe problem here is that when dropping the tables one-by-one, the
bsearch overhead is tremendous and significantly increases the runtime.
I've done a simple check (if dropping a single table, use the original
simple comparison) and I got this:3) patched (v3.2)
dropping one-by-one: 16.0 seconds
dropping in batches of 100: 3.3 seci.e. the best of both. But it seems like an unnecessary complexity to me
- if you need to drop a lot of tables you'll probably do that in a
transaction anyway.Imo that's still a pretty bad performance difference. And your
single-table optimization will probably fall short as soon as the table
has indexes and/or a toast table...Why? I haven't checked the code but either those objects are droppped
one-by-one (which seems unlikely) or they're added to the pending list
and then the new code will kick in, making it actually faster.Yes, the original code might be faster even for 2 or 3 objects, but I
can't think of a simple solution to fix this, given that it really
depends on CPU/RAM speed and shared_buffers size.I've done some test and yes - once there are other objects the
optimization falls short. For example for tables with one index, it
looks like this:1) unpatched
one by one: 28.9 s
100 batches: 23.9 s2) patched
one by one: 44.1 s
100 batches: 4.7 sSo the patched code is by about 50% slower, but this difference quickly
disappears with the number of indexes / toast tables etc.I see this as an argument AGAINST such special-case optimization. My
reasoning is this:* This difference is significant only if you're dropping a table with
low number of indexes / toast tables. In reality this is not going to
be very frequent.* If you're dropping a single table, it really does not matter - the
difference will be like 100 ms vs. 200 ms or something like that.
I don't particularly buy that argument. There are good reasons (like
avoiding deadlocks, long transactions) to drop multiple tables
in individual transactions.
Not that I have a good plan to how to work around that though :(
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Dne 10.12.2012 16:38, Andres Freund napsal:
On 2012-12-08 17:07:38 +0100, Tomas Vondra wrote:
I've done some test and yes - once there are other objects the
optimization falls short. For example for tables with one index, it
looks like this:1) unpatched
one by one: 28.9 s
100 batches: 23.9 s2) patched
one by one: 44.1 s
100 batches: 4.7 sSo the patched code is by about 50% slower, but this difference
quickly
disappears with the number of indexes / toast tables etc.I see this as an argument AGAINST such special-case optimization. My
reasoning is this:* This difference is significant only if you're dropping a table
with
low number of indexes / toast tables. In reality this is not going
to
be very frequent.* If you're dropping a single table, it really does not matter - the
difference will be like 100 ms vs. 200 ms or something like that.I don't particularly buy that argument. There are good reasons (like
avoiding deadlocks, long transactions) to drop multiple tables
in individual transactions.
Not that I have a good plan to how to work around that though :(
Yeah, if you need to drop the tables one by one for some reason, you
can't get rid of the overhead this way :-(
OTOH in the example above the overhead is ~50%, i.e. 1.5ms / table with
a
single index. Each such associated relation (index, TOAST table, ...)
means
a relation that needs to be dropped and on my machine, once I reach ~5
relations there's almost no difference as the overhead is balanced by
the
gains.
Not sure how to fix that in an elegant way, though :-(
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi,
I've updated the patch to include the optimization described in the
previous post, i.e. if the number of relations is below a certain
threshold, use a simple for loop, for large numbers of relations use
bsearch calls.
This is done by a new constant BSEARCH_LIMIT, which is set to 10 in the
patch. Then I've modified the 'drop-test' script to take yet another
argument - number of indexes for each table. So for example this
./drop-test.py 10000 100 3 'dbname=test'
means "create 10000 tables, 3 indexes for each of them, and then drop
them in batches of 100 tables."
Then I've run the test with 0, 1, 2, ... 11 indexes for each table for
(a) unpatched HEAD
(b) patch v3.1 (without the optimization)
(c) patch v3.3 (with BSEARCH_LIMIT=10)
and I got these results:
1) dropping one-by-one
----------------------
This is the really interesting part - the issue with the v3.1 is that
for a single table, it's ~2x slower than unpatched PostgreSQL.
0 1 2 3 4 5 6 7 8 9 10 11
--------------------------------------------------------------
unpatched 16 28 40 52 63 75 87 99 110 121 135 147
v3.1 33 43 46 56 58 60 63 72 75 76 79 80
v3.3 16 20 23 25 29 33 36 40 44 47 79 82
The values are durations in seconds, rounded to integer values. I've run
the test repeatedly and there's very small variance in the numbers.
The v3.3 improves that and it's actually even faster than unpatched
PostgreSQL. How can this happen? I believe it's because the code is
rewritten from
for each relation (r) in the drop list
DropRelFileNodeAllBuffers (r)
for each shared buffer
check and invalidate
to
copy the relations from drop list to array (a)
DropRelFileNodeAllBuffers(a)
for each shared buffer
for each relation (r) in the array
check and invalidate
At least that's the only explanation I was able to come up with.
Yet another interesting observation is that even the v3.1 is about as
fast as the unpatched code once there are 3 or more indexes (or TOAST
tables).
So my opinion is that the optimized patch works as expected, and that
even without the optimization the performance would be acceptable for
most real-world cases.
2) dropping in transaction
--------------------------
This is mostly to verify that the code did not break anything, because
the optimization should not kick-in in this case at all. And that seems
to be the case:
0 1 2 3 4 5 6 7 8 9 10 11
--------------------------------------------------------------
unpatched 13 24 35 46 58 69 81 92 103 115 127 139
v3.1 3 5 7 8 10 12 14 15 16 18 20 21
v3.3 3 4 6 7 8 10 11 13 14 15 18 20
The differences between v3.1 and v3.3 are mostly due to rounding etc.
Attached is the v3.3 patch and the testing script I've been using for
the tests above. Feel free to run the tests on your hardware, with your
hardware, shared buffers size etc. I've run that on a 4-core i5-2500 CPU
with 2GB shared buffers.
Tomas
Attachments:
drop-in-transaction-v3.3.patchtext/plain; charset=UTF-8; name=drop-in-transaction-v3.3.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 8dc622a..8c12a43 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,11 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 1;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +340,31 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels) {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..7a968f8 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -62,6 +62,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
+#define BSEARCH_LIMIT 10
/* GUC variables */
bool zero_damaged_pages = false;
@@ -108,6 +109,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,31 +2096,64 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes)
{
- int i;
+ int i, j;
+
+ /* sort the list of rnodes */
+ pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
- return;
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
}
for (i = 0; i < NBuffers; i++)
{
+ RelFileNodeBackend *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
-
+
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+
+ /*
+ * For low number of relations to drop just use a simple walk through,
+ * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess
+ * than a exactly determined value, as it depends on many factors (CPU
+ * and RAM speeds, amount of shared buffers etc.).
+ */
+ if (nnodes <= BSEARCH_LIMIT)
+ {
+ for (j = 0; j < nnodes; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnodes[j].node))
+ {
+ rnode = &rnodes[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ rnodes,
+ nnodes, sizeof(RelFileNodeBackend),
+ rnode_comparator);
+ }
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
@@ -2953,3 +2988,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+ RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1;
+ RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2;
+
+ if (n1.node.relNode < n2.node.relNode)
+ return -1;
+ else if (n1.node.relNode > n2.node.relNode)
+ return 1;
+
+ if (n1.node.dbNode < n2.node.dbNode)
+ return -1;
+ else if (n1.node.dbNode > n2.node.dbNode)
+ return 1;
+
+ if (n1.node.spcNode < n2.node.spcNode)
+ return -1;
+ else if (n1.node.spcNode > n2.node.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..bbce945 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,68 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend *rnodes;
+ ForkNumber forknum;
+
+ /* initialize an array which contains all relations to be dropped */
+ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++)
+ CacheInvalidateSmgr(rnodes[i]);
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+
+ pfree(rnodes);
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
On 2012-12-17 00:31:00 +0100, Tomas Vondra wrote:
I've updated the patch to include the optimization described in the
previous post, i.e. if the number of relations is below a certain
threshold, use a simple for loop, for large numbers of relations use
bsearch calls. This is done by a new constant BSEARCH_LIMIT, which is set to 10 in the
patch. Then I've modified the 'drop-test' script to take yet another
argument - number of indexes for each table. So for example this: ./drop-test.py 10000 100 3 'dbname=test'
means "create 10000 tables, 3 indexes for each of them, and then drop
them in batches of 100 tables." Then I've run the test with 0, 1, 2, ... 11 indexes for each table for
(a) unpatched HEAD
(b) patch v3.1 (without the optimization)
(c) patch v3.3 (with BSEARCH_LIMIT=10) and I got these results:
Nice work!
1) dropping one-by-one
----------------------
This is the really interesting part - the issue with the v3.1 is that
for a single table, it's ~2x slower than unpatched PostgreSQL.
0 1 2 3 4 5 6 7 8 9 10 11
--------------------------------------------------------------
unpatched 16 28 40 52 63 75 87 99 110 121 135 147
v3.1 33 43 46 56 58 60 63 72 75 76 79 80
v3.3 16 20 23 25 29 33 36 40 44 47 79 82
The values are durations in seconds, rounded to integer values. I've run
the test repeatedly and there's very small variance in the numbers. The v3.3 improves that and it's actually even faster than unpatched
PostgreSQL. How can this happen? I believe it's because the code is
rewritten from:
for each relation (r) in the drop list
DropRelFileNodeAllBuffers (r)
for each shared buffer
check and invalidate
to:
copy the relations from drop list to array (a)
DropRelFileNodeAllBuffers(a)
for each shared buffer
for each relation (r) in the array
check and invalidate
At least that's the only explanation I was able to come up with.
That seems like a rather sensible explanation. The earlier algorithm
iterated over shared buffers multiple times which is the expensive thing
here, the new version doesn't so its sensible that its faster as long as
the comparison method for checking whether a buffer belongs to relation
is suitably fast.
Yet another interesting observation is that even the v3.1 is about as
fast as the unpatched code once there are 3 or more indexes (or TOAST
tables). So my opinion is that the optimized patch works as expected, and that
even without the optimization the performance would be acceptable for
most real-world cases.
I think except of the temp buffer issue mentioned below its ready.
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode) +DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes) { - int i; + int i, j; + + /* sort the list of rnodes */ + pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);/* If it's a local relation, it's localbuf.c's problem. */ - if (RelFileNodeBackendIsTemp(rnode)) + for (i = 0; i < nnodes; i++) { - if (rnode.backend == MyBackendId) - DropRelFileNodeAllLocalBuffers(rnode.node); - return; + if (RelFileNodeBackendIsTemp(rnodes[i])) + { + if (rnodes[i].backend == MyBackendId) + DropRelFileNodeAllLocalBuffers(rnodes[i].node); + } }
While you deal with local buffers here you don't anymore in the big loop
over shared buffers. That wasn't needed earlier since we just returned
after noticing we have local relation, but thats not the case anymore.
for (i = 0; i < NBuffers; i++) { + RelFileNodeBackend *rnode = NULL; volatile BufferDesc *bufHdr = &BufferDescriptors[i]; - + /* * As in DropRelFileNodeBuffers, an unlocked precheck should be safe * and saves some cycles. */ - if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node)) + + /* + * For low number of relations to drop just use a simple walk through, + * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess + * than a exactly determined value, as it depends on many factors (CPU + * and RAM speeds, amount of shared buffers etc.). + */ + if (nnodes <= BSEARCH_LIMIT)
I think thats a sensible plan. It makes sense that for a small number of
relations a sequential scan of the rnodes array is faster than a bsearch
and 10 sounds like a good value although I would guess the optimal value
is slightly higher on most machines. But if it works fine without
regressions thats pretty good...
+ +/* + * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so + * that it's suitable for bsearch. + */ +static int +rnode_comparator(const void * p1, const void * p2) +{ + RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1; + RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2; + + if (n1.node.relNode < n2.node.relNode) + return -1; + else if (n1.node.relNode > n2.node.relNode) + return 1; + + if (n1.node.dbNode < n2.node.dbNode) + return -1; + else if (n1.node.dbNode > n2.node.dbNode) + return 1; + + if (n1.node.spcNode < n2.node.spcNode) + return -1; + else if (n1.node.spcNode > n2.node.spcNode) + return 1; + else + return 0; +}
Still surprised this is supposed to be faster than a memcmp, but as you
seem to have measured it earlier..
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo) +{ + int i = 0; + RelFileNodeBackend *rnodes; + ForkNumber forknum; + + /* initialize an array which contains all relations to be dropped */ + rnodes = palloc(sizeof(RelFileNodeBackend) * nrels); + for (i = 0; i < nrels; i++) + { + RelFileNodeBackend rnode = rels[i]->smgr_rnode; + int which = rels[i]->smgr_which; + + rnodes[i] = rnode; + + /* Close the forks at smgr level */ + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + (*(smgrsw[which].smgr_close)) (rels[i], forknum); + } + + /* + * Get rid of any remaining buffers for the relation. bufmgr will just + * drop them without bothering to write the contents. + */ + DropRelFileNodeAllBuffers(rnodes, nrels);
I think it might be a good idea to handle temp relations on/buffers on
this level instead of trying to put it into
DropRelFileNodeAllBuffers. Would also allow you to only build an array
of RelFileNode's instead of RelFileNodeBackend's which might make it
slightly faster.
+ /* + * It'd be nice to tell the stats collector to forget it immediately, too. + * But we can't because we don't know the OID (and in cases involving + * relfilenode swaps, it's not always clear which table OID to forget, + * anyway). + */
This and at least one other comment seems to be in too many locations
now.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 19.12.2012 02:18, Andres Freund wrote:
On 2012-12-17 00:31:00 +0100, Tomas Vondra wrote:
I think except of the temp buffer issue mentioned below its ready.
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode) +DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes) { - int i; + int i, j; + + /* sort the list of rnodes */ + pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);/* If it's a local relation, it's localbuf.c's problem. */ - if (RelFileNodeBackendIsTemp(rnode)) + for (i = 0; i < nnodes; i++) { - if (rnode.backend == MyBackendId) - DropRelFileNodeAllLocalBuffers(rnode.node); - return; + if (RelFileNodeBackendIsTemp(rnodes[i])) + { + if (rnodes[i].backend == MyBackendId) + DropRelFileNodeAllLocalBuffers(rnodes[i].node); + } }While you deal with local buffers here you don't anymore in the big loop
over shared buffers. That wasn't needed earlier since we just returned
after noticing we have local relation, but thats not the case anymore.
Hmm, but that would require us to handle the temp relations explicitly
wherever we call DropRelFileNodeAllBuffers. Currently there are two such
places - smgrdounlink() and smgrdounlinkall().
By placing it into DropRelFileNodeAllBuffers() this code is shared and I
think it's a good thing.
But that does not mean the code is perfect - it was based on the
assumption that if there's a mix of temp and regular relations, the temp
relations will be handled in the first part and the rest in the second one.
Maybe it'd be better to improve it so that the temp relations are
removed from the array after the first part (and skip the second one if
there are no remaining relations).
for (i = 0; i < NBuffers; i++) { + RelFileNodeBackend *rnode = NULL; volatile BufferDesc *bufHdr = &BufferDescriptors[i]; - + /* * As in DropRelFileNodeBuffers, an unlocked precheck should be safe * and saves some cycles. */ - if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node)) + + /* + * For low number of relations to drop just use a simple walk through, + * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess + * than a exactly determined value, as it depends on many factors (CPU + * and RAM speeds, amount of shared buffers etc.). + */ + if (nnodes <= BSEARCH_LIMIT)I think thats a sensible plan. It makes sense that for a small number of
relations a sequential scan of the rnodes array is faster than a bsearch
and 10 sounds like a good value although I would guess the optimal value
is slightly higher on most machines. But if it works fine without
regressions thats pretty good...
I think it's pointless to look for the optimal value in this case, given
on how many factors it depends. We could use 20 instead of 10, but I
wouldn't go higher probably.
+ +/* + * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so + * that it's suitable for bsearch. + */ +static int +rnode_comparator(const void * p1, const void * p2) +{ + RelFileNodeBackend n1 = * (RelFileNodeBackend *) p1; + RelFileNodeBackend n2 = * (RelFileNodeBackend *) p2; + + if (n1.node.relNode < n2.node.relNode) + return -1; + else if (n1.node.relNode > n2.node.relNode) + return 1; + + if (n1.node.dbNode < n2.node.dbNode) + return -1; + else if (n1.node.dbNode > n2.node.dbNode) + return 1; + + if (n1.node.spcNode < n2.node.spcNode) + return -1; + else if (n1.node.spcNode > n2.node.spcNode) + return 1; + else + return 0; +}Still surprised this is supposed to be faster than a memcmp, but as you
seem to have measured it earlier..
It surprised me too. These are the numbers with the current patch:
1) one by one
=============
0 2 4 6 8 10 12 14 16 18 20
--------------------------------------------------------------
current 15 22 28 34 41 75 77 82 92 99 106
memcmp 16 23 29 36 44 122 125 128 153 154 158
Until the number of indexes reaches ~10, the numbers are almost exactly
the same. Then the bsearch branch kicks in and it's clear how much
slower the memcmp comparator is.
2) batches of 100
=================
0 2 4 6 8 10 12 14 16 18 20
--------------------------------------------------------------
current 3 5 8 10 12 15 17 21 23 27 31
memcmp 4 7 10 13 16 19 22 28 30 32 36
Here the difference is much smaller, but even here the memcmp is
consistently a bit slower.
My theory is that while the current comparator starts with the most
variable part (relation OID), and thus usually starts after the
comparing the first 4B, the memcmp starts from the other end (tablespace
and database) and therefore needs to compare more data.
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo) +{ + int i = 0; + RelFileNodeBackend *rnodes; + ForkNumber forknum; + + /* initialize an array which contains all relations to be dropped */ + rnodes = palloc(sizeof(RelFileNodeBackend) * nrels); + for (i = 0; i < nrels; i++) + { + RelFileNodeBackend rnode = rels[i]->smgr_rnode; + int which = rels[i]->smgr_which; + + rnodes[i] = rnode; + + /* Close the forks at smgr level */ + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + (*(smgrsw[which].smgr_close)) (rels[i], forknum); + } + + /* + * Get rid of any remaining buffers for the relation. bufmgr will just + * drop them without bothering to write the contents. + */ + DropRelFileNodeAllBuffers(rnodes, nrels);I think it might be a good idea to handle temp relations on/buffers on
this level instead of trying to put it into
DropRelFileNodeAllBuffers. Would also allow you to only build an array
of RelFileNode's instead of RelFileNodeBackend's which might make it
slightly faster.
Hmmm, sadly that'd require duplication of code because it needs to be
done in smgrdounlink too.
+ /* + * It'd be nice to tell the stats collector to forget it immediately, too. + * But we can't because we don't know the OID (and in cases involving + * relfilenode swaps, it's not always clear which table OID to forget, + * anyway). + */This and at least one other comment seems to be in too many locations
now.
I see it in three places - smgrdounlink, smgrdounlinkall and
smgrdounlinkfork. Which ones you consider superfluous? I think it's
appropriate to keep them in all three places.
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 2012-12-20 02:54:48 +0100, Tomas Vondra wrote:
On 19.12.2012 02:18, Andres Freund wrote:
On 2012-12-17 00:31:00 +0100, Tomas Vondra wrote:
I think except of the temp buffer issue mentioned below its ready.
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode) +DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes) { - int i; + int i, j; + + /* sort the list of rnodes */ + pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);/* If it's a local relation, it's localbuf.c's problem. */ - if (RelFileNodeBackendIsTemp(rnode)) + for (i = 0; i < nnodes; i++) { - if (rnode.backend == MyBackendId) - DropRelFileNodeAllLocalBuffers(rnode.node); - return; + if (RelFileNodeBackendIsTemp(rnodes[i])) + { + if (rnodes[i].backend == MyBackendId) + DropRelFileNodeAllLocalBuffers(rnodes[i].node); + } }While you deal with local buffers here you don't anymore in the big loop
over shared buffers. That wasn't needed earlier since we just returned
after noticing we have local relation, but thats not the case anymore. Hmm, but that would require us to handle the temp relations explicitly
wherever we call DropRelFileNodeAllBuffers. Currently there are two such
places - smgrdounlink() and smgrdounlinkall(). By placing it into DropRelFileNodeAllBuffers() this code is shared and I
think it's a good thing. But that does not mean the code is perfect - it was based on the
assumption that if there's a mix of temp and regular relations, the temp
relations will be handled in the first part and the rest in the second one. Maybe it'd be better
removed from the array after the first part (and skip the second one if
there are no remaining relations).
The problem is that afaik without the backend-local part a temporary
relation could match the same relfilenode as a "full" relation which
would have pretty bad consequences.
Still surprised this is supposed to be faster than a memcmp, but as you
seem to have measured it earlier. It surprised me too. These are the numbers with the current patch:
1) one by one
=============
0 2 4 6 8 10 12 14 16 18 20
--------------------------------------------------------------
current 15 22 28 34 41 75 77 82 92 99 106
memcmp 16 23 29 36 44 122 125 128 153 154 158
Until the number of indexes reaches ~10, the numbers are almost exactly
the same. Then the bsearch branch kicks in and it's clear how much
slower the memcmp comparator is.
2) batches of 100
=================
0 2 4 6 8 10 12 14 16 18 20
--------------------------------------------------------------
current 3 5 8 10 12 15 17 21 23 27 31
memcmp 4 7 10 13 16 19 22 28 30 32 36
Here the difference is much smaller, but even here the memcmp is
consistently a bit slower. My theory is that while the current comparator starts with the most
variable part (relation OID), and thus usually starts after the
comparing the first 4B, the memcmp starts from the other end (tablespace
and database) and therefore needs to compare more data.
Thats a good guess. I think its ok this way, but if you feel like
playing you could just change the order in the struct...
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo) +{ + int i = 0; + RelFileNodeBackend *rnodes; + ForkNumber forknum; + + /* initialize an array which contains all relations to be dropped */ + rnodes = palloc(sizeof(RelFileNodeBackend) * nrels); + for (i = 0; i < nrels; i++) + { + RelFileNodeBackend rnode = rels[i]->smgr_rnode; + int which = rels[i]->smgr_which; + + rnodes[i] = rnode; + + /* Close the forks at smgr level */ + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + (*(smgrsw[which].smgr_close)) (rels[i], forknum); + } + + /* + * Get rid of any remaining buffers for the relation. bufmgr will just + * drop them without bothering to write the contents. + */ + DropRelFileNodeAllBuffers(rnodes, nrels);I think it might be a good idea to handle temp relations on/buffers on
this level instead of trying to put it into
DropRelFileNodeAllBuffers. Would also allow you to only build an array
of RelFileNode's instead of RelFileNodeBackend's which might make it
slightly faster. Hmmm, sadly that'd require duplication of code because it needs to be
done in smgrdounlink too.
It should be fairly small though.
int nrels_nonlocal = 0;
for (i = 0; i < nrels; i++)
{
RelFileNodeBackend rnode = rels[i]->smgr_rnode;
int which = rels[i]->smgr_which;
if (RelFileNodeBackendIsTemp(rnode))
DropRelFileNodeAllLocalBuffers
else
rnodes[nrels_nonlocal++];
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
(*(smgrsw[which].smgr_close)) (rels[i], forknum);
}
DropRelFileNodeAllLocalBuffers(rnode, nrels_nonlocal);
In smgrdounlink it should only be a
if (RelFileNodeBackendIsTemp(rnode))
DropRelFileNodeAllLocalBuffers(rnode)
else
DropRelFileNodeAllBuffers(rnode);
ISTM that would be cleaner then memmove'ing the rnode array to remove
the temp rels away.
+ /* + * It'd be nice to tell the stats collector to forget it immediately, too. + * But we can't because we don't know the OID (and in cases involving + * relfilenode swaps, it's not always clear which table OID to forget, + * anyway). + */This and at least one other comment seems to be in too many locations
now. I see it in three places - smgrdounlink, smgrdounlinkall and
smgrdounlinkfork. Which ones you consider superfluous? I think it's
appropriate to keep them in all three places.
I only read the patch, not the result, so maybe youre right. I'll look
at it sometime.
Greetings,
Andres Freund
--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 20.12.2012 12:33, Andres Freund wrote:
On 2012-12-20 02:54:48 +0100, Tomas Vondra wrote:
On 19.12.2012 02:18, Andres Freund wrote:
On 2012-12-17 00:31:00 +0100, Tomas Vondra wrote:
I think except of the temp buffer issue mentioned below its ready.
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode) +DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes) { - int i; + int i, j; + + /* sort the list of rnodes */ + pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);/* If it's a local relation, it's localbuf.c's problem. */ - if (RelFileNodeBackendIsTemp(rnode)) + for (i = 0; i < nnodes; i++) { - if (rnode.backend == MyBackendId) - DropRelFileNodeAllLocalBuffers(rnode.node); - return; + if (RelFileNodeBackendIsTemp(rnodes[i])) + { + if (rnodes[i].backend == MyBackendId) + DropRelFileNodeAllLocalBuffers(rnodes[i].node); + } }While you deal with local buffers here you don't anymore in the big loop
over shared buffers. That wasn't needed earlier since we just returned
after noticing we have local relation, but thats not the case anymore.Hmm, but that would require us to handle the temp relations explicitly
wherever we call DropRelFileNodeAllBuffers. Currently there are two such
places - smgrdounlink() and smgrdounlinkall().By placing it into DropRelFileNodeAllBuffers() this code is shared and I
think it's a good thing.But that does not mean the code is perfect - it was based on the
assumption that if there's a mix of temp and regular relations, the temp
relations will be handled in the first part and the rest in the second one.Maybe it'd be better to improve it so that the temp relations are
removed from the array after the first part (and skip the second one if
there are no remaining relations). The problem is that afaik without the backend-local part a temporary
relation could match the same relfilenode as a "full" relation which
would have pretty bad consequences.
Attached is a patch with fixed handling of temporary relations. I've
chosen to keep the logic in DropRelFileNodeAllBuffers and rather do a
local copy without the local relations.
regards
Tomas
Attachments:
drop-in-transaction-v4.patchtext/plain; charset=UTF-8; name=drop-in-transaction-v4.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 8dc622a..8c12a43 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,11 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 1;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +340,31 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels) {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..8a7190b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -62,6 +62,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
+#define BSEARCH_LIMIT 10
/* GUC variables */
bool zero_damaged_pages = false;
@@ -108,6 +109,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,35 +2096,85 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes)
{
- int i;
+ int i, j, n = 0;
+ RelFileNode * nodes;
+
+ nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
+ else
+ {
+ nodes[n++] = rnodes[i].node;
+ }
+ }
+
+ /* If there are no non-local relations, then we're done. Release the memory
+ * and return. */
+ if (n == 0)
+ {
+ pfree(nodes);
return;
}
+ /* sort the list of rnodes (but only if we're going to use the bsearch) */
+ if (n > BSEARCH_LIMIT)
+ pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+
for (i = 0; i < NBuffers; i++)
{
+ RelFileNode *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
-
+
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+
+ /*
+ * For low number of relations to drop just use a simple walk through,
+ * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess
+ * than a exactly determined value, as it depends on many factors (CPU
+ * and RAM speeds, amount of shared buffers etc.).
+ */
+ if (n <= BSEARCH_LIMIT)
+ {
+ for (j = 0; j < n; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
+ {
+ rnode = &nodes[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ nodes, n, sizeof(RelFileNode),
+ rnode_comparator);
+ }
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
+
+ pfree(nodes);
}
/* ---------------------------------------------------------------------
@@ -2953,3 +3005,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void * p1, const void * p2)
+{
+ RelFileNode n1 = * (RelFileNode *) p1;
+ RelFileNode n2 = * (RelFileNode *) p2;
+
+ if (n1.relNode < n2.relNode)
+ return -1;
+ else if (n1.relNode > n2.relNode)
+ return 1;
+
+ if (n1.dbNode < n2.dbNode)
+ return -1;
+ else if (n1.dbNode > n2.dbNode)
+ return 1;
+
+ if (n1.spcNode < n2.spcNode)
+ return -1;
+ else if (n1.spcNode > n2.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..bbce945 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,68 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend *rnodes;
+ ForkNumber forknum;
+
+ /* initialize an array which contains all relations to be dropped */
+ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++)
+ CacheInvalidateSmgr(rnodes[i]);
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+
+ pfree(rnodes);
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
On Sun, Dec 23, 2012 at 8:41 PM, Tomas Vondra <tv@fuzzy.cz> wrote:
Attached is a patch with fixed handling of temporary relations. I've
chosen to keep the logic in DropRelFileNodeAllBuffers and rather do a
local copy without the local relations.
This looks pretty good, although it needs some cleanup for coding
style. And I think BSEARCH_LIMIT cuts a bit too broadly as a name for
the constant.
Granted that we can't decide on an exact value for BSEARCH_LIMIT, is
there any data to indicate that 10 is at least an approximately
correct value?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 30.12.2012 04:03, Robert Haas wrote:
On Sun, Dec 23, 2012 at 8:41 PM, Tomas Vondra <tv@fuzzy.cz> wrote:
Attached is a patch with fixed handling of temporary relations. I've
chosen to keep the logic in DropRelFileNodeAllBuffers and rather do a
local copy without the local relations. This looks pretty good, although it needs some cleanup for coding
style. And I think BSEARCH_LIMIT cuts a bit too broadly as a name for
the constant.
I thought I followed the coding style - which guidelines have I broken?
I agree that BSEARCH_LIMIT is a bit too short / not exactly descriptive.
What alternative would you suggest?
Granted that we can't decide on an exact value for BSEARCH_LIMIT, is
there any data to indicate that 10 is at least an approximately
correct value?
I've presented some relevant test results on 17/12, here is the
interesting part:
# indexes 0 1 2 3 4 5 6 7 8 9 10 11
--------------------------------------------------------------
unpatched 16 28 40 52 63 75 87 99 110 121 135 147
v3.1 33 43 46 56 58 60 63 72 75 76 79 80
v3.3 16 20 23 25 29 33 36 40 44 47 79 82
where v3.1 is a patch doing bsearch all the time, v3.3 uses the
BSEARCH_LIMIT optimization. A few observations:
(a) the v3.3 is consistently faster, and the time increases by about
3.5 second for each index
(b) the v3.1 is slower at the beginning, but at the end the time
increases much slower (~1.5 sec/index)
(c) once we get to 4 indexes (5 relations in total), both 3.1 and 3.3
are faster than the current code
(d) in case of v3.3, there's a step increase between 9 and 10 indexes
(because for 10 indexes the code chooses the bsearch path), but even
after this increase both versions are faster than the original code
Given (b) and (c) both patches should meet at about 24 (so we might
increase the BSEARCH_LIMIT e.g. to 25). It's a bit difficult to predict
the behaviour on different machines (different CPUs, different amounts
of shared_buffers etc.) but I guess this is safe.
But even if we stick to the current value (10) I believe it's safe and
both versions are much faster than the current code.
regards
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
There was an earlier suggestion by Andres Freund to use memcmp()
instead, but I don't see that in the latest posted version of the patch;
was there a specific rationale for taking it out or it was just lost in
the shuffle?
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 1.1.2013 17:35, Alvaro Herrera wrote:
There was an earlier suggestion by Andres Freund to use memcmp()
instead, but I don't see that in the latest posted version of the patch;
was there a specific rationale for taking it out or it was just lost in
the shuffle?
No, I've tried that approach with a comparator like this:
static int
rnode_comparator(const void * p1, const void * p2)
{
return memcmp(p1, p2, sizeof(RelFileNode));
}
but it turned out to be slower than the current comparator. I've posted
some benchmark results and possible explanation on 20/12 (message
50D26FE8.1040800@fuzzy.cz).
If you could verify my results, that'd be great.
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Dec 31, 2012 at 11:51 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
I thought I followed the coding style - which guidelines have I broken?
+ /* If there are no non-local relations, then we're done. Release the memory
+ * and return. */
Multi-line comments should start with a line containing only /* and
end with a line containing only */.
+DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes)
and
+rnode_comparator(const void * p1, const void * p2)
The extra spaces after the asterisks should be removed.
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
void should be on a line by itself.
Sorry to nitpick.
As for BSEARCH_LIMIT, I don't have a great idea - maybe just
DROP_RELATIONS_BSEARCH_LIMIT?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Dec 24, 2012 at 02:41:37AM +0100, Tomas Vondra wrote:
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation)); + int nrels = 0, + i = 0, + maxrels = 1;
maxrels=1 is not good -- too much palloc traffic. I'd make it start at,
say, 8 instead.
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 4.1.2013 17:42, Robert Haas wrote:
On Mon, Dec 31, 2012 at 11:51 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
I thought I followed the coding style - which guidelines have I broken?
+ /* If there are no non-local relations, then we're done. Release the memory + * and return. */Multi-line comments should start with a line containing only /* and
end with a line containing only */.+DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes) and +rnode_comparator(const void * p1, const void * p2)The extra spaces after the asterisks should be removed.
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo) +{void should be on a line by itself.
Sorry to nitpick.
No, thanks for the nitpicking! Code style is important.
As for BSEARCH_LIMIT, I don't have a great idea - maybe just
DROP_RELATIONS_BSEARCH_LIMIT?
Sounds good. I've changed the name and fixed the codestyle issues in the
attached version of the patch.
Tomas
Attachments:
drop-in-transaction-v6.patchtext/plain; charset=UTF-8; name=drop-in-transaction-v6.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 8dc622a..8c12a43 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,11 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 1;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +340,31 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels) {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..52ca2b0 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -62,6 +62,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
+#define DROP_RELATIONS_BSEARCH_LIMIT 10
/* GUC variables */
bool zero_damaged_pages = false;
@@ -108,6 +109,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,35 +2096,87 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
- int i;
+ int i, j, n = 0;
+ RelFileNode * nodes;
+
+ nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
+ else
+ {
+ nodes[n++] = rnodes[i].node;
+ }
+ }
+
+ /*
+ * If there are no non-local relations, then we're done. Release the memory
+ * and return.
+ */
+ if (n == 0)
+ {
+ pfree(nodes);
return;
}
+ /* sort the list of rnodes (but only if we're going to use the bsearch) */
+ if (n > DROP_RELATIONS_BSEARCH_LIMIT)
+ pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+
for (i = 0; i < NBuffers; i++)
{
+ RelFileNode *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
-
+
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+
+ /*
+ * For low number of relations to drop just use a simple walk through,
+ * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess
+ * than a exactly determined value, as it depends on many factors (CPU
+ * and RAM speeds, amount of shared buffers etc.).
+ */
+ if (n <= DROP_RELATIONS_BSEARCH_LIMIT)
+ {
+ for (j = 0; j < n; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
+ {
+ rnode = &nodes[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ nodes, n, sizeof(RelFileNode),
+ rnode_comparator);
+ }
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
+
+ pfree(nodes);
}
/* ---------------------------------------------------------------------
@@ -2953,3 +3007,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void *p1, const void *p2)
+{
+ RelFileNode n1 = *(RelFileNode *) p1;
+ RelFileNode n2 = *(RelFileNode *) p2;
+
+ if (n1.relNode < n2.relNode)
+ return -1;
+ else if (n1.relNode > n2.relNode)
+ return 1;
+
+ if (n1.dbNode < n2.dbNode)
+ return -1;
+ else if (n1.dbNode > n2.dbNode)
+ return 1;
+
+ if (n1.spcNode < n2.spcNode)
+ return -1;
+ else if (n1.spcNode > n2.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..f98ba27 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,69 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void
+smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend *rnodes;
+ ForkNumber forknum;
+
+ /* initialize an array which contains all relations to be dropped */
+ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++)
+ CacheInvalidateSmgr(rnodes[i]);
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+
+ pfree(rnodes);
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
Hi Tomas,
I reviewed v6 patch, and found that several places need fix.
Sorry for extra nitpickings.
* I found another extra space after asterisk.
+ RelFileNode * nodes;
* Curly bracket which starts code block should be at the head of next line.
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels) {
* There are several trailing tabs in src/backend/catalog/storage.c.
* naming of DROP_RELATIONS_BSEARCH_LIMIT (or off-by-one bug?)
IIUC bsearch is used when # of relations to be dropped is *more* than
the value of DROP_RELATIONS_BSEARCH_LIMIT (10). IMO this behavior is
not what the macro name implies; I thought that bsearch is used for 10
relations. Besides, the word "LIMIT" is used as *upper limit* in
other macros. How about MIN_DROP_REL[ATION]S_BSEARCH or
DROP_REL[ATION]S_LINEAR_SEARCH_LIMIT?
# using RELS instead of RELATIONS would be better to shorten the name
* +1 for Alvaro's suggestion about avoiding palloc traffic by starting
with 8 elements or so.
Regards,
--
Shigeru HANADA
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 7.1.2013 10:07, Shigeru Hanada wrote:
Hi Tomas,
I reviewed v6 patch, and found that several places need fix.
Sorry for extra nitpickings.* I found another extra space after asterisk.
+ RelFileNode * nodes;
Thanks, fixed.
* Curly bracket which starts code block should be at the head of next line.
+ /* extend the array if needed (double the size) */ + if (maxrels <= nrels) {
Fixed.
* There are several trailing tabs in src/backend/catalog/storage.c.
Fixed (I believe).
* naming of DROP_RELATIONS_BSEARCH_LIMIT (or off-by-one bug?)
IIUC bsearch is used when # of relations to be dropped is *more* than
the value of DROP_RELATIONS_BSEARCH_LIMIT (10). IMO this behavior is
not what the macro name implies; I thought that bsearch is used for 10
relations. Besides, the word "LIMIT" is used as *upper limit* in
other macros. How about MIN_DROP_REL[ATION]S_BSEARCH or
DROP_REL[ATION]S_LINEAR_SEARCH_LIMIT?
# using RELS instead of RELATIONS would be better to shorten the name
I've changed the name to DROP_RELS_BSEARCH_THRESHOLD. I believe this
naming is consistent with options like "geqo_threshold" - when the
number of relations reaches the specified value, the bsearch is used.
I've also increased the value from 10 to 20, in accordance with the
previous discussion.
* +1 for Alvaro's suggestion about avoiding palloc traffic by starting
with 8 elements or so.
Done.
regards
Tomas
Attachments:
drop-in-transaction-v7.patchtext/x-diff; name=drop-in-transaction-v7.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 8dc622a..d8dabc6 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,11 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ SMgrRelation *srels = palloc(sizeof(SMgrRelation));
+ int nrels = 0,
+ i = 0,
+ maxrels = 8;
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +340,32 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels)
+ {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..1665630 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -62,6 +62,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
+#define DROP_RELS_BSEARCH_THRESHOLD 15
/* GUC variables */
bool zero_damaged_pages = false;
@@ -108,6 +109,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,35 +2096,87 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
- int i;
+ int i, j, n = 0;
+ RelFileNode *nodes;
+
+ nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
+ else
+ {
+ nodes[n++] = rnodes[i].node;
+ }
+ }
+
+ /*
+ * If there are no non-local relations, then we're done. Release the memory
+ * and return.
+ */
+ if (n == 0)
+ {
+ pfree(nodes);
return;
}
+ /* sort the list of rnodes (but only if we're going to use the bsearch) */
+ if (n > DROP_RELS_BSEARCH_THRESHOLD)
+ pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+
for (i = 0; i < NBuffers; i++)
{
+ RelFileNode *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
-
+
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+
+ /*
+ * For low number of relations to drop just use a simple walk through,
+ * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess
+ * than a exactly determined value, as it depends on many factors (CPU
+ * and RAM speeds, amount of shared buffers etc.).
+ */
+ if (n < DROP_RELS_BSEARCH_THRESHOLD)
+ {
+ for (j = 0; j < n; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
+ {
+ rnode = &nodes[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ nodes, n, sizeof(RelFileNode),
+ rnode_comparator);
+ }
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
+
+ pfree(nodes);
}
/* ---------------------------------------------------------------------
@@ -2953,3 +3007,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ */
+static int
+rnode_comparator(const void *p1, const void *p2)
+{
+ RelFileNode n1 = *(RelFileNode *) p1;
+ RelFileNode n2 = *(RelFileNode *) p2;
+
+ if (n1.relNode < n2.relNode)
+ return -1;
+ else if (n1.relNode > n2.relNode)
+ return 1;
+
+ if (n1.dbNode < n2.dbNode)
+ return -1;
+ else if (n1.dbNode > n2.dbNode)
+ return 1;
+
+ if (n1.spcNode < n2.spcNode)
+ return -1;
+ else if (n1.spcNode > n2.spcNode)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..f98ba27 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,69 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+void
+smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend *rnodes;
+ ForkNumber forknum;
+
+ /* initialize an array which contains all relations to be dropped */
+ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+ }
+
+ /*
+ * Get rid of any remaining buffers for the relation. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodeAllBuffers(rnodes, nrels);
+
+ /*
+ * It'd be nice to tell the stats collector to forget it immediately, too.
+ * But we can't because we don't know the OID (and in cases involving
+ * relfilenode swaps, it's not always clear which table OID to forget,
+ * anyway).
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for this rel. We should do this
+ * before starting the actual unlinking, in case we fail partway through
+ * that step. Note that the sinval message will eventually come back to
+ * this backend, too, and thereby provide a backstop that we closed our
+ * own smgr rel.
+ */
+ for (i = 0; i < nrels; i++)
+ CacheInvalidateSmgr(rnodes[i]);
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+ }
+
+ pfree(rnodes);
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
Hi Tomas,
On Tue, Jan 8, 2013 at 7:08 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
* I found another extra space after asterisk.
+ RelFileNode * nodes;
Thanks, fixed.
check
* Curly bracket which starts code block should be at the head of next line.
+ /* extend the array if needed (double the size) */ + if (maxrels <= nrels) {Fixed.
check
* There are several trailing tabs in src/backend/catalog/storage.c.
Fixed (I believe).
check
* naming of DROP_RELATIONS_BSEARCH_LIMIT (or off-by-one bug?)
IIUC bsearch is used when # of relations to be dropped is *more* than
the value of DROP_RELATIONS_BSEARCH_LIMIT (10). IMO this behavior is
not what the macro name implies; I thought that bsearch is used for 10
relations. Besides, the word "LIMIT" is used as *upper limit* in
other macros. How about MIN_DROP_REL[ATION]S_BSEARCH or
DROP_REL[ATION]S_LINEAR_SEARCH_LIMIT?
# using RELS instead of RELATIONS would be better to shorten the nameI've changed the name to DROP_RELS_BSEARCH_THRESHOLD. I believe this
naming is consistent with options like "geqo_threshold" - when the
number of relations reaches the specified value, the bsearch is used.I've also increased the value from 10 to 20, in accordance with the
previous discussion.
New name sounds good to me, but the #define says that the value is 15.
Should it be fixed to 20?
* +1 for Alvaro's suggestion about avoiding palloc traffic by starting
with 8 elements or so.Done.
Not yet. Initial size of srels array is still 1 element.
Regards,
--
Shigeru HANADA
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 8.1.2013 03:47, Shigeru Hanada wrote:
* naming of DROP_RELATIONS_BSEARCH_LIMIT (or off-by-one bug?)
IIUC bsearch is used when # of relations to be dropped is *more* than
the value of DROP_RELATIONS_BSEARCH_LIMIT (10). IMO this behavior is
not what the macro name implies; I thought that bsearch is used for 10
relations. Besides, the word "LIMIT" is used as *upper limit* in
other macros. How about MIN_DROP_REL[ATION]S_BSEARCH or
DROP_REL[ATION]S_LINEAR_SEARCH_LIMIT?
# using RELS instead of RELATIONS would be better to shorten the nameI've changed the name to DROP_RELS_BSEARCH_THRESHOLD. I believe this
naming is consistent with options like "geqo_threshold" - when the
number of relations reaches the specified value, the bsearch is used.I've also increased the value from 10 to 20, in accordance with the
previous discussion.New name sounds good to me, but the #define says that the value is 15.
Should it be fixed to 20?
Ah, yes, please increase it to 20. I've increased it from 10 to 15
first, but I think 20 is nearer the optimal value and I forgot to change
it in the code.
* +1 for Alvaro's suggestion about avoiding palloc traffic by starting
with 8 elements or so.Done.
Not yet. Initial size of srels array is still 1 element.
I've checked the patch and I see there 'maxrels = 8' - or do you mean
something else?
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Tomas Vondra wrote:
On 8.1.2013 03:47, Shigeru Hanada wrote:
* +1 for Alvaro's suggestion about avoiding palloc traffic by starting
with 8 elements or so.Done.
Not yet. Initial size of srels array is still 1 element.
I've checked the patch and I see there 'maxrels = 8' - or do you mean
something else?
Well, you need to ensure that the initial palloc is an array of that
size.
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 8.1.2013 22:30, Alvaro Herrera wrote:
Tomas Vondra wrote:
On 8.1.2013 03:47, Shigeru Hanada wrote:
* +1 for Alvaro's suggestion about avoiding palloc traffic by starting
with 8 elements or so.Done.
Not yet. Initial size of srels array is still 1 element.
I've checked the patch and I see there 'maxrels = 8' - or do you mean
something else?Well, you need to ensure that the initial palloc is an array of that
size.
Oh, right - I forgot to modify the palloc() call. Thanks for spotting
this. Attached is a patch with increased threshold and fixed palloc call.
Tomas
Attachments:
drop-in-transaction-v8.patchtext/x-diff; name=drop-in-transaction-v8.patchDownload
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 8dc622a..b1790eb 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -312,6 +312,11 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
+
+ int nrels = 0,
+ i = 0,
+ maxrels = 8;
+ SMgrRelation *srels = palloc(maxrels * sizeof(SMgrRelation));
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +340,32 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
- smgrdounlink(srel, false);
- smgrclose(srel);
+
+ /* extend the array if needed (double the size) */
+ if (maxrels <= nrels)
+ {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
+
+ if (nrels > 0)
+ {
+ smgrdounlinkall(srels, nrels, false);
+
+ for (i = 0; i < nrels; i++)
+ smgrclose(srels[i]);
+ }
+
+ pfree(srels);
+
}
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index dddb6c0..2330fda 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -62,6 +62,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
+#define DROP_RELS_BSEARCH_THRESHOLD 20
/* GUC variables */
bool zero_damaged_pages = false;
@@ -108,6 +109,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void * p1, const void * p2);
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2094,35 +2096,87 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodeAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
- int i;
+ int i, j, n = 0;
+ RelFileNode *nodes;
+
+ nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
/* If it's a local relation, it's localbuf.c's problem. */
- if (RelFileNodeBackendIsTemp(rnode))
+ for (i = 0; i < nnodes; i++)
{
- if (rnode.backend == MyBackendId)
- DropRelFileNodeAllLocalBuffers(rnode.node);
+ if (RelFileNodeBackendIsTemp(rnodes[i]))
+ {
+ if (rnodes[i].backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+ }
+ else
+ {
+ nodes[n++] = rnodes[i].node;
+ }
+ }
+
+ /*
+ * If there are no non-local relations, then we're done. Release the memory
+ * and return.
+ */
+ if (n == 0)
+ {
+ pfree(nodes);
return;
}
+ /* sort the list of rnodes (but only if we're going to use the bsearch) */
+ if (n > DROP_RELS_BSEARCH_THRESHOLD)
+ pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+
for (i = 0; i < NBuffers; i++)
{
+ RelFileNode *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
-
+
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
- if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+
+ /*
+ * For low number of relations to drop just use a simple walk through,
+ * to save the bsearch overhead. The BSEARCH_LIMIT is rather a guess
+ * than a exactly determined value, as it depends on many factors (CPU
+ * and RAM speeds, amount of shared buffers etc.).
+ */
+ if (n < DROP_RELS_BSEARCH_THRESHOLD)
+ {
+ for (j = 0; j < n; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
+ {
+ rnode = &nodes[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ nodes, n, sizeof(RelFileNode),
+ rnode_comparator);
+ }
+
+ /* buffer does not belong to any of the relations */
+ if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
+
+ pfree(nodes);
}
/* ---------------------------------------------------------------------
@@ -2953,3 +3007,31 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
+
+/*
+ * Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]), so
+ * that it's suitable for bsearch.
+ *
+ * Standard qsort/bsearch comparator contract: returns <0, 0, or >0 as *p1
+ * sorts before, equal to, or after *p2.  relNode is compared first,
+ * presumably because it is the field most likely to differ between the
+ * relations being dropped, letting the comparison short-circuit early --
+ * the ordering itself is arbitrary as long as sort and search agree.
+ */
+static int
+rnode_comparator(const void *p1, const void *p2)
+{
+	/* copy out by value; callers pass pointers into a RelFileNode array */
+	RelFileNode n1 = *(RelFileNode *) p1;
+	RelFileNode n2 = *(RelFileNode *) p2;
+
+	if (n1.relNode < n2.relNode)
+		return -1;
+	else if (n1.relNode > n2.relNode)
+		return 1;
+
+	if (n1.dbNode < n2.dbNode)
+		return -1;
+	else if (n1.dbNode > n2.dbNode)
+		return 1;
+
+	if (n1.spcNode < n2.spcNode)
+		return -1;
+	else if (n1.spcNode > n2.spcNode)
+		return 1;
+	else
+		return 0;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5dff8b3..f98ba27 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
- DropRelFileNodeAllBuffers(rnode);
+ DropRelFileNodeAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,69 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
+/*
+ *	smgrdounlinkall() -- unlink all forks of multiple relations at once
+ *
+ * Batched counterpart of smgrdounlink(): closes the forks of every given
+ * relation at the smgr level, evicts all their shared buffers in a single
+ * pass (via DropRelFileNodeAllBuffers), sends the sinval messages, and only
+ * then deletes the physical files.
+ *
+ * NOTE(review): callers appear to guarantee nrels > 0 (smgrDoPendingDeletes
+ * only calls this when its list is non-empty); with nrels == 0 this would
+ * palloc(0) -- harmless per upthread discussion, but worth confirming.
+ */
+void
+smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
+{
+	int i = 0;
+	RelFileNodeBackend *rnodes;
+	ForkNumber forknum;
+
+	/* initialize an array which contains all relations to be dropped */
+	rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+	for (i = 0; i < nrels; i++)
+	{
+		RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+		int which = rels[i]->smgr_which;
+
+		rnodes[i] = rnode;
+
+		/* Close the forks at smgr level */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+			(*(smgrsw[which].smgr_close)) (rels[i], forknum);
+	}
+
+	/*
+	 * Get rid of any remaining buffers for the relation. bufmgr will just
+	 * drop them without bothering to write the contents.
+	 */
+	DropRelFileNodeAllBuffers(rnodes, nrels);
+
+	/*
+	 * It'd be nice to tell the stats collector to forget it immediately, too.
+	 * But we can't because we don't know the OID (and in cases involving
+	 * relfilenode swaps, it's not always clear which table OID to forget,
+	 * anyway).
+	 */
+
+	/*
+	 * Send a shared-inval message to force other backends to close any
+	 * dangling smgr references they may have for this rel. We should do this
+	 * before starting the actual unlinking, in case we fail partway through
+	 * that step. Note that the sinval message will eventually come back to
+	 * this backend, too, and thereby provide a backstop that we closed our
+	 * own smgr rel.
+	 */
+	for (i = 0; i < nrels; i++)
+		CacheInvalidateSmgr(rnodes[i]);
+
+	/*
+	 * Delete the physical file(s).
+	 *
+	 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+	 * ERROR, because we've already decided to commit or abort the current
+	 * xact.
+	 */
+
+	for (i = 0; i < nrels; i++)
+	{
+		int which = rels[i]->smgr_which;
+		/* remove every fork of this relation */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+			(*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+	}
+
+	pfree(rnodes);
+}
+
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..1deee42 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend * rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 9eccf76..272476b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
Hi Tomas,
On Wed, Jan 9, 2013 at 6:38 AM, Tomas Vondra <tv@fuzzy.cz> wrote:
Well, you need to ensure that the initial palloc is an array of that
size.

Oh, right - I forgot to modify the palloc() call. Thanks for spotting
this. Attached is a patch with an increased threshold and a fixed palloc call.
OK, both threshold and initial palloc were fixed correctly.
I'll mark this patch as "Ready for committer".
Good work! :-)
Regards,
--
Shigeru HANADA
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Tomas Vondra wrote:
Hi,
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.
Made some tweaks and pushed (added comments to new functions, ensure
that we never try to palloc(0), renamed DropRelFileNodeAllBuffers to
plural, made the "use bsearch" logic a bit simpler).
Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).
I'm curious -- why would you drop tables in groups of 100 instead of
just doing the 100,000 in a single transaction? Maybe that's faster
now, because you'd do a single scan of the buffer pool instead of 1000?
(I'm assuming that "in groups of" means you do each group in a separate
transaction)
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Alvaro Herrera <alvherre@2ndquadrant.com> writes:
Made some tweaks and pushed (added comments to new functions, ensure
that we never try to palloc(0), renamed DropRelFileNodeAllBuffers to
plural, made the "use bsearch" logic a bit simpler).
FWIW, there's nothing particularly wrong with palloc(0) ...
regards, tom lane
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 17.1.2013 20:19, Alvaro Herrera wrote:
I'm curious -- why would you drop tables in groups of 100 instead of
just doing the 100,000 in a single transaction? Maybe that's faster
now, because you'd do a single scan of the buffer pool instead of 1000?
(I'm assuming that "in groups of" means you do each group in a separate
transaction)
There's a limited number of locks, and each DROP acquires a lock
(possibly more than one).
Tomas
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 01/18/2013 03:19 AM, Alvaro Herrera wrote:
Tomas Vondra wrote:
Hi,
attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.

Made some tweaks and pushed (added comments to new functions, ensure
that we never try to palloc(0), renamed DropRelFileNodeAllBuffers to
plural, made the "use bsearch" logic a bit simpler).
Commit ref is
Flagged as committed.
--
Craig Ringer http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services