Adding hook in BufferSync for backup purposes
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Attachments:
0001-Add-hook-to-BuferSync.patchtext/x-diff; name=0001-Add-hook-to-BuferSync.patchDownload
From 01d692e01716d6904847e4ce73faabbd7cc9fa97 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sat, 5 Aug 2017 12:36:32 +0500
Subject: [PATCH] Add hook to BuferSync
---
src/backend/storage/buffer/bufmgr.c | 37 +++++++++--------------------------
src/include/storage/buf_internals.h | 39 +++++++++++++++++++++++++++++++++++--
2 files changed, 46 insertions(+), 30 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 15795b0c5a..57bb5b89f0 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -76,34 +76,6 @@ typedef struct PrivateRefCountEntry
/* 64 bytes, about the size of a cache line on common systems */
#define REFCOUNT_ARRAY_ENTRIES 8
-/*
- * Status of buffers to checkpoint for a particular tablespace, used
- * internally in BufferSync.
- */
-typedef struct CkptTsStatus
-{
- /* oid of the tablespace */
- Oid tsId;
-
- /*
- * Checkpoint progress for this tablespace. To make progress comparable
- * between tablespaces the progress is, for each tablespace, measured as a
- * number between 0 and the total number of to-be-checkpointed pages. Each
- * page checkpointed in this tablespace increments this space's progress
- * by progress_slice.
- */
- float8 progress;
- float8 progress_slice;
-
- /* number of to-be checkpointed pages in this tablespace */
- int num_to_scan;
- /* already processed pages in this tablespace */
- int num_scanned;
-
- /* current offset in CkptBufferIds for this tablespace */
- int index;
-} CkptTsStatus;
-
/* GUC variables */
bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
@@ -1764,6 +1736,10 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
}
}
+
+/* Hook for plugins to get control in BufferSync() */
+checkpointer_buffer_sync_hook_type checkpointer_buffer_sync_hook = NULL;
+
/*
* BufferSync -- Write out all dirty buffers in the pool.
*
@@ -1926,6 +1902,11 @@ BufferSync(int flags)
Assert(num_spaces > 0);
+ if(checkpointer_buffer_sync_hook)
+ checkpointer_buffer_sync_hook(CheckpointStats.ckpt_write_t,
+ CkptBufferIds,
+ per_ts_stat);
+
/*
* Build a min-heap over the write-progress in the individual tablespaces,
* and compute how large a portion of the total progress a single
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index b768b6fc96..b9e4f19f3a 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -278,8 +278,6 @@ extern PGDLLIMPORT WritebackContext BackendWritebackContext;
/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;
-/* in bufmgr.c */
-
/*
* Structure to sort buffers per file on checkpoints.
*
@@ -295,9 +293,46 @@ typedef struct CkptSortItem
int buf_id;
} CkptSortItem;
+/* in bufmgr.c */
extern CkptSortItem *CkptBufferIds;
/*
+ * Status of buffers to checkpoint for a particular tablespace, used
+ * internally in BufferSync.
+ */
+typedef struct CkptTsStatus
+{
+ /* oid of the tablespace */
+ Oid tsId;
+
+ /*
+ * Checkpoint progress for this tablespace. To make progress comparable
+ * between tablespaces the progress is, for each tablespace, measured as a
+ * number between 0 and the total number of to-be-checkpointed pages. Each
+ * page checkpointed in this tablespace increments this space's progress
+ * by progress_slice.
+ */
+ float8 progress;
+ float8 progress_slice;
+
+ /* number of to-be checkpointed pages in this tablespace */
+ int num_to_scan;
+ /* already processed pages in this tablespace */
+ int num_scanned;
+
+ /* current offset in CkptBufferIds for this tablespace */
+ int index;
+} CkptTsStatus;
+
+/* Hook for plugins to get control in BufferSync() */
+typedef void (*checkpointer_buffer_sync_hook_type) (TimestampTz ckpt_write_t,
+ CkptSortItem *ckptBufferIds,
+ CkptTsStatus *per_ts_stat);
+
+/* in bufmgr.c */
+extern PGDLLIMPORT checkpointer_buffer_sync_hook_type checkpointer_buffer_sync_hook;
+
+/*
* Internal buffer management routines
*/
/* bufmgr.c */
--
2.11.0 (Apple Git-81)
Андрей Бородин wrote:
==What==
I propose to add hook inside BufferSync() function it inform extensions that we
are going to write pages to disk. Please see patch attached. I pass a timestamp
of the checkpoint, but it would be good if we could also pass there number of
checkpoint or something like this to ensure that some checkpoints were not lost
(this could yield malformed backups).
==State==
This is just an idea to discuss, I could not find something like this in
pgsql-hackers as for now. Neither I could find similar hooks in the code.
Is this hook sufficient to implement page tracking for differential backups?
I’m not sure, but seems like it is.
Hi,
I remember discussing the topic of differential base-backups with
somebody (probably Marco and Gabriele). The idea we had was to have a
new relation fork which stores an LSN for each group of pages,
indicating the LSN of the newest change to those pages. The backup tool
"scans" the whole LSN fork, and grabs images of all pages that have LSNs
newer than the one used for the previous base backup.
(I think your sketch above should use LSNs rather than timestamps).
I suppose your hook idea lets you implement the LSN fork in an
extension, rather than having it be part of core. The idea of hooking
onto BufferSync makes me uneasy, though -- like it's not the correct
place to do it. I think it should be at the point where the buffer is
modified (i.e. when WAL is written) rather than when it's checkpointed
out.
--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Alvaro Herrera <alvherre@2ndquadrant.com> writes:
I suppose your hook idea lets you implement the LSN fork in an
extension, rather than having it be part of core. The idea of hooking
onto BufferSync makes me uneasy, though -- like it's not the correct
place to do it.
Yeah. Keep in mind that if the extension does anything at all that could
possibly throw an error, and if that error condition persists across
multiple tries, you will have broken the database completely: it will
be impossible to complete a checkpoint, and your WAL segment pool will
grow until it exhausts disk. So the idea of doing something that involves
unspecified extension behavior, especially possible interaction with
an external backup agent, right there is pretty terrifying.
Other problems with the proposed patch: it misses coverage of
BgBufferSync, and I don't like exposing an ad-hoc structure like
CkptTsStatus as part of an extension API. The algorithm used by
BufferSync to schedule buffer writes has changed multiple times
before and doubtless will again; if we're going to have a hook
here it should depend as little as possible on those details.
regards, tom lane
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Alvaro, Tom, thank you for your valuable comments.
Alvaro:
I remember discussing the topic of differential base-backups with
somebody (probably Marco and Gabriele). The idea we had was to have a
new relation fork which stores an LSN for each group of pages,
indicating the LSN of the newest change to those pages. The backup tool
"scans" the whole LSN fork, and grabs images of all pages that have LSNs
newer than the one used for the previous base backup.
Thanks for the pointer, I’ve found the discussions and now I’m in a process of extraction of the knowledge from there
I think it should be at the point where the buffer is
modified (i.e. when WAL is written) rather than when it's checkpointed
out.
WAL is flushed before actual pages are written to disk(sent to kernel). I’d like to notify extensions right after we exactly sure pages were flushed.
But you are right, BufferSync is not good place for this:
1. It lacks LSNs
2. It’s not the only place to flush: bgwriter goes through nearby function FlushBuffer() and many AMs write directly to smgr (for example when matapge is created)
BufferSync() seemed sooo comfortable and efficient place for flashing info on dirty pages, already sorted and grouped by tablespace, but it is absolutely incorrect to do it there. I’ll look for the better place.
7 авг. 2017 г., в 18:37, Tom Lane <tgl@sss.pgh.pa.us> написал(а):
Yeah. Keep in mind that if the extension does anything at all that could
possibly throw an error, and if that error condition persists across
multiple tries, you will have broken the database completely: it will
be impossible to complete a checkpoint, and your WAL segment pool will
grow until it exhausts disk. So the idea of doing something that involves
unspecified extension behavior, especially possible interaction with
an external backup agent, right there is pretty terrifying.
I think that API for extensions should tend to protect developer from breaking everything, but may allow it with precaution warnings in docs and comments. Please let me know if this assumption is incorrect.
Other problems with the proposed patch: it misses coverage of
BgBufferSync, and I don't like exposing an ad-hoc structure like
CkptTsStatus as part of an extension API. The algorithm used by
BufferSync to schedule buffer writes has changed multiple times
before and doubtless will again; if we're going to have a hook
here it should depend as little as possible on those details.
OK, now I see that «buf_internals.h» had word internals for a reason. Thanks for pointing that out, I didn’t knew about changes in these algorithms.
Best regards, Andrey Borodin, Yandex.
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Andrey Borodin <x4mmm@yandex-team.ru> writes:
7 авг. 2017 г., в 18:37, Tom Lane <tgl@sss.pgh.pa.us> написал(а):
Yeah. Keep in mind that if the extension does anything at all that could
possibly throw an error, and if that error condition persists across
multiple tries, you will have broken the database completely: it will
be impossible to complete a checkpoint, and your WAL segment pool will
grow until it exhausts disk. So the idea of doing something that involves
unspecified extension behavior, especially possible interaction with
an external backup agent, right there is pretty terrifying.
I think that API for extensions should tend to protect developer from
breaking everything, but may allow it with precaution warnings in docs
and comments. Please let me know if this assumption is incorrect.
My point is not to claim that we mustn't put a hook there. It's that what
such a hook could safely do is tightly constrained, and you've not offered
clear evidence that there's something useful to be done within those
constraints. Alvaro seems to think that the result might be better
reached by hooking in at other places, and my gut reaction is similar.
regards, tom lane
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi hackers!
8 авг. 2017 г., в 11:27, Tom Lane <tgl@sss.pgh.pa.us> написал(а):
My point is not to claim that we mustn't put a hook there. It's that what
such a hook could safely do is tightly constrained, and you've not offered
clear evidence that there's something useful to be done within those
constraints. Alvaro seems to think that the result might be better
reached by hooking in at other places, and my gut reaction is similar.
I was studying through work done by Marco and Gabriel on the matter of tracking pages for the incremental backup, and I have found PTRACK patch by Yury Zhuravlev and PostgresPro [0]https://gist.github.com/stalkerg/ab833d94e2f64df241f1835651e06e4b. This work seems to be solid and thoughtful. I have drafted a new prototype for hooks enabling incremental backup as extension based on PTRACK calls.
If you look at the original patch you can see that attached patch differs slightly: functionality to track end of critical section is omitted. I have not included it in this draft because it changes very sensitive place even for those who do not need it.
Subscriber of proposed hook must remember that it is invoked under critical section. But there cannot me more than XLR_MAX_BLOCK_ID blocks for one transaction. Proposed draft does not add any functionality to track finished transactions or any atomic unit of work, just provides a flow of changed block numbers. Neither does this draft provide any assumption on where to store this information. I suppose subscribers could trigger asynchronous writes somewhere as long as info for given segment is accumulated (do we need a hook on segment end then?). During inremental backup you can skip scanning those WAL segments for which you have accumulated changeset of block numbers.
The thing which is not clear to me: if we are accumulating blocknumbers under critical section, then we must place them to preallocated array. When is the best time to take away these blocknumbers to empty that array and avoid overflow? PTRACK has array of XLR_MAX_BLOCK_ID length and saves these array during the end of each critical section. But I want to avoid intervention into critical sections.
Thank you for your attention, any thoughts will be appreciated.
Best regards, Andrey Borodin.
[0]: https://gist.github.com/stalkerg/ab833d94e2f64df241f1835651e06e4b
Attachments:
0001-hooks-to-watch-for-changed-pages.patchapplication/octet-stream; name=0001-hooks-to-watch-for-changed-pages.patch; x-unix-mode=0644Download
From 070f598f5e0998c719c877295f18f36183eefccc Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Mon, 28 Aug 2017 10:21:53 +0500
Subject: [PATCH] hooks to watch for changed pages
---
src/backend/access/transam/xlog.c | 11 +++++++++++
src/backend/access/transam/xloginsert.c | 7 +++++++
src/include/access/xloginsert.h | 6 ++++++
3 files changed, 24 insertions(+)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index df4843f409..ffbd87e94e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7024,6 +7024,7 @@ StartupXLOG(void)
do
{
bool switchedTLI = false;
+ int nblock;
#ifdef WAL_DEBUG
if (XLOG_DEBUG ||
@@ -7186,6 +7187,16 @@ StartupXLOG(void)
/* Pop the error context stack */
error_context_stack = errcallback.previous;
+ if (xlog_insert_buffer_hook)
+ for(nblock = 0; nblock < xlogreader->max_block_id; nblock++)
+ {
+ if(xlogreader->blocks[nblock].forknum == MAIN_FORKNUM)
+ {
+ xlog_insert_buffer_hook(xlogreader->blocks[nblock].blkno,
+ xlogreader->blocks[nblock].rnode, true);
+ }
+ }
+
/*
* Update lastReplayedEndRecPtr after this record has been
* successfully replayed.
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 3af03ecdb1..26dd72c21e 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -205,6 +205,9 @@ XLogResetInsertion(void)
begininsert_called = false;
}
+/* Hook for plugins to get control in during page insertion into xlog */
+PGDLLIMPORT xlog_insert_buffer_hook_type xlog_insert_buffer_hook = NULL;
+
/*
* Register a reference to a buffer with the WAL record being constructed.
* This must be called for every page that the WAL-logged operation modifies.
@@ -256,6 +259,10 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
#endif
regbuf->in_use = true;
+ if (xlog_insert_buffer_hook && regbuf->forkno == MAIN_FORKNUM)
+ {
+ xlog_insert_buffer_hook(regbuf->block, regbuf->rnode, false);
+ }
}
/*
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index 174c88677f..c367669fa5 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -58,4 +58,10 @@ extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
extern void InitXLogInsert(void);
+/* Hook for plugins to get control in during page insertion into xlog */
+typedef void (*xlog_insert_buffer_hook_type) (BlockNumber block_number, RelFileNode rel, bool recovery);
+
+/* in xloginsert.c */
+extern PGDLLIMPORT xlog_insert_buffer_hook_type xlog_insert_buffer_hook;
+
#endif /* XLOGINSERT_H */
--
2.11.0 (Apple Git-81)