V3 of PITR performance improvement for 8.4 (WIP)
This is V3 of the PITR performance improvement (readahead). The
changes to the code are as follows:
1) Now readahead is integrated into the core so that it can deal with
sync.rep's log shipping.
2) posix_fadvise() call was integrated with Greg Stark's patch.
Still need some more work for the following:
1) Integrate with sync.rep's wal-receiver part,
2) fix to deal with WAL segment change.
V3 is posted for pre-review purposes; V4 will be posted at the
beginning of January.
Regards;
--
------
Koichi Suzuki
Attachments:
readahead-20081226.patchtext/x-diff; name=readahead-20081226.patchDownload
diff -rcN head.org/config/c-library.m4 head/config/c-library.m4
*** head.org/config/c-library.m4 2008-08-21 22:53:28.000000000 +0900
--- head/config/c-library.m4 2008-12-26 12:10:54.000000000 +0900
***************
*** 216,221 ****
--- 216,273 ----
AC_SUBST(HAVE_POSIX_SIGNALS)])# PGAC_FUNC_POSIX_SIGNALS
+ # PGAC_POSIX_FADVISE
+ # ------------------
+ #
+ # Check to see if we have a *working* posix_fadvise. We can't actually check
+ # that it helps performance but we can make sure it doesn't crash. We can't
+ # even do that if we're cross-compiling :(
+ #
+ # The result is cached in pgac_cv_posix_fadvise as 1 when the run test
+ # passed, 0 when it failed, or cross when it could not be run.
+ # USE_POSIX_FADVISE is defined only when the result is 1.
+
+ AC_DEFUN([PGAC_POSIX_FADVISE],
+ [AC_MSG_CHECKING([for working posix_fadvise])
+ AC_CACHE_VAL(pgac_cv_posix_fadvise,
+ [AC_TRY_RUN([
+ #define _XOPEN_SOURCE 600
+ #include <unistd.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+
+ static int does_posix_fadvise_work(void)
+ {
+ #ifndef POSIX_FADV_WILLNEED
+ return 0;
+ #else
+ int fd;
+ char data[8192];
+ fd = creat("conftest.posix_fadvise", 0600);
+ if (fd < 0)
+ return 0;
+ if (write(fd, data, 8192) != 8192)
+ return 0;
+ if (posix_fadvise(fd, 0, 8192, POSIX_FADV_WILLNEED) != 0)
+ return 0;
+ return 1;
+ #endif
+ }
+
+ int main() {
+ return !does_posix_fadvise_work();
+ }],
+ [pgac_cv_posix_fadvise=1],
+ [pgac_cv_posix_fadvise=0],
+ [pgac_cv_posix_fadvise=cross],
+ )])
+
+ # Only an explicit 1 means the run test passed. The earlier ?* pattern also
+ # matched 0 and so reported yes for a failed test.
+ case $pgac_cv_posix_fadvise in
+ cross) AC_MSG_RESULT([cannot test (not on host machine)]);;
+ 1) AC_MSG_RESULT([yes])
+ AC_DEFINE(USE_POSIX_FADVISE, 1, [Define to 1 if posix_fadvise is available and working.])
+ ;;
+ *) AC_MSG_RESULT([no -- avoiding use of posix_fadvise]);;
+ esac])# PGAC_POSIX_FADVISE
+
+
# PGAC_FUNC_SNPRINTF_LONG_LONG_INT_FORMAT
# ---------------------------------------
# Determine which format snprintf uses for long long int. We handle
diff -rcN head.org/configure head/configure
*** head.org/configure 2008-12-11 16:34:05.000000000 +0900
--- head/configure 2008-12-26 12:10:54.000000000 +0900
***************
*** 16205,16211 ****
! for ac_func in cbrt dlopen fcvt fdatasync getpeereid getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs
do
as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
{ echo "$as_me:$LINENO: checking for $ac_func" >&5
--- 16205,16212 ----
!
! for ac_func in cbrt dlopen fcvt fdatasync posix_fadvise getpeereid getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf confstr towlower utime utimes waitpid wcstombs
do
as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
{ echo "$as_me:$LINENO: checking for $ac_func" >&5
***************
*** 25256,25261 ****
--- 25257,25357 ----
SEMA_IMPLEMENTATION="src/backend/port/win32_sema.c"
fi
+ # check that posix_fadvise doesn't crash or otherwise cause problems
+ if test x"$ac_cv_func_posix_fadvise" = x"yes" -a x"$ac_cv_have_decl_posix_fadvise" = x"yes"; then
+ { echo "$as_me:$LINENO: checking for working posix_fadvise" >&5
+ echo $ECHO_N "checking for working posix_fadvise... $ECHO_C" >&6; }
+ if test "${pgac_cv_posix_fadvise+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+ else
+ if test "$cross_compiling" = yes; then
+ pgac_cv_posix_fadvise=cross
+ else
+ cat >conftest.$ac_ext <<_ACEOF
+ /* confdefs.h. */
+ _ACEOF
+ cat confdefs.h >>conftest.$ac_ext
+ cat >>conftest.$ac_ext <<_ACEOF
+ /* end confdefs.h. */
+
+ #define _XOPEN_SOURCE 600
+ #include <unistd.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+
+ static int does_posix_fadvise_work(void)
+ {
+ #ifndef POSIX_FADV_WILLNEED
+ return 0;
+ #else
+ int fd;
+ char data[8192];
+ fd = creat("conftest.posix_fadvise", 0600);
+ if (fd < 0)
+ return 0;
+ if (write(fd, data, 8192) != 8192)
+ return 0;
+ if (posix_fadvise(fd, 0, 8192, POSIX_FADV_WILLNEED) != 0)
+ return 0;
+ return 1;
+ #endif
+ }
+
+ int main() {
+ return !does_posix_fadvise_work();
+ }
+ _ACEOF
+ rm -f conftest$ac_exeext
+ if { (ac_try="$ac_link"
+ case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ pgac_cv_posix_fadvise=1
+ else
+ echo "$as_me: program exited with status $ac_status" >&5
+ echo "$as_me: failed program was:" >&5
+ sed 's/^/| /' conftest.$ac_ext >&5
+
+ ( exit $ac_status )
+ pgac_cv_posix_fadvise=0
+ fi
+ rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+ fi
+
+
+ fi
+
+
+ case $pgac_cv_posix_fadvise in
+ cross) { echo "$as_me:$LINENO: result: cannot test (not on host machine)" >&5
+ echo "${ECHO_T}cannot test (not on host machine)" >&6; };;
+ 1) { echo "$as_me:$LINENO: result: yes" >&5
+ echo "${ECHO_T}yes" >&6; }
+ cat >>confdefs.h <<\_ACEOF
+ #define USE_POSIX_FADVISE 1
+ _ACEOF
+
+ ;;
+ *) { echo "$as_me:$LINENO: result: no -- avoiding use of posix_fadvise" >&5
+ echo "${ECHO_T}no -- avoiding use of posix_fadvise" >&6; };;
+ esac
+ fi
# Select shared-memory implementation type.
if test "$PORTNAME" != "win32"; then
diff -rcN head.org/configure.in head/configure.in
*** head.org/configure.in 2008-12-11 16:34:07.000000000 +0900
--- head/configure.in 2008-12-26 12:10:54.000000000 +0900
***************
*** 1143,1149 ****
AC_FUNC_ACCEPT_ARGTYPES
PGAC_FUNC_GETTIMEOFDAY_1ARG
! AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getpeereid getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])
AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
--- 1143,1149 ----
AC_FUNC_ACCEPT_ARGTYPES
PGAC_FUNC_GETTIMEOFDAY_1ARG
! AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync posix_fadvise getpeereid getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf confstr towlower utime utimes waitpid wcstombs])
AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
***************
*** 1654,1659 ****
--- 1654,1663 ----
SEMA_IMPLEMENTATION="src/backend/port/win32_sema.c"
fi
+ # check that posix_fadvise doesn't crash or otherwise cause problems
+ # Run the test only when the AC_CHECK_FUNCS and AC_CHECK_DECLS probes both
+ # succeeded; the results live in autoconf's cache variables. The previous
+ # form compared two non-empty literal strings and so was always true.
+ if test x"$ac_cv_func_posix_fadvise" = x"yes" -a x"$ac_cv_have_decl_posix_fadvise" = x"yes"; then
+ PGAC_POSIX_FADVISE
+ fi
# Select shared-memory implementation type.
if test "$PORTNAME" != "win32"; then
diff -rcN head.org/src/backend/access/gin/ginxlog.c head/src/backend/access/gin/ginxlog.c
*** head.org/src/backend/access/gin/ginxlog.c 2008-07-12 06:06:29.000000000 +0900
--- head/src/backend/access/gin/ginxlog.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
#include "access/gin.h"
#include "access/xlogutils.h"
+ #include "access/readahead.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
***************
*** 519,524 ****
--- 520,629 ----
}
}
+ /*
+  * gin_readahead - enqueue information about data pages
+  *
+  * Queues, via ReadAheadAddEntry(), every data page referenced by the given
+  * GIN WAL record so the readahead module can prefetch it before redo.
+  *
+  * lsn    - location of the record; only lsn.xrecoff is stored per entry
+  * record - the WAL record to inspect (must not be NULL)
+  *
+  * Returns false, without queueing anything, when ReadAheadHasRoom() reports
+  * that the queue cannot take all pages of this record. Returns true
+  * otherwise, including for record types not handled here.
+  */
+ bool
+ gin_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+
+ 	/* Check the pointer before it is dereferenced, not after. */
+ 	Assert(record);
+
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+
+ 	switch (info)
+ 	{
+ 		case XLOG_GIN_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIN_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_CREATE_PTREE:
+ 			{
+ 				ginxlogCreatePostingTree *data =
+ 					(ginxlogCreatePostingTree *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_INSERT:
+ 			{
+ 				ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_SPLIT:
+ 			{
+ 				int			readahead_cnt;
+ 				ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
+
+ 				/* left and right pages, plus the root on a root split */
+ 				readahead_cnt = 2;
+ 				if (data->isRootSplit)
+ 					readahead_cnt++;
+
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+
+ 				ReadAheadAddEntry(data->node, data->lblkno, lsn.xrecoff, false);
+ 				ReadAheadAddEntry(data->node, data->rblkno, lsn.xrecoff, false);
+ 				if (data->isRootSplit)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->rootBlkno,
+ 									  lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIN_VACUUM_PAGE:
+ 			{
+ 				ginxlogVacuumPage *data =
+ 					(ginxlogVacuumPage *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_DELETE_PAGE:
+ 			{
+ 				int			readahead_cnt;
+ 				ginxlogDeletePage *data =
+ 					(ginxlogDeletePage *) XLogRecGetData(record);
+
+ 				/* deleted page and parent, plus the left sibling if any */
+ 				readahead_cnt = 2;
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 					readahead_cnt++;
+
+ 				/*
+ 				 * Ask for room for every page queued below. A fixed count
+ 				 * of 2 would under-reserve when there is a left sibling.
+ 				 */
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				ReadAheadAddEntry(data->node, data->parentBlkno,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->leftBlkno,
+ 									  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				break;
+ 			}
+ 	}
+
+ 	return true;
+ }
+
+
void
gin_xlog_startup(void)
{
diff -rcN head.org/src/backend/access/gist/gistxlog.c head/src/backend/access/gist/gistxlog.c
*** head.org/src/backend/access/gist/gistxlog.c 2008-06-19 09:46:03.000000000 +0900
--- head/src/backend/access/gist/gistxlog.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
#include "access/gist_private.h"
#include "access/xlogutils.h"
+ #include "access/readahead.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
***************
*** 500,505 ****
--- 501,584 ----
}
}
+ /*
+  * gist_readahead - enqueue information about data pages
+  *
+  * Queues, via ReadAheadAddEntry(), every data page referenced by the given
+  * GiST WAL record so the readahead module can prefetch it before redo.
+  *
+  * lsn    - location of the record; only lsn.xrecoff is stored per entry
+  * record - the WAL record to inspect (must not be NULL)
+  *
+  * Returns false, without queueing anything, when ReadAheadHasRoom() reports
+  * that the queue cannot take all pages of this record. Returns true
+  * otherwise, including for record types that touch no data page.
+  */
+ bool
+ gist_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+
+ 	/* Check the pointer before it is dereferenced, not after. */
+ 	Assert(record);
+
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+
+ 	switch (info)
+ 	{
+ 		case XLOG_GIST_PAGE_UPDATE:
+ 		case XLOG_GIST_NEW_ROOT:
+ 			{
+ 				PageUpdateRecord xlrec;
+
+ 				decodePageUpdateRecord(&xlrec, record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec.data->node, xlrec.data->blkno,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_SPLIT:
+ 			{
+ 				int			i;
+ 				PageSplitRecord rec;
+
+ 				/*
+ 				 * NOTE(review): decodePageSplitRecord() is defined outside
+ 				 * this hunk. If it allocates rec.page, nothing here releases
+ 				 * that memory - confirm and free it on both exit paths.
+ 				 */
+ 				decodePageSplitRecord(&rec, record);
+
+ 				if (!ReadAheadHasRoom(rec.data->npage))
+ 					return false;
+ 				for (i = 0; i < rec.data->npage; i++)
+ 				{
+ 					ReadAheadAddEntry(rec.data->node, rec.page[i].header->blkno,
+ 									  lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIST_INSERT_COMPLETE:
+ 			{
+ 				/*
+ 				 * This WAL record never touches a data page, so there is
+ 				 * nothing to do.
+ 				 */
+ 				break;
+ 			}
+ 		case XLOG_GIST_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIST_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_DELETE:
+ 			{
+ 				gistxlogPageDelete *xldata =
+ 					(gistxlogPageDelete *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xldata->node, xldata->blkno,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+
+ 	return true;
+ }
+
+
IndexTuple
gist_form_invalid_tuple(BlockNumber blkno)
{
diff -rcN head.org/src/backend/access/heap/heapam.c head/src/backend/access/heap/heapam.c
*** head.org/src/backend/access/heap/heapam.c 2008-12-17 01:26:08.000000000 +0900
--- head/src/backend/access/heap/heapam.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 42,47 ****
--- 42,48 ----
#include "access/heapam.h"
#include "access/hio.h"
#include "access/multixact.h"
+ #include "access/readahead.h"
#include "access/relscan.h"
#include "access/sysattr.h"
#include "access/transam.h"
***************
*** 4970,4975 ****
--- 4971,5122 ----
}
/*
+ * heap_readahead - enqueue information about data pages
+ *
+ * The readahead module stores information about pages that are modified through
+ * redo-ing record.
+ *
+ */
+ bool
+ heap_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+
+ 	/* Check the pointer before it is dereferenced, not after. */
+ 	Assert(record);
+
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+
+ 	/*
+ 	 * Each case queues the page(s) named in the record. If the queue cannot
+ 	 * take all of them, return false without queueing anything.
+ 	 */
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP_INSERT:
+ 			{
+ 				xl_heap_insert *xlrec =
+ 					(xl_heap_insert *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_DELETE:
+ 			{
+ 				xl_heap_delete *xlrec =
+ 					(xl_heap_delete *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_UPDATE:
+ 		case XLOG_HEAP_MOVE:
+ 		case XLOG_HEAP_HOT_UPDATE:
+ 			{
+ 				bool		samepage;
+ 				xl_heap_update *xlrec =
+ 					(xl_heap_update *) XLogRecGetData(record);
+
+ 				samepage = ItemPointerGetBlockNumber(&xlrec->newtid) ==
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid);
+
+ 				/* one entry for the old page, a second if the new tuple moved */
+ 				if (!ReadAheadHasRoom(samepage ? 1 : 2))
+ 					return false;
+ 				/* store page which contains updated tuple. */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* store another page if any. */
+ 				if (!samepage)
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 									  ItemPointerGetBlockNumber(&xlrec->newtid),
+ 									  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_NEWPAGE:
+ 			{
+ 				xl_heap_newpage *xlrec =
+ 					(xl_heap_newpage *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->blkno,
+ 								  lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_LOCK:
+ 			{
+ 				xl_heap_lock *xlrec =
+ 					(xl_heap_lock *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_INPLACE:
+ 			{
+ 				xl_heap_inplace *xlrec =
+ 					(xl_heap_inplace *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+
+ 	return true;
+ }
+
+ /*
+  * heap2_readahead - enqueue information about data pages
+  *
+  * Queues, via ReadAheadAddEntry(), the data page referenced by the given
+  * Heap2 WAL record so the readahead module can prefetch it before redo.
+  *
+  * lsn    - location of the record; only lsn.xrecoff is stored per entry
+  * record - the WAL record to inspect (must not be NULL)
+  *
+  * Returns false, without queueing anything, when ReadAheadHasRoom() reports
+  * that the queue is full. Returns true otherwise, including for record
+  * types not handled here.
+  */
+ bool
+ heap2_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+
+ 	Assert(record);
+
+ 	/*
+ 	 * Strip the XLR_INFO_MASK bits before dispatching, as heap_readahead()
+ 	 * does. Switching on the raw xl_info would match no case whenever a
+ 	 * backup-block flag such as XLR_BKP_BLOCK_1 is set.
+ 	 */
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP2_FREEZE:
+ 			{
+ 				xl_heap_freeze *xlrec =
+ 					(xl_heap_freeze *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP2_CLEAN:
+ 		case XLOG_HEAP2_CLEAN_MOVE:
+ 			{
+ 				xl_heap_clean *xlrec =
+ 					(xl_heap_clean *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+
+ 	return true;
+ }
+
+ /*
* heap_sync - sync a heap, for use when no WAL has been written
*
* This forces the heap contents (including TOAST heap if any) down to disk.
diff -rcN head.org/src/backend/access/nbtree/nbtxlog.c head/src/backend/access/nbtree/nbtxlog.c
*** head.org/src/backend/access/nbtree/nbtxlog.c 2008-06-12 18:12:30.000000000 +0900
--- head/src/backend/access/nbtree/nbtxlog.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 16,21 ****
--- 16,22 ----
#include "access/nbtree.h"
#include "access/transam.h"
+ #include "access/readahead.h"
#include "storage/bufmgr.h"
/*
***************
*** 878,883 ****
--- 879,1014 ----
}
}
+ /*
+  * btree_readahead - enqueue information about data pages
+  *
+  * Queues, via ReadAheadAddEntry(), every data page referenced by the given
+  * btree WAL record so the readahead module can prefetch it before redo.
+  *
+  * lsn    - location of the record; only lsn.xrecoff is stored per entry
+  * record - the WAL record to inspect (must not be NULL)
+  *
+  * Returns false, without queueing anything, when ReadAheadHasRoom() reports
+  * that the queue cannot take all pages of this record. Returns true
+  * otherwise, including for record types not handled here.
+  */
+ bool
+ btree_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+
+ 	/* Check the pointer before it is dereferenced, not after. */
+ 	Assert(record);
+
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+
+ 	switch (info)
+ 	{
+ 		case XLOG_BTREE_INSERT_LEAF:
+ 		case XLOG_BTREE_INSERT_UPPER:
+ 		case XLOG_BTREE_INSERT_META:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_insert *xlrec =
+ 					(xl_btree_insert *) XLogRecGetData(record);
+
+ 				/* target page, plus the metapage for INSERT_META */
+ 				readahead_cnt = 1;
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 					readahead_cnt++;
+
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  BlockIdGetBlockNumber(&xlrec->target.tid.ip_blkid),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 									  BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_SPLIT_L:
+ 		case XLOG_BTREE_SPLIT_L_ROOT:
+ 		case XLOG_BTREE_SPLIT_R:
+ 		case XLOG_BTREE_SPLIT_R_ROOT:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_split *xlrec =
+ 					(xl_btree_split *) XLogRecGetData(record);
+
+ 				/* right and left siblings, plus the next page if any */
+ 				readahead_cnt = 2;
+ 				if (xlrec->rnext != P_NONE)
+ 					readahead_cnt++;
+
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rightsib,
+ 								  lsn.xrecoff, false);
+ 				ReadAheadAddEntry(xlrec->node, xlrec->leftsib,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (xlrec->rnext != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->node, xlrec->rnext,
+ 									  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE:
+ 			{
+ 				xl_btree_delete *xlrec =
+ 					(xl_btree_delete *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE_PAGE:
+ 		case XLOG_BTREE_DELETE_PAGE_META:
+ 		case XLOG_BTREE_DELETE_PAGE_HALF:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_delete_page *xlrec =
+ 					(xl_btree_delete_page *) XLogRecGetData(record);
+
+ 				/* parent, right sibling and target page are always queued */
+ 				readahead_cnt = 3;
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 					readahead_cnt++;
+ 				if (xlrec->leftblk != P_NONE)
+ 					readahead_cnt++;
+
+ 				/*
+ 				 * The count was computed but never checked. Check it here so
+ 				 * this case honours the same room contract as the others.
+ 				 */
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+
+ 				/* parent page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* rightsib page */
+ 				ReadAheadAddEntry(xlrec->target.node, xlrec->rightblk,
+ 								  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				/* leftsib page, if exists */
+ 				if (xlrec->leftblk != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node, xlrec->leftblk,
+ 									  lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				/* target page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 								  xlrec->deadblk, lsn.xrecoff, false);
+ 				/* metapage, if exists */
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 									  BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_NEWROOT:
+ 			{
+ 				xl_btree_newroot *xlrec =
+ 					(xl_btree_newroot *) XLogRecGetData(record);
+
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				/* No full-page image is expected for the new root. */
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rootblk,
+ 								  lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+
+ 	return true;
+ }
+
+
void
btree_xlog_startup(void)
{
diff -rcN head.org/src/backend/access/transam/Makefile head/src/backend/access/transam/Makefile
*** head.org/src/backend/access/transam/Makefile 2008-02-19 19:30:07.000000000 +0900
--- head/src/backend/access/transam/Makefile 2008-12-26 12:10:54.000000000 +0900
***************
*** 12,18 ****
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o
include $(top_srcdir)/src/backend/common.mk
--- 12,18 ----
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o readahead.o
include $(top_srcdir)/src/backend/common.mk
diff -rcN head.org/src/backend/access/transam/readahead.c head/src/backend/access/transam/readahead.c
*** head.org/src/backend/access/transam/readahead.c 1970-01-01 09:00:00.000000000 +0900
--- head/src/backend/access/transam/readahead.c 2008-12-26 12:37:50.000000000 +0900
***************
*** 0 ****
--- 1,292 ----
+ /*-------------------------------------------------------------------------
+ *
+ * readahead.c
+ * Store information of data pages which should be read ahead.
+ *
+ * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation
+ * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+
+ #include <unistd.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+
+ #include "postgres.h"
+ #include "access/readahead.h"
+ #include "access/xlog_internal.h"
+ #include "catalog/catalog.h"
+ #include "storage/relfilenode.h"
+ #include "storage/block.h"
+
+ #if USE_POSIX_FADVISE
+
+ /*
+ * One queued read-ahead request: identifies a data page that may be
+ * prefetched, together with the WAL position that referenced it.
+ */
+ struct XLogReadAhead {
+ /*
+ * The physical location of the data page: relation file node and block
+ * number within that relation.
+ */
+ RelFileNode node;
+ BlockNumber blkno;
+
+ /*
+ * xrecoff is the byte offset of the location in the WAL segment file, as
+ * defined in xlogdefs.h. Read-ahead does not deal with more than one WAL
+ * segment file at once, and xlogid is not going to change during
+ * read-ahead. This is why we need only xrecoff.
+ */
+ uint32 xrecoff;
+
+ /*
+ * has_fpw indicates whether the WAL record contains a full page write for
+ * this page. This is used to skip unnecessary read-aheads.
+ */
+ bool has_fpw;
+ };
+ typedef struct XLogReadAhead XLogReadAhead;
+
+ /*
+ * ReadAheadQueueSize is the capacity, in entries, of the XLogReadAhead queue:
+ * as many entries as fit in 16MB. The queue is a static array, so this size
+ * is fixed. When ReadAheadAddEntry() finds the queue full, it executes
+ * readahead first, which empties the queue.
+ */
+ #define ReadAheadQueueSize (16 * 1024 * 1024 / sizeof(XLogReadAhead))
+
+ /* The queue for XLogReadAhead entries. */
+ static XLogReadAhead ReadAheadQueue[ReadAheadQueueSize];
+
+ /* The number of XLogReadAhead entries currently used. */
+ static uint32 ReadAheadQueueUsed = 0;
+
+ /* Prototype of the local qsort comparator. */
+ static int ReadAheadCompare(const void *l, const void *r);
+
+ /*
+  * Append a new XLogReadAhead entry to the queue
+  *
+  * If the queue is already full, the queued pages are prefetched first with
+  * ReadAheadExecute(), which empties the queue, and the new entry then
+  * becomes its first element.
+  */
+ void
+ ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ 				  bool has_fpw)
+ {
+ 	XLogReadAhead *slot;
+
+ 	/* No free slot left: flush the queue before appending. */
+ 	if (ReadAheadQueueUsed >= ReadAheadQueueSize)
+ 		ReadAheadExecute();
+
+ 	/* Claim the next free slot and fill it in. */
+ 	slot = &ReadAheadQueue[ReadAheadQueueUsed++];
+ 	slot->node = node;
+ 	slot->blkno = blkno;
+ 	slot->xrecoff = xrecoff;
+ 	slot->has_fpw = has_fpw;
+ }
+
+ /*
+  * ReadAhead queue availability check
+  *
+  * Returns true if the XLogReadAhead queue can take num more entries without
+  * exceeding ReadAheadQueueSize, and false otherwise. The queue is never
+  * resized here.
+  */
+ bool
+ ReadAheadHasRoom(int num)
+ {
+ 	if (ReadAheadQueueUsed + num <= ReadAheadQueueSize)
+ 		return true;
+
+ 	return false;
+ }
+
+ /*
+ * Check whether info1 and info2 point same data page.
+ */
+ #define IS_SAME_PAGE(info1, info2) \
+ (RelFileNodeEquals((info1).node, (info2).node) && \
+ (info1).blkno == (info2).blkno)
+
+ /*
+  * Execute read ahead data pages
+  *
+  * The queued XLogReadAhead entries are sorted first, so that duplicate pages
+  * become adjacent and files are visited in block order. For each distinct
+  * page whose first entry has no full page write, posix_fadvise() is issued
+  * with POSIX_FADV_WILLNEED. The queue is empty on return.
+  *
+  * For performance, a data file stays open until an entry needs a different
+  * file. The file currently open is tracked on its own (open_node and
+  * open_segno), separately from the previous queue entry. Skipped entries
+  * must not change which file the descriptor is believed to refer to.
+  *
+  * Failures of open() and posix_fadvise() are ignored on purpose: a missing
+  * file is assumed to be one that has not been created yet.
+  */
+ void
+ ReadAheadExecute(void)
+ {
+ 	int			fd = -1;
+ 	uint32		i;
+ 	bool		open_valid = false; /* open_node/open_segno are set */
+ 	RelFileNode open_node;
+ 	BlockNumber open_segno = InvalidBlockNumber;
+
+ 	ereport(DEBUG1, (errmsg("%u blocks are prefetch canditate",
+ 							ReadAheadQueueUsed)));
+
+ 	/* Sort the XLogReadAhead queue for effective disk access. */
+ 	qsort(ReadAheadQueue, ReadAheadQueueUsed, sizeof(XLogReadAhead),
+ 		  ReadAheadCompare);
+
+ 	for (i = 0; i < ReadAheadQueueUsed; i++)
+ 	{
+ 		XLogReadAhead *entry = &ReadAheadQueue[i];
+ 		BlockNumber segno;
+
+ 		/* Do read ahead once per page, and only without a full page write. */
+ 		if (entry->has_fpw ||
+ 			(i > 0 && IS_SAME_PAGE(ReadAheadQueue[i - 1], *entry)))
+ 			continue;
+
+ 		segno = entry->blkno / RELSEG_SIZE;
+
+ 		/* Switch files when this entry lives in a different segment file. */
+ 		if (!open_valid || open_segno != segno ||
+ 			!RelFileNodeEquals(open_node, entry->node))
+ 		{
+ 			char	   *path;
+ 			char	   *fullpath;
+
+ 			if (fd != -1)
+ 			{
+ 				close(fd);
+ 				fd = -1;
+ 			}
+
+ 			/*
+ 			 * Generate file path and add segment number if necessary.
+ 			 * Based on _mdfd_openseg() in src/backend/storage/smgr.c
+ 			 */
+ 			path = relpath(entry->node, MAIN_FORKNUM);
+ 			if (segno > 0)
+ 			{
+ 				/* room for '.', up to 10 digits and the terminator */
+ 				size_t		len = strlen(path) + 12;
+
+ 				fullpath = palloc(len);
+ 				snprintf(fullpath, len, "%s.%u", path, segno);
+ 				pfree(path);
+ 			}
+ 			else
+ 				fullpath = path;
+
+ 			fd = open(fullpath, O_RDONLY | PG_BINARY);
+ 			pfree(fullpath);
+
+ 			/*
+ 			 * Remember this file even if open() failed, so the open is not
+ 			 * retried for every further block of the same file.
+ 			 */
+ 			open_node = entry->node;
+ 			open_segno = segno;
+ 			open_valid = true;
+ 		}
+
+ 		/* File could not be opened: skip, but continue to read ahead. */
+ 		if (fd == -1)
+ 			continue;
+
+ 		/* Even if posix_fadvise() returns error, continue to read ahead. */
+ 		(void) posix_fadvise(fd,
+ 							 (off_t) (entry->blkno % RELSEG_SIZE) * BLCKSZ,
+ 							 BLCKSZ, POSIX_FADV_WILLNEED);
+ 	}
+
+ 	/* Do not leak the last descriptor that was left open. */
+ 	if (fd != -1)
+ 		close(fd);
+
+ 	ReadAheadQueueUsed = 0;
+ }
+
+ /*
+  * Compare two XLogReadAhead objects (qsort comparator)
+  *
+  * Returns 1 when l > r, 0 when l == r and -1 when l < r. The fields are
+  * compared in this order, and the first difference decides:
+  *	1. node.spcNode
+  *	2. node.dbNode
+  *	3. node.relNode
+  *	4. blkno
+  *	5. xrecoff
+  */
+ static int
+ ReadAheadCompare(const void *l, const void *r)
+ {
+ 	const XLogReadAhead *a = (const XLogReadAhead *) l;
+ 	const XLogReadAhead *b = (const XLogReadAhead *) r;
+
+ 	if (a->node.spcNode != b->node.spcNode)
+ 		return (a->node.spcNode > b->node.spcNode) ? 1 : -1;
+
+ 	if (a->node.dbNode != b->node.dbNode)
+ 		return (a->node.dbNode > b->node.dbNode) ? 1 : -1;
+
+ 	if (a->node.relNode != b->node.relNode)
+ 		return (a->node.relNode > b->node.relNode) ? 1 : -1;
+
+ 	if (a->blkno != b->blkno)
+ 		return (a->blkno > b->blkno) ? 1 : -1;
+
+ 	if (a->xrecoff != b->xrecoff)
+ 		return (a->xrecoff > b->xrecoff) ? 1 : -1;
+
+ 	/* Every compared field matches. */
+ 	return 0;
+ }
+
+ #else
+
+ /*
+ * Fallback implementations used when USE_POSIX_FADVISE is not set: every
+ * entry point is a no-op, and ReadAheadHasRoom() always reports room.
+ *
+ * NOTE(review): ReadAheadStart() and ReadAheadEnd() are defined only in this
+ * branch; the USE_POSIX_FADVISE branch above defines no counterparts. Confirm
+ * against access/readahead.h whether they are still part of the interface.
+ */
+ void
+ ReadAheadStart(void)
+ {
+ /* do nothing */
+ }
+
+ void
+ ReadAheadEnd(void)
+ {
+ /* do nothing */
+ }
+
+ void
+ ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ bool has_fpw)
+ {
+ /* do nothing */
+ }
+
+ bool
+ ReadAheadHasRoom(int num)
+ {
+ /* nothing is ever queued, so there is always room */
+ return true;
+ }
+
+ void
+ ReadAheadExecute(void)
+ {
+ /* do nothing */
+ }
+
+ #endif /* USE_POSIX_FADVISE */
diff -rcN head.org/src/backend/access/transam/rmgr.c head/src/backend/access/transam/rmgr.c
*** head.org/src/backend/access/transam/rmgr.c 2008-11-19 19:34:50.000000000 +0900
--- head/src/backend/access/transam/rmgr.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 24,43 ****
const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! {"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL},
! {"Transaction", xact_redo, xact_desc, NULL, NULL, NULL},
! {"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL},
! {"CLOG", clog_redo, clog_desc, NULL, NULL, NULL},
! {"Database", dbase_redo, dbase_desc, NULL, NULL, NULL},
! {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
! {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
! {"Reserved 7", NULL, NULL, NULL, NULL, NULL},
! {"Reserved 8", NULL, NULL, NULL, NULL, NULL},
! {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
! {"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
! {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
! {"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
! {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint},
! {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint},
! {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}
};
--- 24,43 ----
const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! {"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL, NULL},
! {"Transaction", xact_redo, xact_desc, NULL, NULL, NULL, NULL},
! {"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL, NULL},
! {"CLOG", clog_redo, clog_desc, NULL, NULL, NULL, NULL},
! {"Database", dbase_redo, dbase_desc, NULL, NULL, NULL, NULL},
! {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL, NULL},
! {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL, NULL},
! {"Reserved 7", NULL, NULL, NULL, NULL, NULL, NULL},
! {"Reserved 8", NULL, NULL, NULL, NULL, NULL, NULL},
! {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL, heap2_readahead},
! {"Heap", heap_redo, heap_desc, NULL, NULL, NULL, heap_readahead},
! {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint, btree_readahead},
! {"Hash", hash_redo, hash_desc, NULL, NULL, NULL, NULL},
! {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint, gin_readahead},
! {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint, gist_readahead},
! {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL, seq_readahead}
};
diff -rcN head.org/src/backend/access/transam/xlog.c head/src/backend/access/transam/xlog.c
*** head.org/src/backend/access/transam/xlog.c 2008-12-17 10:39:03.000000000 +0900
--- head/src/backend/access/transam/xlog.c 2008-12-26 16:25:30.000000000 +0900
***************
*** 24,29 ****
--- 24,30 ----
#include "access/clog.h"
#include "access/multixact.h"
+ #include "access/readahead.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
***************
*** 381,386 ****
--- 382,398 ----
static char *readRecordBuf = NULL;
static uint32 readRecordBufSize = 0;
+ /*
+  * Fixed-size queue of WAL records waiting for delayed redo.
+  *
+  * Entries are (LSN, record) pairs appended by PushRecord().  The queue is
+  * replayed and emptied once all records of one WAL segment file are read,
+  * so twice XLogSegSize is meant to cover the records plus their LSNs.
+  * NOTE(review): RedoRecords() casts into this char array; confirm alignment.
+  */
+ static char RecordQueueBuf[XLogSegSize * 2];
+ static uint32 RecordQueueBufUsed = 0;
+
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr; /* start of last record read */
static XLogRecPtr EndRecPtr; /* end+1 of last record read */
***************
*** 441,446 ****
--- 453,461 ----
static void rm_redo_error_callback(void *arg);
static int get_sync_bit(int method);
+ static void PushRecord(XLogRecPtr lsn, XLogRecord *record);
+ static bool PushReadAhead(XLogRecPtr lsn, XLogRecord *record);
+ static void RedoRecords(void);
/*
* Insert an XLOG record having the specified RMID and info bytes,
***************
*** 2364,2369 ****
--- 2379,2386 ----
ListCell *cell;
int fd;
+ ereport(DEBUG1, (errmsg("XLOG switch to %X/%X", log, seg)));
+
/*
* Loop looking for a suitable timeline ID: we might need to read any of
* the timelines listed in expectedTLIs.
***************
*** 3327,3332 ****
--- 3344,3356 ----
readOff += XLOG_BLCKSZ;
if (readOff >= XLogSegSize)
{
+ /*
+ * Reached the end of the current WAL segment file; redo all of the
+ * WAL records in the queue.
+ */
+ ereport(DEBUG1, (errmsg("switching WAL segment")));
+ RedoRecords();
+
close(readFile);
readFile = -1;
NextLogSeg(readId, readSeg);
***************
*** 3424,3429 ****
--- 3448,3460 ----
return (XLogRecord *) buffer;
next_record_is_invalid:;
+ /*
+ * Reached the unused area of the current WAL segment file; redo all WAL
+ * records in the queue.
+ */
+ ereport(DEBUG1, (errmsg("next record is invalid(maybe unused area)")));
+ RedoRecords();
+
if (readFile >= 0)
{
close(readFile);
***************
*** 5154,5160 ****
{
bool recoveryContinue = true;
bool recoveryApply = true;
- ErrorContextCallback errcontext;
InRedo = true;
ereport(LOG,
--- 5185,5190 ----
***************
*** 5196,5228 ****
break;
}
! /* Setup error traceback support for ereport() */
! errcontext.callback = rm_redo_error_callback;
! errcontext.arg = (void *) record;
! errcontext.previous = error_context_stack;
! error_context_stack = &errcontext;
!
! /* nextXid must be beyond record's xid */
! if (TransactionIdFollowsOrEquals(record->xl_xid,
! ShmemVariableCache->nextXid))
{
! ShmemVariableCache->nextXid = record->xl_xid;
! TransactionIdAdvance(ShmemVariableCache->nextXid);
}
! if (record->xl_info & XLR_BKP_BLOCK_MASK)
! RestoreBkpBlocks(record, EndRecPtr);
!
! RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
!
! /* Pop the error context stack */
! error_context_stack = errcontext.previous;
LastRec = ReadRecPtr;
record = ReadRecord(NULL, LOG);
} while (record != NULL && recoveryContinue);
/*
* end of main redo apply loop
*/
--- 5226,5269 ----
break;
}
! /*
! * Push the WAL record, together with its LSN, into the WAL record
! * queue for delayed redo.
! * If the queue is already full, first redo all of the WAL records in
! * it, which empties the queue.
! */
! ereport(DEBUG1,
! (errmsg("WAL record queue is used %d(%d) bytes at %X/%08X.",
! RecordQueueBufUsed, record->xl_tot_len,
! EndRecPtr.xlogid, EndRecPtr.xrecoff)));
! if (RecordQueueBufUsed + MAXALIGN(sizeof(XLogRecPtr)) +
! MAXALIGN(record->xl_tot_len) > sizeof(RecordQueueBuf))
{
! ereport(DEBUG1, (errmsg("WAL record queue is full")));
! RedoRecords();
}
+ PushRecord(EndRecPtr, record);
! /*
! * Push page information to prefetch later.
! * If no more space, redo all records in queue and make the
! * queue empty.
! */
! while (!PushReadAhead(EndRecPtr, record))
! {
! ereport(DEBUG1, (errmsg("ReadAhead queue is full.")));
! RedoRecords();
! }
LastRec = ReadRecPtr;
record = ReadRecord(NULL, LOG);
} while (record != NULL && recoveryContinue);
+ /* All WAL records are read, redo all queued WAL records. */
+ ereport(DEBUG1, (errmsg("end of redo apply loop")));
+ RedoRecords();
+
/*
* end of main redo apply loop
*/
***************
*** 5447,5452 ****
--- 5488,5575 ----
}
/*
+  * PushRecord - append an (LSN, WAL record) pair to RecordQueueBuf.
+  * Each part is padded to a MAXALIGN boundary; the caller ensures room.
+  */
+ static void
+ PushRecord(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	char	   *slot = RecordQueueBuf + RecordQueueBufUsed;
+
+ 	Assert(record);
+ 	memcpy(slot, &lsn, sizeof(XLogRecPtr));
+ 	memcpy(slot + MAXALIGN(sizeof(XLogRecPtr)), record, record->xl_tot_len);
+ 	RecordQueueBufUsed += MAXALIGN(sizeof(XLogRecPtr)) + MAXALIGN(record->xl_tot_len);
+ }
+
+ /*
+  * PushReadAhead - offer a WAL record to its resource manager's rm_readahead
+  * callback.  A manager without a callback is treated as success.
+  */
+ static bool
+ PushReadAhead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	bool		(*readahead) (XLogRecPtr, XLogRecord *);
+
+ 	Assert(record);
+ 	readahead = RmgrTable[record->xl_rmid].rm_readahead;
+ 	return readahead ? readahead(lsn, record) : true;
+ }
+
+ /*
+  * RedoRecords - replay every queued WAL record in push order, then leave
+  * the queue empty.  ReadAheadExecute() is run before the first record.
+  */
+ static void
+ RedoRecords(void)
+ {
+ 	ErrorContextCallback errcontext;
+ 	uint32		off = 0;
+
+ 	/* Readahead data pages which will be modified during redo. */
+ 	ReadAheadExecute();
+
+ 	while (off < RecordQueueBufUsed)
+ 	{
+ 		XLogRecPtr	lsn;
+ 		XLogRecord *record;
+
+ 		/* Each entry is a MAXALIGN-padded LSN followed by the record image. */
+ 		memcpy(&lsn, RecordQueueBuf + off, sizeof(XLogRecPtr));
+ 		off += MAXALIGN(sizeof(XLogRecPtr));
+ 		record = (XLogRecord *) (RecordQueueBuf + off);
+
+ 		/* Setup error traceback support for ereport() */
+ 		errcontext.callback = rm_redo_error_callback;
+ 		errcontext.arg = (void *) record;
+ 		errcontext.previous = error_context_stack;
+ 		error_context_stack = &errcontext;
+
+ 		/* nextXid must be beyond record's xid */
+ 		if (TransactionIdFollowsOrEquals(record->xl_xid,
+ 										 ShmemVariableCache->nextXid))
+ 		{
+ 			ShmemVariableCache->nextXid = record->xl_xid;
+ 			TransactionIdAdvance(ShmemVariableCache->nextXid);
+ 		}
+
+ 		if (record->xl_info & XLR_BKP_BLOCK_MASK)
+ 			RestoreBkpBlocks(record, lsn);
+
+ 		/* Redo with WAL record and its LSN. */
+ 		RmgrTable[record->xl_rmid].rm_redo(lsn, record);
+
+ 		/* Pop the error context stack, then step past this record. */
+ 		error_context_stack = errcontext.previous;
+ 		off += MAXALIGN(record->xl_tot_len);
+ 	}
+
+ 	/* Empty the queue; only the bytes that were used need to be cleared. */
+ 	MemSet(RecordQueueBuf, 0, RecordQueueBufUsed);
+ 	RecordQueueBufUsed = 0;
+ }
+
+ /*
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
diff -rcN head.org/src/backend/commands/sequence.c head/src/backend/commands/sequence.c
*** head.org/src/backend/commands/sequence.c 2008-11-02 10:45:27.000000000 +0900
--- head/src/backend/commands/sequence.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
#include "postgres.h"
#include "access/heapam.h"
+ #include "access/readahead.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/xlogutils.h"
***************
*** 1382,1384 ****
--- 1383,1415 ----
appendStringInfo(buf, "rel %u/%u/%u",
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
}
+
+ /*
+  * seq_readahead - register the page a sequence WAL record will modify
+  *
+  * For XLOG_SEQ_LOG records, block 0 of the sequence's relation is handed to
+  * the readahead module.  Returns false, without adding anything, when the
+  * module has no room for one more entry; returns true in every other case.
+  */
+ bool
+ seq_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+
+ 	/* Check the pointer before dereferencing it, not after. */
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+
+ 	if (info == XLOG_SEQ_LOG)
+ 	{
+ 		xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record);
+
+ 		if (!ReadAheadHasRoom(1))
+ 			return false;
+ 		/* arguments: relation, block number, record offset, has_fpw */
+ 		ReadAheadAddEntry(xlrec->node, 0, lsn.xrecoff, false);
+ 	}
+
+ 	/* Nothing is queued for any other record type. */
+ 	return true;
+ }
diff -rcN head.org/src/backend/port/Makefile head/src/backend/port/Makefile
*** head.org/src/backend/port/Makefile 2008-10-30 01:06:46.000000000 +0900
--- head/src/backend/port/Makefile 2008-12-26 12:10:54.000000000 +0900
***************
*** 21,27 ****
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
! OBJS = dynloader.o pg_sema.o pg_shmem.o $(TAS)
ifeq ($(PORTNAME), darwin)
SUBDIRS += darwin
--- 21,27 ----
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
! OBJS = dynloader.o pg_sema.o pg_shmem.o posix_fadvise.o $(TAS)
ifeq ($(PORTNAME), darwin)
SUBDIRS += darwin
diff -rcN head.org/src/backend/port/posix_fadvise.c head/src/backend/port/posix_fadvise.c
*** head.org/src/backend/port/posix_fadvise.c 1970-01-01 09:00:00.000000000 +0900
--- head/src/backend/port/posix_fadvise.c 2008-12-26 12:10:54.000000000 +0900
***************
*** 0 ****
--- 1,86 ----
+ /*-------------------------------------------------------------------------
+ *
+ * posix_fadvise.c
+ *
+ * Some versions of glibc are known to crash on posix_fadvise so we want to
+ * check for unsafe versions before using posix_fadvise (a configure test isn't
+ * really good enough since the version of glibc on the build system might be
+ * different than on the running system).
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+ #include "postgres.h"
+
+ #define _XOPEN_SOURCE 600
+ #include <fcntl.h>
+ #include <unistd.h>
+
+ /*
+  * check_posix_fadvise - report whether posix_fadvise() looks safe to use.
+  *
+  * Returns false, after reporting at 'elevel', if configure did not enable
+  * USE_POSIX_FADVISE or the running glibc cannot be shown to be >= 2.3.6.
+  * XXX prototype lives here for now; guc.c, the only caller, uses an extern.
+  */
+ bool check_posix_fadvise(int elevel);
+
+ bool check_posix_fadvise(int elevel)
+ {
+ 	/* Check whether our configure test found a working POSIX_FADVISE */
+ #ifndef USE_POSIX_FADVISE
+ 	ereport(elevel,
+ 			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 			 errmsg("could not determine if this system has a working posix_fadvise"),
+ 			 errdetail("Check configure.log produced by configure for more information")));
+ 	return false;
+ #endif
+
+ 	/* Double check glibc isn't a buggy version */
+ #if defined(HAVE_CONFSTR) && defined(_CS_GNU_LIBC_VERSION)
+ 	{
+ 		size_t		l;
+ 		char		buf[100] = "", v[100];
+ 		unsigned	v1, v2, v3;
+ 		int			nmatched;
+
+ 		/* confstr() returns 0 without filling buf on failure; buf starts
+ 		 * empty so sscanf() then fails cleanly and we report below */
+ 		l = confstr(_CS_GNU_LIBC_VERSION, buf, sizeof(buf));
+ 		if (l >= sizeof(buf))
+ 			elog(FATAL, "unexpected result from confstr()");
+
+ 		nmatched = sscanf(buf, "%99s %u.%u.%u", v, &v1, &v2, &v3);
+
+ 		/* It looks like the bug was fixed in 2.3.5 but insist on the more
+ 		 * recent 2.3.6 release (the last of the 2.3.x currently) just to be
+ 		 * safe */
+ 		if (nmatched < 1 ||
+ 			strcmp(v, "glibc") ||
+ 			!((nmatched >= 2 && v1 > 2) ||
+ 			  (nmatched >= 3 && v1 == 2 && v2 > 3) ||
+ 			  (nmatched >= 4 && v1 == 2 && v2 == 3 && v3 >= 6)))
+ 		{
+ 			ereport(elevel,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("could not determine if this system has a working posix_fadvise"),
+ 					 errdetail("Note that some Glibc versions earlier than 2.3.6 are known to have buggy versions of posix_fadvise")));
+ 			return false;
+ 		}
+ 	}
+ #else
+ #ifdef __GLIBC__
+ 	ereport(elevel,
+ 			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 			 errmsg("could not determine if this system has a working posix_fadvise"),
+ 			 errdetail("Note that some Glibc versions earlier than 2.3.6 are known to have buggy versions of posix_fadvise")));
+ 	return false;
+ #endif
+ #endif
+
+ 	return true;
+ }
diff -rcN head.org/src/include/access/gin.h head/src/include/access/gin.h
*** head.org/src/include/access/gin.h 2008-11-04 05:47:49.000000000 +0900
--- head/src/include/access/gin.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 256,261 ****
--- 256,262 ----
/* ginxlog.c */
extern void gin_redo(XLogRecPtr lsn, XLogRecord *record);
extern void gin_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool gin_readahead(XLogRecPtr lsn, XLogRecord *record);
extern void gin_xlog_startup(void);
extern void gin_xlog_cleanup(void);
extern bool gin_safe_restartpoint(void);
diff -rcN head.org/src/include/access/gist_private.h head/src/include/access/gist_private.h
*** head.org/src/include/access/gist_private.h 2008-10-22 21:53:56.000000000 +0900
--- head/src/include/access/gist_private.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 250,255 ****
--- 250,256 ----
/* gistxlog.c */
extern void gist_redo(XLogRecPtr lsn, XLogRecord *record);
extern void gist_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool gist_readahead(XLogRecPtr lsn, XLogRecord *record);
extern void gist_xlog_startup(void);
extern void gist_xlog_cleanup(void);
extern bool gist_safe_restartpoint(void);
diff -rcN head.org/src/include/access/heapam.h head/src/include/access/heapam.h
*** head.org/src/include/access/heapam.h 2008-11-07 05:51:15.000000000 +0900
--- head/src/include/access/heapam.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 124,131 ****
--- 124,133 ----
extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool heap_readahead(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool heap2_readahead(XLogRecPtr lsn, XLogRecord *rptr);
extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
ItemPointerData from,
diff -rcN head.org/src/include/access/nbtree.h head/src/include/access/nbtree.h
*** head.org/src/include/access/nbtree.h 2008-07-14 05:45:47.000000000 +0900
--- head/src/include/access/nbtree.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 591,596 ****
--- 591,597 ----
*/
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool btree_readahead(XLogRecPtr lns, XLogRecord *rptr);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
extern bool btree_safe_restartpoint(void);
diff -rcN head.org/src/include/access/readahead.h head/src/include/access/readahead.h
*** head.org/src/include/access/readahead.h 1970-01-01 09:00:00.000000000 +0900
--- head/src/include/access/readahead.h 2008-12-26 12:24:15.000000000 +0900
***************
*** 0 ****
--- 1,31 ----
+ /*-------------------------------------------------------------------------
+ *
+ * readahead.h
+ * Store information of data pages which should be read ahead.
+ *
+ * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation
+ * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+
+ #ifndef READAHEAD_H
+ #define READAHEAD_H
+
+ #include "postgres.h"
+ #include "storage/relfilenode.h"
+ #include "storage/block.h"
+ #include "access/xlogdefs.h"
+ #include "access/xlog.h"
+
+ /*
+ * Prototypes of public functions.
+ */
+ void ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ bool has_fpw);
+ bool ReadAheadHasRoom(int num);
+ void ReadAheadExecute(void);
+
+ #endif /* READAHEAD_H */
+
diff -rcN head.org/src/include/access/xlog.h head/src/include/access/xlog.h
*** head.org/src/include/access/xlog.h 2008-05-12 17:35:05.000000000 +0900
--- head/src/include/access/xlog.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 196,201 ****
--- 196,202 ----
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool xlog_readahead(XLogRecPtr lsn, XLogRecord *rptr);
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);
diff -rcN head.org/src/include/access/xlog_internal.h head/src/include/access/xlog_internal.h
*** head.org/src/include/access/xlog_internal.h 2008-08-11 20:05:11.000000000 +0900
--- head/src/include/access/xlog_internal.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 235,240 ****
--- 235,241 ----
void (*rm_startup) (void);
void (*rm_cleanup) (void);
bool (*rm_safe_restartpoint) (void);
+ bool (*rm_readahead) (XLogRecPtr lsn, XLogRecord *rptr);
} RmgrData;
extern const RmgrData RmgrTable[];
diff -rcN head.org/src/include/commands/sequence.h head/src/include/commands/sequence.h
*** head.org/src/include/commands/sequence.h 2008-05-17 08:36:05.000000000 +0900
--- head/src/include/commands/sequence.h 2008-12-26 12:10:54.000000000 +0900
***************
*** 98,102 ****
--- 98,103 ----
extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool seq_readahead(XLogRecPtr lsn, XLogRecord *record);
#endif /* SEQUENCE_H */
diff -rcN head.org/src/include/pg_config.h.in head/src/include/pg_config.h.in
*** head.org/src/include/pg_config.h.in 2008-12-11 16:34:08.000000000 +0900
--- head/src/include/pg_config.h.in 2008-12-26 12:10:54.000000000 +0900
***************
*** 99,105 ****
#undef HAVE_DECL_F_FULLFSYNC
/* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you
! don't. */
#undef HAVE_DECL_POSIX_FADVISE
/* Define to 1 if you have the declaration of `snprintf', and to 0 if you
--- 99,106 ----
#undef HAVE_DECL_F_FULLFSYNC
/* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you
! don't. (posix_fadvise() is abi-incompatible with a non-prototyped call so
! this is important) */
#undef HAVE_DECL_POSIX_FADVISE
/* Define to 1 if you have the declaration of `snprintf', and to 0 if you
***************
*** 339,344 ****
--- 340,348 ----
/* Define to 1 if you have the POSIX signal interface. */
#undef HAVE_POSIX_SIGNALS
+ /* Define to 1 if you have the posix_fadvise() call */
+ #undef HAVE_POSIX_FADVISE
+
/* Define to 1 if you have the `pstat' function. */
#undef HAVE_PSTAT
***************
*** 505,510 ****
--- 509,517 ----
/* Define to 1 if you have the `sysconf' function. */
#undef HAVE_SYSCONF
+ /* Define to 1 if you have the `confstr' call */
+ #undef HAVE_CONFSTR
+
/* Define to 1 if you have the syslog interface. */
#undef HAVE_SYSLOG
***************
*** 731,736 ****
--- 738,748 ----
/* Define to 1 to build with PAM support. (--with-pam) */
#undef USE_PAM
+ /* Define to 1 to use posix_fadvise() (Many platforms have buggy or ineffective
+ * implementations. We can test for some known bugs but it's hard to test for
+ * effectiveness) */
+ #undef USE_POSIX_FADVISE
+
/* Use replacement snprintf() functions. */
#undef USE_REPL_SNPRINTF
"Koichi Suzuki" <koichi.szk@gmail.com> writes:
This is the V3 of PITR performance improvement (readahead). The
change of the code is as follows:
1) Now readahead is integrated into the core so that it can deal with
sync.rep's log shipping.
2) posix_fadvise() call was integrated with Greg Stark's patch.
Wow, this is really cool. It integrates into core the readahead of WAL records
using a new RM method for each WAL type. That's great.
I haven't looked closely yet, I assume this avoids the code duplication? I did
notice there's a whole queue data structure for planning which blocks to
prefetch. Why is that necessary instead of just keeping a count of how many
blocks have been prefetched? Does it help avoid prefetching the same blocks
repeatedly?
--
Gregory Stark
EnterpriseDB http://www.enterprisedb.com
Ask me about EnterpriseDB's On-Demand Production Tuning