V3 of PITR performance improvement for 8.4 (WIP)

Started by Koichi Suzuki, about 17 years ago, 2 messages
#1Koichi Suzuki
koichi.szk@gmail.com
1 attachment(s)

This is V3 of the PITR performance improvement (readahead) patch. The
changes to the code are as follows:

1) Now readahead is integrated into the core so that it can deal with
sync.rep's log shipping.
2) posix_fadvise() call was integrated with Greg Stark's patch.

The following items still need more work:

1) Integration with sync.rep's wal-receiver part,
2) A fix to deal with WAL segment changes.

V3 is posted for pre-review purposes; V4 will be posted at the
beginning of January.

Regards;

--
------
Koichi Suzuki

Attachments:

readahead-20081226.patchtext/x-diff; name=readahead-20081226.patchDownload
diff -rcN head.org/config/c-library.m4 head/config/c-library.m4
*** head.org/config/c-library.m4	2008-08-21 22:53:28.000000000 +0900
--- head/config/c-library.m4	2008-12-26 12:10:54.000000000 +0900
***************
*** 216,221 ****
--- 216,273 ----
  AC_SUBST(HAVE_POSIX_SIGNALS)])# PGAC_FUNC_POSIX_SIGNALS
  
  
+ # PGAC_POSIX_FADVISE
+ # ------------------
+ # Check to see if we have a *working* posix_fadvise. We can't actually check
+ # that it helps performance but we can make sure it doesn't crash. We can't
+ # even do that if we're cross-compiling :(  USE_POSIX_FADVISE is defined only
+ # when the test program ran and exited successfully (cache value 1).
+ 
+ AC_DEFUN([PGAC_POSIX_FADVISE],
+ [AC_MSG_CHECKING([for working posix_fadvise])
+ AC_CACHE_VAL(pgac_cv_posix_fadvise,
+ [AC_TRY_RUN([
+ #define _XOPEN_SOURCE 600
+ #include <unistd.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ 
+ static int does_posix_fadvise_work(void)
+ {
+ #ifndef POSIX_FADV_WILLNEED
+     return 0;
+ #else
+     int fd;
+     char data[8192];
+     fd = creat("conftest.posix_fadvise", 0600);
+     if (fd < 0)
+        return 0;
+     if (write(fd, data, 8192) != 8192)
+        return 0;
+     if (posix_fadvise(fd, 0, 8192, POSIX_FADV_WILLNEED) != 0)
+        return 0;
+     return 1;
+ #endif
+ }
+ 
+ int main() {
+   return !does_posix_fadvise_work();
+ }],
+ [pgac_cv_posix_fadvise=1],
+ [pgac_cv_posix_fadvise=0],
+ [pgac_cv_posix_fadvise=cross],
+ )])
+ 
+ case $pgac_cv_posix_fadvise in
+   cross) AC_MSG_RESULT([cannot test (not on host machine)]);;
+   1)     AC_MSG_RESULT([yes])
+          AC_DEFINE(USE_POSIX_FADVISE)
+ 	 ;;
+   *)     AC_MSG_RESULT([no -- avoiding use of posix_fadvise]);;
+ esac])# PGAC_POSIX_FADVISE
+ 
+ 
  # PGAC_FUNC_SNPRINTF_LONG_LONG_INT_FORMAT
  # ---------------------------------------
  # Determine which format snprintf uses for long long int.  We handle
diff -rcN head.org/configure head/configure
*** head.org/configure	2008-12-11 16:34:05.000000000 +0900
--- head/configure	2008-12-26 12:10:54.000000000 +0900
***************
*** 16205,16211 ****
  
  
  
! for ac_func in cbrt dlopen fcvt fdatasync getpeereid getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs
  do
  as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
  { echo "$as_me:$LINENO: checking for $ac_func" >&5
--- 16205,16212 ----
  
  
  
! 
! for ac_func in cbrt dlopen fcvt fdatasync posix_fadvise getpeereid getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf confstr towlower utime utimes waitpid wcstombs
  do
  as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
  { echo "$as_me:$LINENO: checking for $ac_func" >&5
***************
*** 25256,25261 ****
--- 25257,25357 ----
    SEMA_IMPLEMENTATION="src/backend/port/win32_sema.c"
  fi
  
+ # check that posix_fadvise doesn't crash or otherwise cause problems
+ if test x"$ac_cv_func_posix_fadvise" = x"yes" -a x"$ac_cv_have_decl_posix_fadvise" = x"yes"; then
+    { echo "$as_me:$LINENO: checking for working posix_fadvise" >&5
+ echo $ECHO_N "checking for working posix_fadvise... $ECHO_C" >&6; }
+ if test "${pgac_cv_posix_fadvise+set}" = set; then
+   echo $ECHO_N "(cached) $ECHO_C" >&6
+ else
+   if test "$cross_compiling" = yes; then
+   pgac_cv_posix_fadvise=cross
+ else
+   cat >conftest.$ac_ext <<_ACEOF
+ /* confdefs.h.  */
+ _ACEOF
+ cat confdefs.h >>conftest.$ac_ext
+ cat >>conftest.$ac_ext <<_ACEOF
+ /* end confdefs.h.  */
+ 
+ #define _XOPEN_SOURCE 600
+ #include <unistd.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ 
+ static int does_posix_fadvise_work(void)
+ {
+ #ifndef POSIX_FADV_WILLNEED
+     return 0;
+ #else
+     int fd;
+     char data[8192];
+     fd = creat("conftest.posix_fadvise", 0600);
+     if (fd < 0)
+        return 0;
+     if (write(fd, data, 8192) != 8192)
+        return 0;
+     if (posix_fadvise(fd, 0, 8192, POSIX_FADV_WILLNEED) != 0)
+        return 0;
+     return 1;
+ #endif
+ }
+ 
+ int main() {
+   return !does_posix_fadvise_work();
+ }
+ _ACEOF
+ rm -f conftest$ac_exeext
+ if { (ac_try="$ac_link"
+ case "(($ac_try" in
+   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+   *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+   (eval "$ac_link") 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+   { (case "(($ac_try" in
+   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+   *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+   (eval "$ac_try") 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }; }; then
+   pgac_cv_posix_fadvise=1
+ else
+   echo "$as_me: program exited with status $ac_status" >&5
+ echo "$as_me: failed program was:" >&5
+ sed 's/^/| /' conftest.$ac_ext >&5
+ 
+ ( exit $ac_status )
+ pgac_cv_posix_fadvise=0
+ fi
+ rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+ fi
+ 
+ 
+ fi
+ 
+ 
+ case $pgac_cv_posix_fadvise in
+   cross) { echo "$as_me:$LINENO: result: cannot test (not on host machine)" >&5
+ echo "${ECHO_T}cannot test (not on host machine)" >&6; };;
+   1)     { echo "$as_me:$LINENO: result: yes" >&5
+ echo "${ECHO_T}yes" >&6; }
+          cat >>confdefs.h <<\_ACEOF
+ #define USE_POSIX_FADVISE 1
+ _ACEOF
+ 
+ 	 ;;
+   *)     { echo "$as_me:$LINENO: result: no -- avoiding use of posix_fadvise" >&5
+ echo "${ECHO_T}no -- avoiding use of posix_fadvise" >&6; };;
+ esac
+ fi
  
  # Select shared-memory implementation type.
  if test "$PORTNAME" != "win32"; then
diff -rcN head.org/configure.in head/configure.in
*** head.org/configure.in	2008-12-11 16:34:07.000000000 +0900
--- head/configure.in	2008-12-26 12:10:54.000000000 +0900
***************
*** 1143,1149 ****
  AC_FUNC_ACCEPT_ARGTYPES
  PGAC_FUNC_GETTIMEOFDAY_1ARG
  
! AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getpeereid getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])
  
  AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
  AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
--- 1143,1149 ----
  AC_FUNC_ACCEPT_ARGTYPES
  PGAC_FUNC_GETTIMEOFDAY_1ARG
  
! AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync posix_fadvise getpeereid getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf confstr towlower utime utimes waitpid wcstombs])
  
  AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
  AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
***************
*** 1654,1659 ****
--- 1654,1663 ----
    SEMA_IMPLEMENTATION="src/backend/port/win32_sema.c"
  fi
  
+ # run the posix_fadvise sanity check only if both the function and its declaration were found
+ if test x"$ac_cv_func_posix_fadvise" = x"yes" -a x"$ac_cv_have_decl_posix_fadvise" = x"yes"; then
+    PGAC_POSIX_FADVISE
+ fi
  
  # Select shared-memory implementation type.
  if test "$PORTNAME" != "win32"; then
diff -rcN head.org/src/backend/access/gin/ginxlog.c head/src/backend/access/gin/ginxlog.c
*** head.org/src/backend/access/gin/ginxlog.c	2008-07-12 06:06:29.000000000 +0900
--- head/src/backend/access/gin/ginxlog.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
  
  #include "access/gin.h"
  #include "access/xlogutils.h"
+ #include "access/readahead.h"
  #include "storage/bufmgr.h"
  #include "utils/memutils.h"
  
***************
*** 519,524 ****
--- 520,629 ----
  	}
  }
  
+ /*
+  *    gin_readahead  - enqueue information about data pages
+  *
+  * Queue every data page referenced by the given GIN WAL record for read
+  * ahead.  Returns false, without queuing anything, when the readahead queue
+  * lacks room for all pages of the record; returns true otherwise.
+  */
+ bool
+ gin_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8 info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 	switch (info)
+ 	{
+ 		case XLOG_GIN_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIN_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_CREATE_PTREE:
+ 			{
+ 				ginxlogCreatePostingTree *data =
+ 					(ginxlogCreatePostingTree *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_INSERT:
+ 			{
+ 				ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_SPLIT:
+ 			{
+ 				int readahead_cnt;
+ 				ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 2;
+ 				if (data->isRootSplit)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				ReadAheadAddEntry(data->node, data->lblkno, lsn.xrecoff, false);
+ 				ReadAheadAddEntry(data->node, data->rblkno, lsn.xrecoff, false);
+ 				if (data->isRootSplit)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->rootBlkno,
+ 						lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIN_VACUUM_PAGE:
+ 			{
+ 				ginxlogVacuumPage *data =
+ 					(ginxlogVacuumPage *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_DELETE_PAGE:
+ 			{
+ 				int readahead_cnt;
+ 				ginxlogDeletePage *data =
+ 					(ginxlogDeletePage *) XLogRecGetData(record);
+ 				readahead_cnt = 2;
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				ReadAheadAddEntry(data->node, data->parentBlkno,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->leftBlkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ 
  void
  gin_xlog_startup(void)
  {
diff -rcN head.org/src/backend/access/gist/gistxlog.c head/src/backend/access/gist/gistxlog.c
*** head.org/src/backend/access/gist/gistxlog.c	2008-06-19 09:46:03.000000000 +0900
--- head/src/backend/access/gist/gistxlog.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
  
  #include "access/gist_private.h"
  #include "access/xlogutils.h"
+ #include "access/readahead.h"
  #include "miscadmin.h"
  #include "storage/bufmgr.h"
  #include "utils/memutils.h"
***************
*** 500,505 ****
--- 501,584 ----
  	}
  }
  
+ /*
+  *    gist_readahead   - enqueue information about data pages
+  *
+  * Queue every data page referenced by the given GiST WAL record for read
+  * ahead.  Returns false, without queuing anything, when the readahead queue
+  * lacks room for all pages of the record; returns true otherwise.
+  */
+ bool
+ gist_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8 info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 	switch (info)
+ 	{
+ 		case XLOG_GIST_PAGE_UPDATE:
+ 		case XLOG_GIST_NEW_ROOT:
+ 			{
+ 				PageUpdateRecord xlrec;
+ 
+ 				decodePageUpdateRecord(&xlrec, record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec.data->node, xlrec.data->blkno,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_SPLIT:
+ 			{
+ 				int i;
+ 
+ 				PageSplitRecord rec;
+ 				decodePageSplitRecord(&rec, record);
+ 
+ 				if (!ReadAheadHasRoom(rec.data->npage))
+ 					return false;
+ 				for (i = 0; i < rec.data->npage; i++)
+ 				{
+ 					ReadAheadAddEntry(rec.data->node, rec.page[i].header->blkno,
+ 						lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIST_INSERT_COMPLETE:
+ 			{
+ 				/*
+ 				 * This WAL record never touches a data page, so there is
+ 				 * nothing to do.
+ 				 */
+ 				break;
+ 			}
+ 		case XLOG_GIST_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIST_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_DELETE:
+ 			{
+ 				gistxlogPageDelete *xldata =
+ 					(gistxlogPageDelete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xldata->node, xldata->blkno,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ 
  IndexTuple
  gist_form_invalid_tuple(BlockNumber blkno)
  {
diff -rcN head.org/src/backend/access/heap/heapam.c head/src/backend/access/heap/heapam.c
*** head.org/src/backend/access/heap/heapam.c	2008-12-17 01:26:08.000000000 +0900
--- head/src/backend/access/heap/heapam.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 42,47 ****
--- 42,48 ----
  #include "access/heapam.h"
  #include "access/hio.h"
  #include "access/multixact.h"
+ #include "access/readahead.h"
  #include "access/relscan.h"
  #include "access/sysattr.h"
  #include "access/transam.h"
***************
*** 4970,4975 ****
--- 4971,5122 ----
  }
  
  /*
+  *	heap_readahead	- enqueue information about data pages
+  *
+  * Queue every data page referenced by the given heap WAL record for read
+  * ahead.  Returns false, without queuing anything, when the readahead queue
+  * lacks room for all pages of the record; returns true otherwise.
+  */
+ bool
+ heap_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8 info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP_INSERT:
+ 			{
+ 				xl_heap_insert *xlrec =
+ 					(xl_heap_insert *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_DELETE:
+ 			{
+ 				xl_heap_delete *xlrec =
+ 					(xl_heap_delete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_UPDATE:
+ 		case XLOG_HEAP_MOVE:
+ 		case XLOG_HEAP_HOT_UPDATE:
+ 			{
+ 				bool samepage;
+ 				xl_heap_update *xlrec =
+ 					(xl_heap_update *) XLogRecGetData(record);
+ 
+ 				samepage = ItemPointerGetBlockNumber(&xlrec->newtid) ==
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid);
+ 
+ 				if (!ReadAheadHasRoom(samepage ? 1 : 2))
+ 					return false;
+ 				/* store page which contains updated tuple. */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* store another page if any. */
+ 				if (!samepage)
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->newtid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_NEWPAGE:
+ 			{
+ 				xl_heap_newpage *xlrec =
+ 					(xl_heap_newpage *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->blkno,
+ 					lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_LOCK:
+ 			{
+ 				xl_heap_lock *xlrec =
+ 					(xl_heap_lock *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_INPLACE:
+ 			{
+ 				xl_heap_inplace *xlrec =
+ 					(xl_heap_inplace *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ /*
+  *	heap2_readahead	- enqueue information about data pages
+  *
+  * Queue the data page referenced by the given heap2 WAL record for read
+  * ahead.  The flag bits of xl_info are masked off before dispatching, as
+  * heap_readahead does, so records carrying backup blocks are recognized.
+  */
+ bool
+ heap2_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	Assert(record);
+ 
+ 	switch (record->xl_info & ~XLR_INFO_MASK & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP2_FREEZE:
+ 			{
+ 				xl_heap_freeze *xlrec =
+ 					(xl_heap_freeze *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP2_CLEAN:
+ 		case XLOG_HEAP2_CLEAN_MOVE:
+ 			{
+ 				xl_heap_clean *xlrec =
+ 					(xl_heap_clean *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ /*
   *	heap_sync		- sync a heap, for use when no WAL has been written
   *
   * This forces the heap contents (including TOAST heap if any) down to disk.
diff -rcN head.org/src/backend/access/nbtree/nbtxlog.c head/src/backend/access/nbtree/nbtxlog.c
*** head.org/src/backend/access/nbtree/nbtxlog.c	2008-06-12 18:12:30.000000000 +0900
--- head/src/backend/access/nbtree/nbtxlog.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "access/nbtree.h"
  #include "access/transam.h"
+ #include "access/readahead.h"
  #include "storage/bufmgr.h"
  
  /*
***************
*** 878,883 ****
--- 879,1014 ----
  	}
  }
  
+ /*
+  * btree_readahead	- enqueue the data pages referenced by a btree WAL record.
+  * Returns false, enqueuing nothing, if the readahead queue lacks room.
+  */
+ bool
+ btree_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8 info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 	switch (info)
+ 	{
+ 		case XLOG_BTREE_INSERT_LEAF:
+ 		case XLOG_BTREE_INSERT_UPPER:
+ 		case XLOG_BTREE_INSERT_META:
+ 			{
+ 				int readahead_cnt;
+ 				xl_btree_insert *xlrec =
+ 					(xl_btree_insert *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 1;
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					BlockIdGetBlockNumber(&xlrec->target.tid.ip_blkid),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 						BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_SPLIT_L:
+ 		case XLOG_BTREE_SPLIT_L_ROOT:
+ 		case XLOG_BTREE_SPLIT_R:
+ 		case XLOG_BTREE_SPLIT_R_ROOT:
+ 			{
+ 				int readahead_cnt;
+ 				xl_btree_split *xlrec =
+ 					(xl_btree_split *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 2;
+ 				if (xlrec->rnext != P_NONE)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rightsib,
+ 					lsn.xrecoff, false);
+ 				ReadAheadAddEntry(xlrec->node, xlrec->leftsib,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (xlrec->rnext != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->node, xlrec->rnext,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE:
+ 			{
+ 				xl_btree_delete *xlrec =
+ 					(xl_btree_delete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE_PAGE:
+ 		case XLOG_BTREE_DELETE_PAGE_META:
+ 		case XLOG_BTREE_DELETE_PAGE_HALF:
+ 			{
+ 				int readahead_cnt = 3;
+ 				xl_btree_delete_page *xlrec =
+ 					(xl_btree_delete_page *) XLogRecGetData(record);
+ 
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 					readahead_cnt++;
+ 				if (xlrec->leftblk != P_NONE)
+ 					readahead_cnt++;
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				/* parent page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ 					lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* rightsib page */
+ 				ReadAheadAddEntry(xlrec->target.node, xlrec->rightblk,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				/* leftsib page, if exists */
+ 				if (xlrec->leftblk != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node, xlrec->leftblk,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				/* target page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 					xlrec->deadblk, lsn.xrecoff, false);
+ 				/* metapage, if exists */
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 						BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_NEWROOT:
+ 			{
+ 				xl_btree_newroot *xlrec =
+ 					(xl_btree_newroot *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				/* FPW does not exists. */
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rootblk,
+ 					lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ 
  void
  btree_xlog_startup(void)
  {
diff -rcN head.org/src/backend/access/transam/Makefile head/src/backend/access/transam/Makefile
*** head.org/src/backend/access/transam/Makefile	2008-02-19 19:30:07.000000000 +0900
--- head/src/backend/access/transam/Makefile	2008-12-26 12:10:54.000000000 +0900
***************
*** 12,18 ****
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 12,18 ----
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o readahead.o
  
  include $(top_srcdir)/src/backend/common.mk
  
diff -rcN head.org/src/backend/access/transam/readahead.c head/src/backend/access/transam/readahead.c
*** head.org/src/backend/access/transam/readahead.c	1970-01-01 09:00:00.000000000 +0900
--- head/src/backend/access/transam/readahead.c	2008-12-26 12:37:50.000000000 +0900
***************
*** 0 ****
--- 1,292 ----
+ /*-------------------------------------------------------------------------
+  *
+  * readahead.c
+  *		Store information of data pages which should be read ahead.
+  *
+  * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation
+  * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *-------------------------------------------------------------------------
+  */
+ 
+ #include <unistd.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ 
+ #include "postgres.h"
+ #include "access/readahead.h"
+ #include "access/xlog_internal.h"
+ #include "catalog/catalog.h"
+ #include "storage/relfilenode.h"
+ #include "storage/block.h"
+ 
+ #if USE_POSIX_FADVISE
+ 
+ /*
+  * Information about the data page which will be read ahead.
+  */
+ struct XLogReadAhead {
+ 	/*
+ 	 * The physical location of the data page.
+ 	 */
+ 	RelFileNode node;
+ 	BlockNumber blkno;
+ 
+ 	/*
+ 	 * xrecoff is the byte offset of location in the WAL segment file as
+ 	 * defined in xlogdefs.h. The read ahead command does not deal with more
+ 	 * than one WAL segment file at once, and xlogid is not going to be changed
+ 	 * during read-ahead. This is why we need only xrecoff. 
+ 	 */
+ 	uint32 xrecoff;
+ 
+ 	/*
+ 	 * has_fpw indicates whether a WAL record contains a full page write or not.
+ 	 * This is used to skip unnecessary read-aheads.
+ 	 */
+ 	bool has_fpw;
+ }; 
+ typedef struct XLogReadAhead XLogReadAhead;
+ 
+ /*
+  * ReadAheadQueueSize is the size (in entries) of the XLogReadAhead queue.
+  * When the number of XLogReadAhead entries reaches this amount, we execute
+  * readahead. The queue uses 16MB.
+  */
+ #define ReadAheadQueueSize	(16 * 1024 * 1024 / sizeof(XLogReadAhead))
+ 
+ /* The queue for XLogReadAhead entries. */
+ static XLogReadAhead ReadAheadQueue[ReadAheadQueueSize];
+ 
+ /* The number of XLogReadAhead entries currently used. */
+ static uint32 ReadAheadQueueUsed = 0;
+ 
+ /* prototype of local function */
+ static int ReadAheadCompare(const void *l, const void *r);
+ 
+ /*
+  * Append a new XLogReadAhead entry to the queue
+  *
+  * If the XLogReadAhead queue is full, execute the pending prefetches first
+  * and then add the new entry to the emptied queue.
+  */
+ void
+ ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ 	bool has_fpw)
+ {
+ 	/* all entries are used, so prefetch pages and make the queue empty */
+ 	if (ReadAheadQueueUsed >= ReadAheadQueueSize)
+ 	{
+ 		ReadAheadExecute();
+ 	}
+ 
+ 	/* Append a new XLogReadAhead ReadAheadQueue to the queue. */
+ 	ReadAheadQueue[ReadAheadQueueUsed].node = node;
+ 	ReadAheadQueue[ReadAheadQueueUsed].blkno = blkno;
+ 	ReadAheadQueue[ReadAheadQueueUsed].xrecoff = xrecoff;
+ 	ReadAheadQueue[ReadAheadQueueUsed].has_fpw = has_fpw;
+ 
+ 	ReadAheadQueueUsed++;
+ }
+ 
+ /*
+  * ReadAhead queue availability check
+  *
+  * Return true if the XLogReadAhead queue has room for num more entries,
+  * false otherwise.
+  * The queue is a fixed-size array and is never enlarged here; the rmgr
+  * readahead routines return false to their caller when there is no room.
+  */
+ bool
+ ReadAheadHasRoom(int num)
+ {
+ 	return (ReadAheadQueueUsed + num <= ReadAheadQueueSize);
+ }
+ 
+ /*
+  * Check whether info1 and info2 point same data page.
+  */
+ #define IS_SAME_PAGE(info1, info2) \
+ 	(RelFileNodeEquals((info1).node, (info2).node) && \
+ 		(info1).blkno == (info2).blkno)
+  
+ /*
+  * Execute read ahead data pages
+  *
+  * Sort the queued XLogReadAhead entries so that duplicates become adjacent
+  * and files are visited in order, then issue posix_fadvise(WILLNEED) once per
+  * page. Pages whose first queued record has a full page write are skipped.
+  *
+  * One file is kept open at a time; it is closed before returning.
+  */
+ void
+ ReadAheadExecute(void)
+ {
+ 	int fd = -1;
+ 	uint32 i;
+ 	bool have_file = false;
+ 	RelFileNode open_node = { 0, 0, 0 };
+ 	BlockNumber open_segno = 0;
+ 	XLogReadAhead last_entry = { { 0, 0, 0, }, 0, 0, false };
+ 
+ 	ereport(DEBUG1, (errmsg("%u blocks are prefetch canditate",
+ 		ReadAheadQueueUsed)));
+ 
+ 	/* Sort the XLogReadAhead queue for effective disk access. */
+ 	qsort(ReadAheadQueue, ReadAheadQueueUsed, sizeof(XLogReadAhead),
+ 		ReadAheadCompare);
+ 
+ 	for (i = 0; i < ReadAheadQueueUsed; i++)
+ 	{
+ 		XLogReadAhead *entry = &ReadAheadQueue[i];
+ 		BlockNumber segno = entry->blkno / RELSEG_SIZE;
+ 		bool skip;
+ 
+ 		/* Do read ahead once per a page if it doesn't have full page write. */
+ 		skip = IS_SAME_PAGE(last_entry, *entry) || entry->has_fpw;
+ 
+ 		/* Remember this entry so that later duplicates are skipped. */
+ 		last_entry = *entry;
+ 		if (skip)
+ 			continue;
+ 
+ 		/*
+ 		 * Switch files when this page is not in the file currently held.
+ 		 * The held file is tracked on its own; deriving it from last_entry
+ 		 * would leave fd on a stale file after a skipped entry.
+ 		 */
+ 		if (!have_file || open_segno != segno ||
+ 				!RelFileNodeEquals(open_node, entry->node))
+ 		{
+ 			char *path;
+ 			char *fullpath;
+ 
+ 			if (fd != -1)
+ 				close(fd);
+ 
+ 			/*
+ 			 * Generate file path and add segment number if necessary.
+ 			 * Based on _mdfd_openseg() in src/backend/storage/smgr.c
+ 			 */
+ 			path = relpath(entry->node, MAIN_FORKNUM);
+ 			if (segno > 0)
+ 			{
+ 				fullpath = palloc(strlen(path) + 12);
+ 				sprintf(fullpath, "%s.%u", path, segno);
+ 				pfree(path);
+ 			}
+ 			else
+ 				fullpath = path;
+ 
+ 			/*
+ 			 * A failed open() is not an error; we assume that the data file
+ 			 * has not been created yet. fd stays -1 for this file.
+ 			 */
+ 			fd = open(fullpath, O_RDONLY | PG_BINARY);
+ 			pfree(fullpath);
+ 			have_file = true;
+ 			open_node = entry->node;
+ 			open_segno = segno;
+ 		}
+ 
+ 		/* Even if posix_fadvise() returns error, continue to read ahead. */
+ 		if (fd != -1)
+ 			(void) posix_fadvise(fd, (off_t) (entry->blkno % RELSEG_SIZE) * BLCKSZ,
+ 				BLCKSZ, POSIX_FADV_WILLNEED);
+ 	}
+ 
+ 	/* Do not leak the descriptor of the last file we prefetched from. */
+ 	if (fd != -1)
+ 		close(fd);
+ 
+ 	ReadAheadQueueUsed = 0;
+ }
+ 
+ /*
+  * Compare two XLogReadAhead objects (qsort comparator)
+  *
+  * When l > r, then return 1, l == r, then return 0, and l < r, then return -1.
+  * The priority of comparison clauses shows below;
+  *    1. node.spcNode
+  *    2. node.dbNode
+  *    3. node.relNode
+  *    4. blkno
+  *    5. xrecoff
+  */
+ static int
+ ReadAheadCompare(const void *l, const void *r)
+ {
+ 	const XLogReadAhead *left = (const XLogReadAhead *) l;
+ 	const XLogReadAhead *right = (const XLogReadAhead *) r;
+ 
+ 	/* compare node.spcNode */
+ 	if (left->node.spcNode > right->node.spcNode)
+ 		return 1;
+ 	else if (left->node.spcNode < right->node.spcNode)
+ 		return -1;
+ 
+ 	/* compare node.dbNode */
+ 	if (left->node.dbNode > right->node.dbNode)
+ 		return 1;
+ 	else if (left->node.dbNode < right->node.dbNode)
+ 		return -1;
+ 
+ 	/* compare node.relNode */
+ 	if (left->node.relNode > right->node.relNode)
+ 		return 1;
+ 	else if (left->node.relNode < right->node.relNode)
+ 		return -1;
+ 
+ 	/* compare blkno */
+ 	if (left->blkno > right->blkno)
+ 		return 1;
+ 	else if (left->blkno < right->blkno)
+ 		return -1;
+ 
+ 	/* compare xrecoff */
+ 	if (left->xrecoff > right->xrecoff)
+ 		return 1;
+ 	else if (left->xrecoff < right->xrecoff)
+ 		return -1;
+ 
+ 	/* These two XLogReadAhead are same. */
+ 	return 0;
+ }
+ 
+ #else
+ 
+ void
+ ReadAheadStart(void)
+ {
+ 	/* do nothing */
+ }
+ 
+ void
+ ReadAheadEnd(void)
+ {
+ 	/* do nothing */
+ }
+ 
+ void
+ ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ 	bool has_fpw)
+ {
+ 	/* do nothing */
+ }
+ 
+ bool
+ ReadAheadHasRoom(int num)
+ {
+ 	/* do nothing */
+ 	return true;
+ }
+ 
+ void
+ ReadAheadExecute(void)
+ {
+ 	/* do nothing */
+ }
+ 
+ #endif	/* USE_POSIX_FADVISE */
diff -rcN head.org/src/backend/access/transam/rmgr.c head/src/backend/access/transam/rmgr.c
*** head.org/src/backend/access/transam/rmgr.c	2008-11-19 19:34:50.000000000 +0900
--- head/src/backend/access/transam/rmgr.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 24,43 ****
  
  
  const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! 	{"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL},
! 	{"Transaction", xact_redo, xact_desc, NULL, NULL, NULL},
! 	{"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL},
! 	{"CLOG", clog_redo, clog_desc, NULL, NULL, NULL},
! 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL},
! 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
! 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
! 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
! 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
! 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
! 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
! 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
! 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
! 	{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint},
! 	{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint},
! 	{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}
  };
--- 24,43 ----
  
  
  const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! 	{"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL, NULL},
! 	{"Transaction", xact_redo, xact_desc, NULL, NULL, NULL, NULL},
! 	{"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL, NULL},
! 	{"CLOG", clog_redo, clog_desc, NULL, NULL, NULL, NULL},
! 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL, NULL},
! 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL, NULL},
! 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL, NULL},
! 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL, NULL},
! 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL, NULL},
! 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL, heap2_readahead},
! 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL, heap_readahead},
! 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint, btree_readahead},
! 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL, NULL},
! 	{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint, gin_readahead},
! 	{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint, gist_readahead},
! 	{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL, seq_readahead}
  };
diff -rcN head.org/src/backend/access/transam/xlog.c head/src/backend/access/transam/xlog.c
*** head.org/src/backend/access/transam/xlog.c	2008-12-17 10:39:03.000000000 +0900
--- head/src/backend/access/transam/xlog.c	2008-12-26 16:25:30.000000000 +0900
***************
*** 24,29 ****
--- 24,30 ----
  
  #include "access/clog.h"
  #include "access/multixact.h"
+ #include "access/readahead.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
  #include "access/tuptoaster.h"
***************
*** 381,386 ****
--- 382,398 ----
  static char *readRecordBuf = NULL;
  static uint32 readRecordBufSize = 0;
  
+ /*
+  * Buffer for queued WAL records (fixed size)
+  *
+  * This buffer holds WAL records together with their LSNs.  Once all WAL
+  * records of one WAL segment file have been read, they are redone and the
+  * buffer is emptied.  Therefore twice XLogSegSize is expected to cover the
+  * total size of those WAL records plus their LSNs.
+  */
+ static char RecordQueueBuf[XLogSegSize * 2];
+ static uint32 RecordQueueBufUsed = 0;
+ 
  /* State information for XLOG reading */
  static XLogRecPtr ReadRecPtr;	/* start of last record read */
  static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
***************
*** 441,446 ****
--- 453,461 ----
  static void rm_redo_error_callback(void *arg);
  static int get_sync_bit(int method);
  
+ static void PushRecord(XLogRecPtr lsn, XLogRecord *record);
+ static bool PushReadAhead(XLogRecPtr lsn, XLogRecord *record);
+ static void RedoRecords(void);
  
  /*
   * Insert an XLOG record having the specified RMID and info bytes,
***************
*** 2364,2369 ****
--- 2379,2386 ----
  	ListCell   *cell;
  	int			fd;
  
+ 	ereport(DEBUG1, (errmsg("XLOG switch to %X/%X", log, seg)));
+ 
  	/*
  	 * Loop looking for a suitable timeline ID: we might need to read any of
  	 * the timelines listed in expectedTLIs.
***************
*** 3327,3332 ****
--- 3344,3356 ----
  			readOff += XLOG_BLCKSZ;
  			if (readOff >= XLogSegSize)
  			{
+ 				/*
+ 				 * Reached to the end of current WAL segment file, redo all of
+ 				 * WAL records in the queue.
+ 				 */
+ 				ereport(DEBUG1, (errmsg("switching WAL segment")));
+ 				RedoRecords();
+ 
  				close(readFile);
  				readFile = -1;
  				NextLogSeg(readId, readSeg);
***************
*** 3424,3429 ****
--- 3448,3460 ----
  	return (XLogRecord *) buffer;
  
  next_record_is_invalid:;
+ 	/*
+ 	 * Reached to unused area of current WAL segment file, redo all of WAL
+ 	 * records in the queue.
+ 	 */
+ 	ereport(DEBUG1, (errmsg("next record is invalid(maybe unused area)")));
+ 	RedoRecords();
+ 
  	if (readFile >= 0)
  	{
  		close(readFile);
***************
*** 5154,5160 ****
  		{
  			bool		recoveryContinue = true;
  			bool		recoveryApply = true;
- 			ErrorContextCallback errcontext;
  
  			InRedo = true;
  			ereport(LOG,
--- 5185,5190 ----
***************
*** 5196,5228 ****
  						break;
  				}
  
! 				/* Setup error traceback support for ereport() */
! 				errcontext.callback = rm_redo_error_callback;
! 				errcontext.arg = (void *) record;
! 				errcontext.previous = error_context_stack;
! 				error_context_stack = &errcontext;
! 
! 				/* nextXid must be beyond record's xid */
! 				if (TransactionIdFollowsOrEquals(record->xl_xid,
! 												 ShmemVariableCache->nextXid))
  				{
! 					ShmemVariableCache->nextXid = record->xl_xid;
! 					TransactionIdAdvance(ShmemVariableCache->nextXid);
  				}
  
! 				if (record->xl_info & XLR_BKP_BLOCK_MASK)
! 					RestoreBkpBlocks(record, EndRecPtr);
! 
! 				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
! 
! 				/* Pop the error context stack */
! 				error_context_stack = errcontext.previous;
  
  				LastRec = ReadRecPtr;
  
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
  			/*
  			 * end of main redo apply loop
  			 */
--- 5226,5269 ----
  						break;
  				}
  
! 				/*
! 				 * Push WAL record in WAL record buffer with its LSN for
! 				 * delayed redo. 
! 				 * If the WAL record queue is full, redo all WAL records in the
! 				 * queue and make the queue empty.
! 				 */
! 				ereport(DEBUG1,
! 					(errmsg("WAL record queue is used %d(%d) bytes at %X/%08X.",
! 						RecordQueueBufUsed, record->xl_tot_len,
! 						EndRecPtr.xlogid, EndRecPtr.xrecoff)));
! 				if (RecordQueueBufUsed + MAXALIGN(sizeof(XLogRecPtr)) +
! 						MAXALIGN(record->xl_tot_len) > sizeof(RecordQueueBuf))
  				{
! 					ereport(DEBUG1, (errmsg("WAL record queue is full")));
! 					RedoRecords();
  				}
+ 				PushRecord(EndRecPtr, record);
  
! 				/*
! 				 * Push page information to prefetch later.
! 				 * If no more space, redo all records in queue and make the
! 				 * queue empty.
! 				 */
! 				while (!PushReadAhead(EndRecPtr, record))
! 				{
! 					ereport(DEBUG1, (errmsg("ReadAhead queue is full.")));
! 					RedoRecords();
! 				}
  
  				LastRec = ReadRecPtr;
  
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
+ 			/* All WAL records are read, redo all queued WAL records.  */
+ 			ereport(DEBUG1, (errmsg("end of redo apply loop")));
+ 			RedoRecords();
+ 
  			/*
  			 * end of main redo apply loop
  			 */
***************
*** 5447,5452 ****
--- 5488,5575 ----
  }
  
  /*
+  * Push the pair of WAL record and its LSN.
+  * Both WAL records and LSNs are aligned as same as WAL segment file.
+  */
+ static void
+ PushRecord(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	Assert(record);
+ 
+ 	memcpy(RecordQueueBuf + RecordQueueBufUsed, &lsn, sizeof(XLogRecPtr));
+ 	RecordQueueBufUsed += MAXALIGN(sizeof(XLogRecPtr));
+ 	memcpy(RecordQueueBuf + RecordQueueBufUsed, record, record->xl_tot_len);
+ 	RecordQueueBufUsed += MAXALIGN(record->xl_tot_len);
+ }
+ 
+ /*
+  * Hand a WAL record to its resource manager's readahead callback, if any.
+  */
+ static bool
+ PushReadAhead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	bool		(*readahead) (XLogRecPtr lsn, XLogRecord *rptr);
+ 
+ 	Assert(record);
+ 	readahead = RmgrTable[record->xl_rmid].rm_readahead;
+ 
+ 	return readahead ? readahead(lsn, record) : true;
+ }
+ 
+ /*
+  * Redo every WAL record in the queue, in queue order, then empty the queue.
+  */
+ static void
+ RedoRecords(void)
+ {
+ 	ErrorContextCallback errcontext;
+ 	uint32 off = 0;
+ 
+ 	/* Readahead data pages which will be modified during redo. */
+ 	ReadAheadExecute();
+ 
+ 	while (off < RecordQueueBufUsed)
+ 	{
+ 		XLogRecPtr lsn;
+ 		XLogRecord *record;
+ 
+ 		/* Each entry is a MAXALIGN'ed LSN followed by the record image. */
+ 		memcpy(&lsn, RecordQueueBuf + off, sizeof(XLogRecPtr));
+ 		off += MAXALIGN(sizeof(XLogRecPtr));
+ 		record = (XLogRecord *)(RecordQueueBuf + off);
+ 
+ 		/* Setup error traceback support for ereport() */
+ 		errcontext.callback = rm_redo_error_callback;
+ 		errcontext.arg = (void *) record;
+ 		errcontext.previous = error_context_stack;
+ 		error_context_stack = &errcontext;
+ 
+ 		/* nextXid must be beyond record's xid */
+ 		if (TransactionIdFollowsOrEquals(record->xl_xid,
+ 										 ShmemVariableCache->nextXid))
+ 		{
+ 			ShmemVariableCache->nextXid = record->xl_xid;
+ 			TransactionIdAdvance(ShmemVariableCache->nextXid);
+ 		}
+ 
+ 		if (record->xl_info & XLR_BKP_BLOCK_MASK)
+ 			RestoreBkpBlocks(record, lsn);
+ 
+ 		/* Redo with WAL record and its LSN. */
+ 		RmgrTable[record->xl_rmid].rm_redo(lsn, record);
+ 
+ 		/* Pop the error context stack */
+ 		error_context_stack = errcontext.previous;
+ 
+ 		off += MAXALIGN(record->xl_tot_len);
+ 	}
+ 
+ 	/* Clear only the used prefix; the rest of the static buffer is still zero. */
+ 	MemSet(RecordQueueBuf, 0, RecordQueueBufUsed);
+ 	RecordQueueBufUsed = 0;
+ }
+ 
+ /*
   * Subroutine to try to fetch and validate a prior checkpoint record.
   *
   * whichChkpt identifies the checkpoint (merely for reporting purposes).
diff -rcN head.org/src/backend/commands/sequence.c head/src/backend/commands/sequence.c
*** head.org/src/backend/commands/sequence.c	2008-11-02 10:45:27.000000000 +0900
--- head/src/backend/commands/sequence.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
  #include "postgres.h"
  
  #include "access/heapam.h"
+ #include "access/readahead.h"
  #include "access/transam.h"
  #include "access/xact.h"
  #include "access/xlogutils.h"
***************
*** 1382,1384 ****
--- 1383,1415 ----
  	appendStringInfo(buf, "rel %u/%u/%u",
  			   xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
  }
+ 
+ /*
+  *    seq_readahead  - enqueue information about data pages
+  *
+  * For an XLOG_SEQ_LOG record, queues block 0 of the record's relation with
+  * the readahead module.  Returns false, without queueing anything, when the
+  * readahead queue has no room; other record types simply return true.
+  */
+ bool
+ seq_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 	switch (info)
+ 	{
+ 		case XLOG_SEQ_LOG:
+ 			{
+ 				xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, 0, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
diff -rcN head.org/src/backend/port/Makefile head/src/backend/port/Makefile
*** head.org/src/backend/port/Makefile	2008-10-30 01:06:46.000000000 +0900
--- head/src/backend/port/Makefile	2008-12-26 12:10:54.000000000 +0900
***************
*** 21,27 ****
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = dynloader.o pg_sema.o pg_shmem.o $(TAS)
  
  ifeq ($(PORTNAME), darwin)
  SUBDIRS += darwin
--- 21,27 ----
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = dynloader.o pg_sema.o pg_shmem.o posix_fadvise.o $(TAS)
  
  ifeq ($(PORTNAME), darwin)
  SUBDIRS += darwin
diff -rcN head.org/src/backend/port/posix_fadvise.c head/src/backend/port/posix_fadvise.c
*** head.org/src/backend/port/posix_fadvise.c	1970-01-01 09:00:00.000000000 +0900
--- head/src/backend/port/posix_fadvise.c	2008-12-26 12:10:54.000000000 +0900
***************
*** 0 ****
--- 1,86 ----
+ /*-------------------------------------------------------------------------
+  *
+  * posix_fadvise.c
+  *
+  * Some versions of glibc are known to crash on posix_fadvise so we want to
+  * check for unsafe versions before using posix_fadvise (a configure test isn't
+  * really good enough since the version of glibc on the build system might be
+  * different than on the running system).
+  *
+  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+  *
+  * IDENTIFICATION
+  *	  $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #define _XOPEN_SOURCE 600
+ #include <fcntl.h>
+ #include <unistd.h>
+ 
+ /* This checks that libc has a working posix_fadvise() based on our autoconf
+  * test (which actually runs it on the build machine) but also checks that we
+  * aren't running on a buggy old version of glibc.
+  */
+ 
+ /* XXX not sure what file to put the prototype in -- currently we just define
+  * it with extern in guc.c which is the only caller */
+ bool check_posix_fadvise(int elevel);
+ 
+ bool check_posix_fadvise(int elevel) 
+ {
+ 	/* Returns true if posix_fadvise looks safe to use; otherwise reports at
+ 	 * elevel and returns false.  First: did configure find it working? */
+ #ifndef USE_POSIX_FADVISE
+ 	ereport(elevel,
+ 			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 			 errmsg("could not determine if this system has a working posix_fadvise"),
+ 			 errdetail("Check configure.log produced by configure for more information")));
+ 	return false;
+ #endif
+ 
+ 
+ /* Double check glibc isn't a buggy version */
+ #if defined(HAVE_CONFSTR) && defined(_CS_GNU_LIBC_VERSION)
+ 	{
+ 		size_t l;
+ 		char buf[100] = "", v[100] = "";
+ 		unsigned v1, v2, v3;
+ 		int nmatched;
+ 		/* if confstr() fails (returns 0), buf stays the empty string */
+ 		l = confstr(_CS_GNU_LIBC_VERSION, buf, sizeof(buf));
+ 		if (l >= sizeof(buf))
+ 			elog(FATAL, "unexpected result from confstr()");
+ 
+ 		nmatched = sscanf(buf, "%99s %u.%u.%u", v, &v1, &v2, &v3);
+ 	
+ 		/* It looks like the bug was fixed in 2.3.5 but insist on the more
+ 		 * recent 2.3.6 release (the last of the 2.3.x currently) just to be
+ 		 * safe */
+ 		if (nmatched < 1 ||
+ 			strcmp(v, "glibc") ||
+ 			!((nmatched >= 2 && v1 > 2) ||
+ 			  (nmatched >= 3 && v1 == 2 && v2 > 3) ||
+ 			  (nmatched >= 4 && v1 == 2 && v2 == 3 && v3 >= 6)))
+ 		{
+ 			ereport(elevel,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("could not determine if this system has a working posix_fadvise"),
+ 					 errdetail("Note that some Glibc versions earlier than 2.3.6 are known to have buggy versions of posix_fadvise")));
+ 			return false;
+ 		}
+ 	}
+ #else
+ #ifdef __GLIBC__
+ 	ereport(elevel,
+ 			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 			 errmsg("could not determine if this system has a working posix_fadvise"),
+ 			 errdetail("Note that some Glibc versions earlier than 2.3.6 are known to have buggy versions of posix_fadvise")));
+ 	return false;
+ #endif
+ #endif
+ 
+ 	return true;
+ }
diff -rcN head.org/src/include/access/gin.h head/src/include/access/gin.h
*** head.org/src/include/access/gin.h	2008-11-04 05:47:49.000000000 +0900
--- head/src/include/access/gin.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 256,261 ****
--- 256,262 ----
  /* ginxlog.c */
  extern void gin_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void gin_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool gin_readahead(XLogRecPtr lsn, XLogRecord *record);
  extern void gin_xlog_startup(void);
  extern void gin_xlog_cleanup(void);
  extern bool gin_safe_restartpoint(void);
diff -rcN head.org/src/include/access/gist_private.h head/src/include/access/gist_private.h
*** head.org/src/include/access/gist_private.h	2008-10-22 21:53:56.000000000 +0900
--- head/src/include/access/gist_private.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 250,255 ****
--- 250,256 ----
  /* gistxlog.c */
  extern void gist_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void gist_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool gist_readahead(XLogRecPtr lsn, XLogRecord *record);
  extern void gist_xlog_startup(void);
  extern void gist_xlog_cleanup(void);
  extern bool gist_safe_restartpoint(void);
diff -rcN head.org/src/include/access/heapam.h head/src/include/access/heapam.h
*** head.org/src/include/access/heapam.h	2008-11-07 05:51:15.000000000 +0900
--- head/src/include/access/heapam.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 124,131 ****
--- 124,133 ----
  
  extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
  extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool heap_readahead(XLogRecPtr lsn, XLogRecord *rptr);
  extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
  extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool heap2_readahead(XLogRecPtr lsn, XLogRecord *rptr);
  
  extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
  			  ItemPointerData from,
diff -rcN head.org/src/include/access/nbtree.h head/src/include/access/nbtree.h
*** head.org/src/include/access/nbtree.h	2008-07-14 05:45:47.000000000 +0900
--- head/src/include/access/nbtree.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 591,596 ****
--- 591,597 ----
   */
  extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool btree_readahead(XLogRecPtr lsn, XLogRecord *rptr);
  extern void btree_xlog_startup(void);
  extern void btree_xlog_cleanup(void);
  extern bool btree_safe_restartpoint(void);
diff -rcN head.org/src/include/access/readahead.h head/src/include/access/readahead.h
*** head.org/src/include/access/readahead.h	1970-01-01 09:00:00.000000000 +0900
--- head/src/include/access/readahead.h	2008-12-26 12:24:15.000000000 +0900
***************
*** 0 ****
--- 1,31 ----
+ /*-------------------------------------------------------------------------
+  *
+  * readahead.h
+  *		Store information of data pages which should be read ahead.
+  *
+  * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation
+  * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *-------------------------------------------------------------------------
+  */
+ 
+ #ifndef READAHEAD_H
+ #define READAHEAD_H
+ 
+ #include "postgres.h"
+ #include "storage/relfilenode.h"
+ #include "storage/block.h"
+ #include "access/xlogdefs.h"
+ #include "access/xlog.h"
+ 
+ /*
+  * Prototypes of the public readahead functions.
+  */
+ void ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ 	bool has_fpw);
+ bool ReadAheadHasRoom(int num);
+ void ReadAheadExecute(void);
+ 
+ #endif /* READAHEAD_H */
+ 
diff -rcN head.org/src/include/access/xlog.h head/src/include/access/xlog.h
*** head.org/src/include/access/xlog.h	2008-05-12 17:35:05.000000000 +0900
--- head/src/include/access/xlog.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 196,201 ****
--- 196,202 ----
  
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool xlog_readahead(XLogRecPtr lsn, XLogRecord *rptr);
  
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
diff -rcN head.org/src/include/access/xlog_internal.h head/src/include/access/xlog_internal.h
*** head.org/src/include/access/xlog_internal.h	2008-08-11 20:05:11.000000000 +0900
--- head/src/include/access/xlog_internal.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 235,240 ****
--- 235,241 ----
  	void		(*rm_startup) (void);
  	void		(*rm_cleanup) (void);
  	bool		(*rm_safe_restartpoint) (void);
+ 	bool		(*rm_readahead) (XLogRecPtr lsn, XLogRecord *rptr);
  } RmgrData;
  
  extern const RmgrData RmgrTable[];
diff -rcN head.org/src/include/commands/sequence.h head/src/include/commands/sequence.h
*** head.org/src/include/commands/sequence.h	2008-05-17 08:36:05.000000000 +0900
--- head/src/include/commands/sequence.h	2008-12-26 12:10:54.000000000 +0900
***************
*** 98,102 ****
--- 98,103 ----
  
  extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr);
  extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool seq_readahead(XLogRecPtr lsn, XLogRecord *record);
  
  #endif   /* SEQUENCE_H */
diff -rcN head.org/src/include/pg_config.h.in head/src/include/pg_config.h.in
*** head.org/src/include/pg_config.h.in	2008-12-11 16:34:08.000000000 +0900
--- head/src/include/pg_config.h.in	2008-12-26 12:10:54.000000000 +0900
***************
*** 99,105 ****
  #undef HAVE_DECL_F_FULLFSYNC
  
  /* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you
!    don't. */
  #undef HAVE_DECL_POSIX_FADVISE
  
  /* Define to 1 if you have the declaration of `snprintf', and to 0 if you
--- 99,106 ----
  #undef HAVE_DECL_F_FULLFSYNC
  
  /* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you
!    don't. (posix_fadvise() is abi-incompatible with a non-prototyped call so
!    this is important) */
  #undef HAVE_DECL_POSIX_FADVISE
  
  /* Define to 1 if you have the declaration of `snprintf', and to 0 if you
***************
*** 339,344 ****
--- 340,348 ----
  /* Define to 1 if you have the POSIX signal interface. */
  #undef HAVE_POSIX_SIGNALS
  
+ /* Define to 1 if you have the posix_fadvise() call */
+ #undef HAVE_POSIX_FADVISE
+ 
  /* Define to 1 if you have the `pstat' function. */
  #undef HAVE_PSTAT
  
***************
*** 505,510 ****
--- 509,517 ----
  /* Define to 1 if you have the `sysconf' function. */
  #undef HAVE_SYSCONF
  
+ /* Define to 1 if you have the `confstr' call */
+ #undef HAVE_CONFSTR
+ 
  /* Define to 1 if you have the syslog interface. */
  #undef HAVE_SYSLOG
  
***************
*** 731,736 ****
--- 738,748 ----
  /* Define to 1 to build with PAM support. (--with-pam) */
  #undef USE_PAM
  
+ /* Define to 1 to use posix_fadvise() (Many platforms have buggy or ineffective
+  * implementations. We can test for some known bugs but it's hard to test for
+  * effectiveness) */
+ #undef USE_POSIX_FADVISE
+ 
  /* Use replacement snprintf() functions. */
  #undef USE_REPL_SNPRINTF
  
#2Gregory Stark
stark@enterprisedb.com
In reply to: Koichi Suzuki (#1)
Re: V3 of PITR performance improvement for 8.4 (WIP)

"Koichi Suzuki" <koichi.szk@gmail.com> writes:

This is the V3 of PITR performance improvement (readahead). The
change of the code is as follows:

1) Now readahead is integrated into the core so that it can deal with
sync.rep's log shipping.
2) posix_fadvise() call was integrated with Greg Stark's patch.

Wow, this is really cool. It integrates into core the readahead of WAL records
using a new RM method for each WAL type. That's great.

I haven't looked closely yet, but I assume this avoids the code duplication? I did
notice there's a whole queue data structure for planning which blocks to
prefetch. Why is that necessary instead of just keeping a count of how many
blocks have been prefetched? Does it help avoid prefetching the same blocks
repeatedly?

--
Gregory Stark
EnterpriseDB http://www.enterprisedb.com
Ask me about EnterpriseDB's On-Demand Production Tuning