diff -rcN pgsql.posix_fadvise/src/backend/access/gin/ginxlog.c pgsql/src/backend/access/gin/ginxlog.c
*** pgsql.posix_fadvise/src/backend/access/gin/ginxlog.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/gin/ginxlog.c	2009-01-09 20:32:13.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
  
  #include "access/gin.h"
  #include "access/xlogutils.h"
+ #include "access/readahead.h"
  #include "storage/bufmgr.h"
  #include "utils/memutils.h"
  
***************
*** 519,524 ****
--- 520,635 ----
  	}
  }
  
+ /*
+  * gin_readahead - enqueue the data pages a GIN WAL record will touch
+  *
+  * Called before the record is redone, so that the readahead module can
+  * prefetch the pages that are modified through redo-ing the record.
+  *
+  * Returns false, without queueing anything, when the readahead queue has no
+  * room for all pages of this record; the caller must get the queue emptied
+  * and call again.  Returns true otherwise.
+  */
+ bool
+ gin_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_GIN_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIN_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_CREATE_PTREE:
+ 			{
+ 				ginxlogCreatePostingTree *data =
+ 					(ginxlogCreatePostingTree *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_INSERT:
+ 			{
+ 				ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_SPLIT:
+ 			{
+ 				int			readahead_cnt;
+ 				ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 2;
+ 				if (data->isRootSplit)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				ReadAheadAddEntry(data->node, data->lblkno, lsn.xrecoff, false);
+ 				ReadAheadAddEntry(data->node, data->rblkno, lsn.xrecoff, false);
+ 				if (data->isRootSplit)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->rootBlkno,
+ 							lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIN_VACUUM_PAGE:
+ 			{
+ 				ginxlogVacuumPage *data =
+ 					(ginxlogVacuumPage *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_DELETE_PAGE:
+ 			{
+ 				int			readahead_cnt;
+ 				ginxlogDeletePage *data =
+ 					(ginxlogDeletePage *) XLogRecGetData(record);
+ 
+ 				/* target page and parent, plus the left sibling if any */
+ 				readahead_cnt = 2;
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				ReadAheadAddEntry(data->node, data->parentBlkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->leftBlkno,
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
  void
  gin_xlog_startup(void)
  {
diff -rcN pgsql.posix_fadvise/src/backend/access/gist/gistxlog.c pgsql/src/backend/access/gist/gistxlog.c
*** pgsql.posix_fadvise/src/backend/access/gist/gistxlog.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/gist/gistxlog.c	2009-01-09 20:32:13.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
  
  #include "access/gist_private.h"
  #include "access/xlogutils.h"
+ #include "access/readahead.h"
  #include "miscadmin.h"
  #include "storage/bufmgr.h"
  #include "utils/memutils.h"
***************
*** 500,505 ****
--- 501,588 ----
  	}
  }
  
+ /*
+  * gist_readahead - enqueue the data pages a GiST WAL record will touch
+  *
+  * Called before the record is redone, so that the readahead module can
+  * prefetch the pages that are modified through redo-ing the record.
+  *
+  * Returns false, without queueing anything, when the readahead queue has no
+  * room for all pages of this record; the caller must get the queue emptied
+  * and call again.  Returns true otherwise.
+  */
+ bool
+ gist_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_GIST_PAGE_UPDATE:
+ 		case XLOG_GIST_NEW_ROOT:
+ 			{
+ 				PageUpdateRecord xlrec;
+ 
+ 				decodePageUpdateRecord(&xlrec, record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec.data->node, xlrec.data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_SPLIT:
+ 			{
+ 				int			i;
+ 				PageSplitRecord rec;
+ 
+ 				decodePageSplitRecord(&rec, record);
+ 
+ 				if (!ReadAheadHasRoom(rec.data->npage))
+ 					return false;
+ 				for (i = 0; i < rec.data->npage; i++)
+ 				{
+ 					ReadAheadAddEntry(rec.data->node, rec.page[i].header->blkno,
+ 							lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIST_INSERT_COMPLETE:
+ 			{
+ 				/*
+ 				 * This WAL record never touches a data page, so there is
+ 				 * nothing to do.
+ 				 */
+ 				break;
+ 			}
+ 		case XLOG_GIST_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIST_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_DELETE:
+ 			{
+ 				gistxlogPageDelete *xldata =
+ 					(gistxlogPageDelete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xldata->node, xldata->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
  IndexTuple
  gist_form_invalid_tuple(BlockNumber blkno)
  {
diff -rcN pgsql.posix_fadvise/src/backend/access/heap/heapam.c pgsql/src/backend/access/heap/heapam.c
*** pgsql.posix_fadvise/src/backend/access/heap/heapam.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/heap/heapam.c	2009-01-09 20:32:11.000000000 +0900
***************
*** 42,47 ****
--- 42,48 ----
  #include "access/heapam.h"
  #include "access/hio.h"
  #include "access/multixact.h"
+ #include "access/readahead.h"
  #include "access/relscan.h"
  #include "access/sysattr.h"
  #include "access/transam.h"
***************
*** 4970,4975 ****
--- 4971,5139 ----
  }
  
  /*
+  * heap_readahead - enqueue the data pages a heap WAL record will touch
+  *
+  * Called before the record is redone, so that the readahead module can
+  * prefetch the pages that are modified through redo-ing the record.
+  *
+  * Returns false, without queueing anything, when the readahead queue has no
+  * room for all pages of this record; the caller must get the queue emptied
+  * and call again.  Returns true otherwise.
+  */
+ bool
+ heap_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP_INSERT:
+ 			{
+ 				xl_heap_insert *xlrec =
+ 					(xl_heap_insert *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_DELETE:
+ 			{
+ 				xl_heap_delete *xlrec =
+ 					(xl_heap_delete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_UPDATE:
+ 		case XLOG_HEAP_MOVE:
+ 		case XLOG_HEAP_HOT_UPDATE:
+ 			{
+ 				bool		samepage;
+ 				xl_heap_update *xlrec =
+ 					(xl_heap_update *) XLogRecGetData(record);
+ 
+ 				samepage = ItemPointerGetBlockNumber(&xlrec->newtid) ==
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid);
+ 
+ 				if (!ReadAheadHasRoom(1 + (samepage ? 0 : 1)))
+ 					return false;
+ 				/* store page which contains updated tuple. */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* store another page if any. */
+ 				if (!samepage)
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 							ItemPointerGetBlockNumber(&xlrec->newtid),
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_NEWPAGE:
+ 			{
+ 				xl_heap_newpage *xlrec =
+ 					(xl_heap_newpage *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->blkno,
+ 						lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_LOCK:
+ 			{
+ 				xl_heap_lock *xlrec =
+ 					(xl_heap_lock *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_INPLACE:
+ 			{
+ 				xl_heap_inplace *xlrec =
+ 					(xl_heap_inplace *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ /*
+  * heap2_readahead - enqueue the data pages a heap2 WAL record will touch
+  *
+  * Called before the record is redone, so that the readahead module can
+  * prefetch the pages that are modified through redo-ing the record.
+  *
+  * Returns false, without queueing anything, when the readahead queue has no
+  * room for all pages of this record; the caller must get the queue emptied
+  * and call again.  Returns true otherwise.
+  */
+ bool
+ heap2_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 
+ 	/*
+ 	 * Mask off the XLR_INFO_MASK bits before dispatching, exactly as
+ 	 * heap_readahead does.  Switching on the raw xl_info would match no
+ 	 * case for a record that has any of those bits (e.g. XLR_BKP_BLOCK_1)
+ 	 * set.
+ 	 */
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP2_FREEZE:
+ 			{
+ 				xl_heap_freeze *xlrec =
+ 					(xl_heap_freeze *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP2_CLEAN:
+ 		case XLOG_HEAP2_CLEAN_MOVE:
+ 			{
+ 				xl_heap_clean *xlrec =
+ 					(xl_heap_clean *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ /*
   * heap_sync - sync a heap, for use when no WAL has been written
   *
   * This forces the heap contents (including TOAST heap if any) down to disk.
diff -rcN pgsql.posix_fadvise/src/backend/access/nbtree/nbtxlog.c pgsql/src/backend/access/nbtree/nbtxlog.c
*** pgsql.posix_fadvise/src/backend/access/nbtree/nbtxlog.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/nbtree/nbtxlog.c	2009-01-09 20:32:13.000000000 +0900
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "access/nbtree.h"
  #include "access/transam.h"
+ #include "access/readahead.h"
  #include "storage/bufmgr.h"
  
  /*
***************
*** 878,883 ****
--- 879,1024 ----
  	}
  }
  
+ /*
+  * btree_readahead - enqueue the data pages a btree WAL record will touch
+  *
+  * Called before the record is redone, so that the readahead module can
+  * prefetch the pages that are modified through redo-ing the record.
+  *
+  * Returns false, without queueing anything, when the readahead queue has no
+  * room for all pages of this record; the caller must get the queue emptied
+  * and call again.  Returns true otherwise.
+  */
+ bool
+ btree_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_BTREE_INSERT_LEAF:
+ 		case XLOG_BTREE_INSERT_UPPER:
+ 		case XLOG_BTREE_INSERT_META:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_insert *xlrec =
+ 					(xl_btree_insert *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 1;
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						BlockIdGetBlockNumber(&xlrec->target.tid.ip_blkid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 							BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_SPLIT_L:
+ 		case XLOG_BTREE_SPLIT_L_ROOT:
+ 		case XLOG_BTREE_SPLIT_R:
+ 		case XLOG_BTREE_SPLIT_R_ROOT:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_split *xlrec =
+ 					(xl_btree_split *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 2;
+ 				if (xlrec->rnext != P_NONE)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rightsib,
+ 						lsn.xrecoff, false);
+ 				ReadAheadAddEntry(xlrec->node, xlrec->leftsib,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (xlrec->rnext != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->node, xlrec->rnext,
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE:
+ 			{
+ 				xl_btree_delete *xlrec =
+ 					(xl_btree_delete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE_PAGE:
+ 		case XLOG_BTREE_DELETE_PAGE_META:
+ 		case XLOG_BTREE_DELETE_PAGE_HALF:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_delete_page *xlrec =
+ 					(xl_btree_delete_page *) XLogRecGetData(record);
+ 
+ 				readahead_cnt = 3;	/* parent, rightsib and target page */
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 					readahead_cnt++;
+ 				if (xlrec->leftblk != P_NONE)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				/* parent page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* rightsib page */
+ 				ReadAheadAddEntry(xlrec->target.node, xlrec->rightblk,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				/* leftsib page, if exists */
+ 				if (xlrec->leftblk != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node, xlrec->leftblk,
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				/* target page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						xlrec->deadblk, lsn.xrecoff, false);
+ 				/* metapage, if exists */
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 							BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_NEWROOT:
+ 			{
+ 				xl_btree_newroot *xlrec =
+ 					(xl_btree_newroot *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				/* FPW does not exist. */
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rootblk,
+ 						lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
  void
  btree_xlog_startup(void)
  {
diff -rcN pgsql.posix_fadvise/src/backend/access/transam/Makefile pgsql/src/backend/access/transam/Makefile
*** pgsql.posix_fadvise/src/backend/access/transam/Makefile	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/transam/Makefile	2009-01-09 20:32:13.000000000 +0900
***************
*** 12,18 ****
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 12,18 ----
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o readahead.o
  
  include $(top_srcdir)/src/backend/common.mk
  
diff -rcN pgsql.posix_fadvise/src/backend/access/transam/readahead.c pgsql/src/backend/access/transam/readahead.c
*** pgsql.posix_fadvise/src/backend/access/transam/readahead.c	1970-01-01 09:00:00.000000000 +0900
--- pgsql/src/backend/access/transam/readahead.c	2009-01-09 20:36:13.000000000 +0900
***************
*** 0 ****
--- 1,209 ----
+ /*-------------------------------------------------------------------------
+  *
+  * readahead.c
+  *	  Store information of data pages which should be read ahead.
+  *
+  * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation
+  * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *-------------------------------------------------------------------------
+  */
+ 
+ #include "postgres.h"
+ 
+ #include "access/readahead.h"
+ #include "access/xlog_internal.h"
+ #include "catalog/catalog.h"
+ #include "storage/relfilenode.h"
+ #include "storage/block.h"
+ #include "storage/smgr.h"
+ 
+ /*
+  * Information about one data page which will be read ahead.
+  */
+ struct XLogReadAhead {
+ 	/*
+ 	 * The physical location of the data page.
+ 	 */
+ 	RelFileNode node;
+ 	BlockNumber blkno;
+ 
+ 	/*
+ 	 * xrecoff is the byte offset of location in the WAL segment file as
+ 	 * defined in xlogdefs.h. The read ahead command does not deal with more
+ 	 * than one WAL segment file at once, and xlogid is not going to be changed
+ 	 * during read-ahead. This is why we need only xrecoff.
+ 	 */
+ 	uint32 xrecoff;
+ 
+ 	/*
+ 	 * has_fpw indicates whether a WAL record contains full page write or not.
+ 	 * This is used to skip unnecessary read-aheads.
+ 	 */
+ 	bool has_fpw;
+ };
+ typedef struct XLogReadAhead XLogReadAhead;
+ 
+ /*
+  * ReadAheadQueueSize is the fixed capacity, in entries, of the XLogReadAhead
+  * queue; it is sized so that the queue occupies 16MB.
+  * When the number of queued entries reaches this amount, we execute readahead.
+  */
+ #define ReadAheadQueueSize (16 * 1024 * 1024 / sizeof(XLogReadAhead))
+ 
+ /* The queue for XLogReadAhead entries. */
+ static XLogReadAhead ReadAheadQueue[ReadAheadQueueSize];
+ 
+ /* The number of XLogReadAhead entries currently used. */
+ static uint32 ReadAheadQueueUsed = 0;
+ 
+ /* prototype of local function */
+ static int ReadAheadCompare(const void *l, const void *r);
+ 
+ /*
+  * Append a new XLogReadAhead entry to the queue
+  *
+  * node/blkno identify the page, xrecoff is the xrecoff of the LSN passed for
+  * the WAL record that touches it, and has_fpw tells whether that record
+  * contains a full page write for the page.
+  *
+  * If the queue is already full, the queued pages are prefetched first (which
+  * empties the queue) and the new entry is then stored in the empty queue.
+  */
+ void
+ ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ 				  bool has_fpw)
+ {
+ 	/* all entries are used, so prefetch pages and make the queue empty */
+ 	if (ReadAheadQueueUsed >= ReadAheadQueueSize)
+ 	{
+ 		ReadAheadExecute();
+ 	}
+ 
+ 	/* Append the new entry at the tail of the queue. */
+ 	ReadAheadQueue[ReadAheadQueueUsed].node = node;
+ 	ReadAheadQueue[ReadAheadQueueUsed].blkno = blkno;
+ 	ReadAheadQueue[ReadAheadQueueUsed].xrecoff = xrecoff;
+ 	ReadAheadQueue[ReadAheadQueueUsed].has_fpw = has_fpw;
+ 
+ 	ReadAheadQueueUsed++;
+ }
+ 
+ /*
+  * ReadAhead queue availability check
+  *
+  * Return true if the XLogReadAhead queue has room for num more entries,
+  * false otherwise.  The queue has a fixed size and is never enlarged, so on
+  * false the caller has to get the queue emptied (ReadAheadExecute) before
+  * it can add the entries.
+  */
+ bool
+ ReadAheadHasRoom(int num)
+ {
+ 	return (ReadAheadQueueUsed + num <= ReadAheadQueueSize);
+ }
+ 
+ /*
+  * Check whether info1 and info2 point to the same data page.
+  */
+ #define IS_SAME_PAGE(info1, info2) \
+ 	(RelFileNodeEquals((info1).node, (info2).node) && \
+ 	 (info1).blkno == (info2).blkno)
+ 
+ /*
+  * Execute read ahead data pages
+  *
+  * Before we actually read ahead data pages, sort the XLogReadAhead in the queue
+  * for avoiding duplicated disk access and hopefully, reducing seek time.
+  * We also skip read ahead of data pages which have a full page write.
+  *
+  * The queue is empty when this function returns.
+  */
+ void
+ ReadAheadExecute(void)
+ {
+ 	uint32		i;
+ 	SMgrRelation reln;
+ 	XLogReadAhead last_entry = { { 0, 0, 0, }, 0, 0, false };
+ 
+ 	ereport(DEBUG1, (errmsg("%u blocks are prefetch canditate",
+ 							ReadAheadQueueUsed)));
+ 
+ 	/* Sort the XLogReadAhead queue for effective disk access. */
+ 	qsort(ReadAheadQueue, ReadAheadQueueUsed, sizeof(XLogReadAhead),
+ 		  ReadAheadCompare);
+ 
+ 	for (i = 0; i < ReadAheadQueueUsed; i++)
+ 	{
+ 		/* Skip a page already seen in the previous entry, or having FPW. */
+ 		if (IS_SAME_PAGE(last_entry, ReadAheadQueue[i]) ||
+ 			ReadAheadQueue[i].has_fpw)
+ 		{
+ 			last_entry = ReadAheadQueue[i];
+ 			continue;
+ 		}
+ 
+ 		/* Create SMgrRelation object */
+ 		reln = smgropen(ReadAheadQueue[i].node);
+ 
+ 		/* Read ahead with prefetch API */
+ 		smgrprefetch(reln, MAIN_FORKNUM, ReadAheadQueue[i].blkno);
+ 
+ 		/* Store XLogReadAhead to skip duplicate pages. */
+ 		last_entry = ReadAheadQueue[i];
+ 	}
+ 
+ 	ReadAheadQueueUsed = 0;
+ }
+ 
+ /*
+  * Compare two XLogReadAhead objects
+  *
+  * When l > r, then return 1, l == r, then return 0, and l < r, then return -1.
+  * The priority of comparison clauses is shown below;
+  *	1. node.spcNode
+  *	2. node.dbNode
+  *	3. node.relNode
+  *	4. blkno
+  *	5. 
xrecoff
+  */
+ static int
+ ReadAheadCompare(const void *l, const void *r)
+ {
+ 	const XLogReadAhead *left = (const XLogReadAhead *) l;
+ 	const XLogReadAhead *right = (const XLogReadAhead *) r;
+ 
+ 	/* compare node.spcNode */
+ 	if (left->node.spcNode > right->node.spcNode)
+ 		return 1;
+ 	else if (left->node.spcNode < right->node.spcNode)
+ 		return -1;
+ 
+ 	/* compare node.dbNode */
+ 	if (left->node.dbNode > right->node.dbNode)
+ 		return 1;
+ 	else if (left->node.dbNode < right->node.dbNode)
+ 		return -1;
+ 
+ 	/* compare node.relNode */
+ 	if (left->node.relNode > right->node.relNode)
+ 		return 1;
+ 	else if (left->node.relNode < right->node.relNode)
+ 		return -1;
+ 
+ 	/* compare blkno */
+ 	if (left->blkno > right->blkno)
+ 		return 1;
+ 	else if (left->blkno < right->blkno)
+ 		return -1;
+ 
+ 	/* compare xrecoff */
+ 	if (left->xrecoff > right->xrecoff)
+ 		return 1;
+ 	else if (left->xrecoff < right->xrecoff)
+ 		return -1;
+ 
+ 	/* These two XLogReadAhead are same. */
+ 	return 0;
+ }
diff -rcN pgsql.posix_fadvise/src/backend/access/transam/rmgr.c pgsql/src/backend/access/transam/rmgr.c
*** pgsql.posix_fadvise/src/backend/access/transam/rmgr.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/transam/rmgr.c	2009-01-09 20:32:12.000000000 +0900
***************
*** 24,43 ****
  
  
  const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! 	{"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL},
! 	{"Transaction", xact_redo, xact_desc, NULL, NULL, NULL},
! 	{"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL},
! 	{"CLOG", clog_redo, clog_desc, NULL, NULL, NULL},
! 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL},
! 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
! 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
! 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
! 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
! 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
! 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
! 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
! 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
! 	{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint},
! 	{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint},
! 	{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}
  };
--- 24,43 ----
  
  
  const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! 	{"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL, NULL},
! 	{"Transaction", xact_redo, xact_desc, NULL, NULL, NULL, NULL},
! 	{"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL, NULL},
! 	{"CLOG", clog_redo, clog_desc, NULL, NULL, NULL, NULL},
! 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL, NULL},
! 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL, NULL},
! 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL, NULL},
! 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL, NULL},
! 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL, NULL},
! 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL, heap2_readahead},
! 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL, heap_readahead},
! 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint, btree_readahead},
! 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL, NULL},
! 	{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint, gin_readahead},
! 	{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint, gist_readahead},
! 	{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL, seq_readahead}
  };
diff -rcN pgsql.posix_fadvise/src/backend/access/transam/xlog.c pgsql/src/backend/access/transam/xlog.c
*** pgsql.posix_fadvise/src/backend/access/transam/xlog.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/access/transam/xlog.c	2009-01-09 20:32:12.000000000 +0900
***************
*** 24,29 ****
--- 24,30 ----
  
  #include "access/clog.h"
  #include "access/multixact.h"
+ #include "access/readahead.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
  #include "access/tuptoaster.h"
***************
*** 381,386 ****
--- 382,398 ----
  static char *readRecordBuf = NULL;
  static uint32 readRecordBufSize = 0;
  
+ /*
+  * Buffer for queued WAL records (fixed size)
+  *
+  * This buffer is used for holding WAL records and their LSNs. When all the WAL
+  * records of one WAL segment file are read, redo them and make the buffer
+  * empty. Therefore, twice of XLogSegSize, determined by the total size of WAL
+  * records and LSNs, must be enough for the buffer.
+  */
+ static char RecordQueueBuf[XLogSegSize * 2];
+ static uint32 RecordQueueBufUsed = 0;
+ 
  /* State information for XLOG reading */
  static XLogRecPtr ReadRecPtr;	/* start of last record read */
  static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
***************
*** 441,446 ****
--- 453,461 ----
  static void rm_redo_error_callback(void *arg);
  static int	get_sync_bit(int method);
  
+ static void PushRecord(XLogRecPtr lsn, XLogRecord *record);
+ static bool PushReadAhead(XLogRecPtr lsn, XLogRecord *record);
+ static void RedoRecords(void);
  
  /*
   * Insert an XLOG record having the specified RMID and info bytes,
***************
*** 2364,2369 ****
--- 2379,2386 ----
  	ListCell   *cell;
  	int			fd;
  
+ 	ereport(DEBUG1, (errmsg("XLOG switch to %X/%X", log, seg)));
+ 
  	/*
  	 * Loop looking for a suitable timeline ID: we might need to read any of
  	 * the timelines listed in expectedTLIs.
***************
*** 3424,3429 ****
--- 3441,3453 ----
  	return (XLogRecord *) buffer;
  
  next_record_is_invalid:;
+ 	/*
+ 	 * Reached to unused area of current WAL segment file, redo all of WAL
+ 	 * records in the queue.
+ 	 */
+ 	ereport(DEBUG1, (errmsg("next record is invalid(maybe unused area)")));
+ 	RedoRecords();
+ 
  	if (readFile >= 0)
  	{
  		close(readFile);
***************
*** 5154,5160 ****
  		{
  			bool		recoveryContinue = true;
  			bool		recoveryApply = true;
- 			ErrorContextCallback errcontext;
  
  			InRedo = true;
  			ereport(LOG,
--- 5178,5183 ----
***************
*** 5196,5228 ****
  					break;
  				}
  
! 				/* Setup error traceback support for ereport() */
! 				errcontext.callback = rm_redo_error_callback;
! 				errcontext.arg = (void *) record;
! 				errcontext.previous = error_context_stack;
! 				error_context_stack = &errcontext;
! 
! 				/* nextXid must be beyond record's xid */
! 				if (TransactionIdFollowsOrEquals(record->xl_xid,
! 												 ShmemVariableCache->nextXid))
  				{
! 					ShmemVariableCache->nextXid = record->xl_xid;
! 					TransactionIdAdvance(ShmemVariableCache->nextXid);
  				}
  
! 				if (record->xl_info & XLR_BKP_BLOCK_MASK)
! 					RestoreBkpBlocks(record, EndRecPtr);
! 
! 				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
! 
! 				/* Pop the error context stack */
! 				error_context_stack = errcontext.previous;
  
  				LastRec = ReadRecPtr;
  
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
  			/*
  			 * end of main redo apply loop
  			 */
--- 5219,5262 ----
  					break;
  				}
  
! 				/*
! 				 * The record is queued for delayed redo together with its LSN.
! 				 * If the WAL record queue cannot hold it, redo all WAL records
! 				 * in the queue first, which makes the queue empty.
! 				 */
! 				ereport(DEBUG1,
! 					(errmsg("WAL record queue is used %u(%u) bytes at %X/%08X.",
! 						RecordQueueBufUsed, record->xl_tot_len,
! 						EndRecPtr.xlogid, EndRecPtr.xrecoff)));
! 				if (RecordQueueBufUsed + MAXALIGN(sizeof(XLogRecPtr)) +
! 					MAXALIGN(record->xl_tot_len) > sizeof(RecordQueueBuf))
  				{
! 					ereport(DEBUG1, (errmsg("WAL record queue is full")));
! 					RedoRecords();
  				}
  
! 				/*
! 				 * Queue the pages to prefetch before queueing the record itself,
! 				 * so that a full readahead queue is flushed (together with the
! 				 * already queued records) without replaying this record early.
! 				 */
! 				while (!PushReadAhead(EndRecPtr, record))
! 				{
! 					ereport(DEBUG1, (errmsg("ReadAhead queue is full.")));
! 					RedoRecords();
! 				}
! 
! 				PushRecord(EndRecPtr, record);
  
  				LastRec = ReadRecPtr;
  
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
+ 			/* All WAL records are read, redo all queued WAL records. */
+ 			ereport(DEBUG1, (errmsg("end of redo apply loop")));
+ 			RedoRecords();
+ 
  			/*
  			 * end of main redo apply loop
  			 */
***************
*** 5447,5452 ****
--- 5481,5583 ----
  }
  
  /*
+  * Push the pair of WAL record and its LSN.
+  * Both WAL records and LSNs are aligned as same as WAL segment file.
+  *
+  * The caller must have checked that RecordQueueBuf has room for the pair.
+  */
+ static void
+ PushRecord(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	Assert(record);
+ 
+ 	memcpy(RecordQueueBuf + RecordQueueBufUsed, &lsn, sizeof(XLogRecPtr));
+ 	RecordQueueBufUsed += MAXALIGN(sizeof(XLogRecPtr));
+ 	memcpy(RecordQueueBuf + RecordQueueBufUsed, record, record->xl_tot_len);
+ 	RecordQueueBufUsed += MAXALIGN(record->xl_tot_len);
+ }
+ 
+ /*
+  * Push page information to readahead module.
+  *
+  * Returns false if the readahead routine of the record's resource manager
+  * reports failure; resource managers without one always succeed.
+  */
+ static bool
+ PushReadAhead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	Assert(record);
+ 
+ 	if (!RmgrTable[record->xl_rmid].rm_readahead)
+ 		return true;
+ 
+ 	return RmgrTable[record->xl_rmid].rm_readahead(lsn, record);
+ }
+ 
+ /*
+  * Redo all WAL records in the queue and make it empty.
+  *
+  * The pages queued in the readahead module are prefetched first.
+  */
+ static void
+ RedoRecords(void)
+ {
+ 	ErrorContextCallback errcontext;
+ 	uint32		off = 0;
+ 
+ 	/* Readahead data pages which will be modified during redo. */
+ 	ReadAheadExecute();
+ 
+ 	while (off < RecordQueueBufUsed)
+ 	{
+ 		XLogRecPtr	lsn;
+ 		XLogRecord *record;
+ 
+ 		/*
+ 		 * Extract LSN and WAL record image from local buffer.
+ 		 * NOTE(review): the cast below assumes RecordQueueBuf itself starts
+ 		 * on a MAXALIGN boundary, which a plain char array does not
+ 		 * guarantee -- confirm or force the alignment.
+ 		 */
+ 		memcpy(&lsn, RecordQueueBuf + off, sizeof(XLogRecPtr));
+ 		off += MAXALIGN(sizeof(XLogRecPtr));
+ 		record = (XLogRecord *) (RecordQueueBuf + off);
+ 
+ 		/* Setup error traceback support for ereport() */
+ 		errcontext.callback = rm_redo_error_callback;
+ 		errcontext.arg = (void *) record;
+ 		errcontext.previous = error_context_stack;
+ 		error_context_stack = &errcontext;
+ 
+ 		/* nextXid must be beyond record's xid */
+ 		if (TransactionIdFollowsOrEquals(record->xl_xid,
+ 										 ShmemVariableCache->nextXid))
+ 		{
+ 			ShmemVariableCache->nextXid = record->xl_xid;
+ 			TransactionIdAdvance(ShmemVariableCache->nextXid);
+ 		}
+ 
+ 		if (record->xl_info & XLR_BKP_BLOCK_MASK)
+ 			RestoreBkpBlocks(record, lsn);
+ 
+ 		/* Redo with WAL record and its LSN. */
+ 		RmgrTable[record->xl_rmid].rm_redo(lsn, record);
+ 
+ 		/* Pop the error context stack */
+ 		error_context_stack = errcontext.previous;
+ 
+ 		off += MAXALIGN(record->xl_tot_len);
+ 	}
+ 
+ 	/*
+ 	 * Make RecordQueueBuf empty.  Resetting the fill level is enough: nothing
+ 	 * reads past RecordQueueBufUsed, so the 2 * XLogSegSize bytes need not be
+ 	 * zeroed on every flush.
+ 	 */
+ 	RecordQueueBufUsed = 0;
+ }
+ 
+ /*
   * Subroutine to try to fetch and validate a prior checkpoint record.
   *
   * whichChkpt identifies the checkpoint (merely for reporting purposes).
diff -rcN pgsql.posix_fadvise/src/backend/commands/sequence.c pgsql/src/backend/commands/sequence.c
*** pgsql.posix_fadvise/src/backend/commands/sequence.c	2009-01-09 20:42:53.000000000 +0900
--- pgsql/src/backend/commands/sequence.c	2009-01-09 20:32:07.000000000 +0900
***************
*** 15,20 ****
--- 15,21 ----
  #include "postgres.h"
  
  #include "access/heapam.h"
+ #include "access/readahead.h"
  #include "access/transam.h"
  #include "access/xact.h"
  #include "access/xlogutils.h"
***************
*** 1382,1384 ****
--- 1383,1419 ----
  	appendStringInfo(buf, "rel %u/%u/%u",
  			   xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
  }
+ 
+ /*
+  * seq_readahead - enqueue the data page a sequence WAL record will touch
+  *
+  * Called before the record is redone, so that the readahead module can
+  * prefetch the page that is modified through redo-ing the record.
+  *
+  * Returns false, without queueing anything, when the readahead queue has no
+  * room for the page of this record; the caller must get the queue emptied
+  * and call again.  Returns true otherwise.
+  */
+ bool
+ seq_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_SEQ_LOG:
+ 			{
+ 				xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, 0, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
diff -rcN pgsql.posix_fadvise/src/backend/storage/smgr/md.c pgsql/src/backend/storage/smgr/md.c
*** pgsql.posix_fadvise/src/backend/storage/smgr/md.c	2009-01-09 20:43:22.000000000 +0900
--- pgsql/src/backend/storage/smgr/md.c	2009-01-09 20:32:09.000000000 +0900
***************
*** 559,565 ****
  	long		seekpos;
  	MdfdVec    *v;
  
! 	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
  
  	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
  	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
--- 559,567 ----
  	long		seekpos;
  	MdfdVec    *v;
  
! 	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_RETURN_NULL);
! 	if (!v)
! 
return; seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); Assert(seekpos < BLCKSZ * RELSEG_SIZE); diff -rcN pgsql.posix_fadvise/src/include/access/gin.h pgsql/src/include/access/gin.h *** pgsql.posix_fadvise/src/include/access/gin.h 2009-01-09 20:43:22.000000000 +0900 --- pgsql/src/include/access/gin.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 256,261 **** --- 256,262 ---- /* ginxlog.c */ extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); extern void gin_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool gin_readahead(XLogRecPtr lsn, XLogRecord *record); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); extern bool gin_safe_restartpoint(void); diff -rcN pgsql.posix_fadvise/src/include/access/gist_private.h pgsql/src/include/access/gist_private.h *** pgsql.posix_fadvise/src/include/access/gist_private.h 2009-01-09 20:42:53.000000000 +0900 --- pgsql/src/include/access/gist_private.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 250,255 **** --- 250,256 ---- /* gistxlog.c */ extern void gist_redo(XLogRecPtr lsn, XLogRecord *record); extern void gist_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool gist_readahead(XLogRecPtr lsn, XLogRecord *record); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); extern bool gist_safe_restartpoint(void); diff -rcN pgsql.posix_fadvise/src/include/access/heapam.h pgsql/src/include/access/heapam.h *** pgsql.posix_fadvise/src/include/access/heapam.h 2009-01-09 20:42:53.000000000 +0900 --- pgsql/src/include/access/heapam.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 124,131 **** --- 124,133 ---- extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool heap_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec); + extern 
bool heap2_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from, diff -rcN pgsql.posix_fadvise/src/include/access/nbtree.h pgsql/src/include/access/nbtree.h *** pgsql.posix_fadvise/src/include/access/nbtree.h 2009-01-09 20:42:53.000000000 +0900 --- pgsql/src/include/access/nbtree.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 591,596 **** --- 591,597 ---- */ extern void btree_redo(XLogRecPtr lsn, XLogRecord *record); extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool btree_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern void btree_xlog_startup(void); extern void btree_xlog_cleanup(void); extern bool btree_safe_restartpoint(void); diff -rcN pgsql.posix_fadvise/src/include/access/readahead.h pgsql/src/include/access/readahead.h *** pgsql.posix_fadvise/src/include/access/readahead.h 1970-01-01 09:00:00.000000000 +0900 --- pgsql/src/include/access/readahead.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 0 **** --- 1,31 ---- + /*------------------------------------------------------------------------- + * + * readahead.h + * Stores information about data pages that should be read ahead. + * + * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation + * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ + + #ifndef READAHEAD_H + #define READAHEAD_H + + #include "postgres.h" + #include "storage/relfilenode.h" + #include "storage/block.h" + #include "access/xlogdefs.h" + #include "access/xlog.h" + + /* + * Prototypes of public functions.
+ */ + void ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff, + bool has_fpw); + bool ReadAheadHasRoom(int num); + void ReadAheadExecute(void); + + #endif /* READAHEAD_H */ + diff -rcN pgsql.posix_fadvise/src/include/access/xlog.h pgsql/src/include/access/xlog.h *** pgsql.posix_fadvise/src/include/access/xlog.h 2009-01-09 20:42:53.000000000 +0900 --- pgsql/src/include/access/xlog.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 196,201 **** --- 196,202 ---- extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool xlog_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern void UpdateControlFile(void); extern Size XLOGShmemSize(void); diff -rcN pgsql.posix_fadvise/src/include/access/xlog_internal.h pgsql/src/include/access/xlog_internal.h *** pgsql.posix_fadvise/src/include/access/xlog_internal.h 2009-01-09 20:42:53.000000000 +0900 --- pgsql/src/include/access/xlog_internal.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 235,240 **** --- 235,241 ---- void (*rm_startup) (void); void (*rm_cleanup) (void); bool (*rm_safe_restartpoint) (void); + bool (*rm_readahead) (XLogRecPtr lsn, XLogRecord *rptr); } RmgrData; extern const RmgrData RmgrTable[]; diff -rcN pgsql.posix_fadvise/src/include/commands/sequence.h pgsql/src/include/commands/sequence.h *** pgsql.posix_fadvise/src/include/commands/sequence.h 2009-01-09 20:42:53.000000000 +0900 --- pgsql/src/include/commands/sequence.h 2009-01-09 20:32:07.000000000 +0900 *************** *** 98,102 **** --- 98,103 ---- extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool seq_readahead(XLogRecPtr lsn, XLogRecord *record); #endif /* SEQUENCE_H */