diff -rcN postgresql-CVS-01-21.org/src/backend/access/gin/ginxlog.c postgresql-CVS-01-21/src/backend/access/gin/ginxlog.c
*** postgresql-CVS-01-21.org/src/backend/access/gin/ginxlog.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/gin/ginxlog.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 14,19 ****
--- 14,20 ----
  #include "postgres.h"
  
  #include "access/gin.h"
+ #include "access/xlog.h"
  #include "access/xlogutils.h"
  #include "storage/bufmgr.h"
  #include "utils/memutils.h"
***************
*** 521,526 ****
--- 522,631 ----
  	}
  }
  
+ /*
+  * gin_readahead - enqueue information about data pages
+  *
+  * Registers with the readahead queue every page referenced by the given GIN
+  * WAL record.  Each entry carries the record's lsn.xrecoff and a flag that
+  * is non-zero when the matching XLR_BKP_BLOCK_n bit is set in xl_info.
+  *
+  * Returns false, queueing nothing, when ReadAheadHasRoom() reports that the
+  * pages of this record do not all fit; the caller is expected to drain the
+  * queue and call again.  Returns true otherwise, including for record types
+  * that are not handled here.
+  */
+ bool
+ gin_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	Assert(record);
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_GIN_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIN_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_CREATE_PTREE:
+ 			{
+ 				ginxlogCreatePostingTree *data =
+ 					(ginxlogCreatePostingTree *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIN_INSERT:
+ 			{
+ 				ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_SPLIT:
+ 			{
+ 				int			readahead_cnt;
+ 				ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
+ 
+ 				/* left and right halves, plus the root on a root split */
+ 				readahead_cnt = 2;
+ 				if (data->isRootSplit)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				ReadAheadAddEntry(data->node, data->lblkno, lsn.xrecoff, false);
+ 				ReadAheadAddEntry(data->node, data->rblkno, lsn.xrecoff, false);
+ 				if (data->isRootSplit)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->rootBlkno,
+ 							lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIN_VACUUM_PAGE:
+ 			{
+ 				ginxlogVacuumPage *data =
+ 					(ginxlogVacuumPage *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIN_DELETE_PAGE:
+ 			{
+ 				int			readahead_cnt;
+ 				ginxlogDeletePage *data =
+ 					(ginxlogDeletePage *) XLogRecGetData(record);
+ 
+ 				/* deleted page and its parent, plus the left sibling if any */
+ 				readahead_cnt = 2;
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 					readahead_cnt++;
+ 
+ 				/*
+ 				 * Ask for the number of entries actually added below; a
+ 				 * fixed 2 would under-reserve when a left sibling exists.
+ 				 */
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(data->node, data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				ReadAheadAddEntry(data->node, data->parentBlkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				if (data->leftBlkno != InvalidBlockNumber)
+ 				{
+ 					ReadAheadAddEntry(data->node, data->leftBlkno,
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
  void
  gin_xlog_startup(void)
  {
diff -rcN postgresql-CVS-01-21.org/src/backend/access/gist/gistxlog.c postgresql-CVS-01-21/src/backend/access/gist/gistxlog.c
*** postgresql-CVS-01-21.org/src/backend/access/gist/gistxlog.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/gist/gistxlog.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 14,19 ****
--- 14,20 ----
  #include "postgres.h"
  
  #include "access/gist_private.h"
+ #include "access/xlog.h"
  #include "access/xlogutils.h"
  #include "miscadmin.h"
  #include "storage/bufmgr.h"
***************
*** 501,506 ****
--- 502,585 ----
  	}
  }
  
+ /*
+  * gist_readahead - enqueue information about data pages
+  *
+  * The readahead module stores information about pages that are modified through
+  * redo-ing record.
+  *
+  * Returns false, queueing nothing, when ReadAheadHasRoom() reports that the
+  * pages of this record do not fit; returns true in every other case,
+  * including record types that are not handled here.
+  */
+ bool
+ gist_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	Assert(record);
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_GIST_PAGE_UPDATE:
+ 		case XLOG_GIST_NEW_ROOT:
+ 			{
+ 				PageUpdateRecord xlrec;
+ 
+ 				decodePageUpdateRecord(&xlrec, record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec.data->node, xlrec.data->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_SPLIT:
+ 			{
+ 				int			i;
+ 
+ 				PageSplitRecord rec;
+ 				/*
+ 				 * NOTE(review): if decodePageSplitRecord() allocates memory
+ 				 * for rec.page, nothing in this function releases it --
+ 				 * confirm against its definition.
+ 				 */
+ 				decodePageSplitRecord(&rec, record);
+ 
+ 				/* one queue entry per page produced by the split */
+ 				if (!ReadAheadHasRoom(rec.data->npage))
+ 					return false;
+ 				for (i = 0; i < rec.data->npage; i++)
+ 				{
+ 					ReadAheadAddEntry(rec.data->node, rec.page[i].header->blkno,
+ 							lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_GIST_INSERT_COMPLETE:
+ 			{
+ 				/*
+ 				 * This WAL record never touches a data page, so there is
+ 				 * nothing to do.
+ 				 */
+ 				break;
+ 			}
+ 		case XLOG_GIST_CREATE_INDEX:
+ 			{
+ 				RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(*node, GIST_ROOT_BLKNO, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_GIST_PAGE_DELETE:
+ 			{
+ 				gistxlogPageDelete *xldata =
+ 					(gistxlogPageDelete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xldata->node, xldata->blkno,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
  IndexTuple
  gist_form_invalid_tuple(BlockNumber blkno)
  {
diff -rcN postgresql-CVS-01-21.org/src/backend/access/heap/heapam.c postgresql-CVS-01-21/src/backend/access/heap/heapam.c
*** postgresql-CVS-01-21.org/src/backend/access/heap/heapam.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/heap/heapam.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 49,54 ****
--- 49,55 ----
  #include "access/valid.h"
  #include "access/visibilitymap.h"
  #include "access/xact.h"
+ #include "access/xlog.h"
  #include "access/xlogutils.h"
  #include "catalog/catalog.h"
  #include "catalog/namespace.h"
***************
*** 4975,4980 ****
--- 4976,5127 ----
  }
  
  /*
+  * heap_readahead - enqueue information about data pages
+  *
+  * The readahead module stores information about pages that are modified through
+  * redo-ing record.
+  *
+  * Returns false, queueing nothing, when ReadAheadHasRoom() reports that the
+  * pages of this record do not fit; returns true in every other case,
+  * including record types that are not handled here.
+  */
+ bool
+ heap_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	Assert(record);
+ 
+ 	/* dispatch on the operation bits selected by XLOG_HEAP_OPMASK */
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP_INSERT:
+ 			{
+ 				xl_heap_insert *xlrec =
+ 					(xl_heap_insert *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_DELETE:
+ 			{
+ 				xl_heap_delete *xlrec =
+ 					(xl_heap_delete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_UPDATE:
+ 		case XLOG_HEAP_MOVE:
+ 		case XLOG_HEAP_HOT_UPDATE:
+ 			{
+ 				bool		samepage;
+ 				xl_heap_update *xlrec =
+ 					(xl_heap_update *) XLogRecGetData(record);
+ 
+ 				/* old and new tuple versions may live on the same block */
+ 				samepage = ItemPointerGetBlockNumber(&xlrec->newtid) ==
+ 					ItemPointerGetBlockNumber(&xlrec->target.tid);
+ 
+ 				/* one entry when both tids share a block, two otherwise */
+ 				if (!ReadAheadHasRoom(1 + (samepage ? 0 : 1)))
+ 					return false;
+ 				/* queue the page named by target.tid */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* queue the page named by newtid when it is a different block */
+ 				if (!samepage)
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 							ItemPointerGetBlockNumber(&xlrec->newtid),
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_NEWPAGE:
+ 			{
+ 				xl_heap_newpage *xlrec =
+ 					(xl_heap_newpage *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->blkno,
+ 						lsn.xrecoff, false);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_LOCK:
+ 			{
+ 				xl_heap_lock *xlrec =
+ 					(xl_heap_lock *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP_INPLACE:
+ 			{
+ 				xl_heap_inplace *xlrec =
+ 					(xl_heap_inplace *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&xlrec->target.tid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ /*
+  * heap2_readahead - enqueue information about data pages
+  *
+  * The readahead module stores information about pages that are modified through
+  * redo-ing record.
+  *
+  * The record type is extracted from xl_info with the same masking that
+  * heap_readahead() uses; switching on the raw byte would miss a record
+  * whenever any other xl_info bit is set.
+  *
+  * Returns false, queueing nothing, when ReadAheadHasRoom() reports that the
+  * page of this record does not fit; returns true in every other case.
+  */
+ bool
+ heap2_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info;
+ 
+ 	Assert(record);
+ 
+ 	info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info & XLOG_HEAP_OPMASK)
+ 	{
+ 		case XLOG_HEAP2_FREEZE:
+ 			{
+ 				xl_heap_freeze *xlrec =
+ 					(xl_heap_freeze *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_HEAP2_CLEAN:
+ 		case XLOG_HEAP2_CLEAN_MOVE:
+ 			{
+ 				xl_heap_clean *xlrec =
+ 					(xl_heap_clean *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
+ /*
   * heap_sync - sync a heap, for use when no WAL has been written
   *
   * This forces the heap contents (including TOAST heap if any) down to disk.
diff -rcN postgresql-CVS-01-21.org/src/backend/access/nbtree/nbtxlog.c postgresql-CVS-01-21/src/backend/access/nbtree/nbtxlog.c
*** postgresql-CVS-01-21.org/src/backend/access/nbtree/nbtxlog.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/nbtree/nbtxlog.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "access/nbtree.h"
  #include "access/transam.h"
+ #include "access/xlog.h"
  #include "storage/bufmgr.h"
  
  /*
***************
*** 880,885 ****
--- 881,1016 ----
  	}
  }
  
+ /*
+  * btree_readahead - enqueue information about data pages
+  *
+  * Registers with the readahead queue every page referenced by the given
+  * btree WAL record.
+  *
+  * Returns false, queueing nothing, when ReadAheadHasRoom() reports that the
+  * pages of this record do not all fit; returns true in every other case,
+  * including record types that are not handled here.
+  */
+ bool
+ btree_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	Assert(record);
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_BTREE_INSERT_LEAF:
+ 		case XLOG_BTREE_INSERT_UPPER:
+ 		case XLOG_BTREE_INSERT_META:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_insert *xlrec =
+ 					(xl_btree_insert *) XLogRecGetData(record);
+ 
+ 				/* target page, plus the metapage for INSERT_META */
+ 				readahead_cnt = 1;
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						BlockIdGetBlockNumber(&xlrec->target.tid.ip_blkid),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (info == XLOG_BTREE_INSERT_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 							BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_SPLIT_L:
+ 		case XLOG_BTREE_SPLIT_L_ROOT:
+ 		case XLOG_BTREE_SPLIT_R:
+ 		case XLOG_BTREE_SPLIT_R_ROOT:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_split *xlrec =
+ 					(xl_btree_split *) XLogRecGetData(record);
+ 
+ 				/* both halves, plus the page after the right half if any */
+ 				readahead_cnt = 2;
+ 				if (xlrec->rnext != P_NONE)
+ 					readahead_cnt++;
+ 
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rightsib,
+ 						lsn.xrecoff, false);
+ 				ReadAheadAddEntry(xlrec->node, xlrec->leftsib,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				if (xlrec->rnext != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->node, xlrec->rnext,
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE:
+ 			{
+ 				xl_btree_delete *xlrec =
+ 					(xl_btree_delete *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, xlrec->block,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				break;
+ 			}
+ 		case XLOG_BTREE_DELETE_PAGE:
+ 		case XLOG_BTREE_DELETE_PAGE_META:
+ 		case XLOG_BTREE_DELETE_PAGE_HALF:
+ 			{
+ 				int			readahead_cnt;
+ 				xl_btree_delete_page *xlrec =
+ 					(xl_btree_delete_page *) XLogRecGetData(record);
+ 
+ 				/* parent, right sibling and target; maybe metapage and left */
+ 				readahead_cnt = 3;
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 					readahead_cnt++;
+ 				if (xlrec->leftblk != P_NONE)
+ 					readahead_cnt++;
+ 
+ 				/*
+ 				 * Reserve room for the whole record up front, as every other
+ 				 * case does, so the record is either queued completely or
+ 				 * not at all.
+ 				 */
+ 				if (!ReadAheadHasRoom(readahead_cnt))
+ 					return false;
+ 
+ 				/* parent page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_1);
+ 				/* rightsib page */
+ 				ReadAheadAddEntry(xlrec->target.node, xlrec->rightblk,
+ 						lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_2);
+ 				/* leftsib page, if exists */
+ 				if (xlrec->leftblk != P_NONE)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node, xlrec->leftblk,
+ 							lsn.xrecoff, record->xl_info & XLR_BKP_BLOCK_3);
+ 				}
+ 				/* target page */
+ 				ReadAheadAddEntry(xlrec->target.node,
+ 						xlrec->deadblk, lsn.xrecoff, false);
+ 				/* metapage, if exists */
+ 				if (info == XLOG_BTREE_DELETE_PAGE_META)
+ 				{
+ 					ReadAheadAddEntry(xlrec->target.node,
+ 							BTREE_METAPAGE, lsn.xrecoff, false);
+ 				}
+ 				break;
+ 			}
+ 		case XLOG_BTREE_NEWROOT:
+ 			{
+ 				xl_btree_newroot *xlrec =
+ 					(xl_btree_newroot *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				/* the root page is queued as carrying no full-page image */
+ 				ReadAheadAddEntry(xlrec->node, xlrec->rootblk,
+ 						lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
+ 
  void
  btree_xlog_startup(void)
  {
diff -rcN postgresql-CVS-01-21.org/src/backend/access/transam/Makefile postgresql-CVS-01-21/src/backend/access/transam/Makefile
*** postgresql-CVS-01-21.org/src/backend/access/transam/Makefile	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/transam/Makefile	2009-01-21 10:35:03.000000000 +0900
***************
*** 12,18 ****
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o
  
  include $(top_srcdir)/src/backend/common.mk
--- 12,18 ----
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o multixact.o twophase.o twophase_rmgr.o readahead.o
  
  include $(top_srcdir)/src/backend/common.mk
diff -rcN postgresql-CVS-01-21.org/src/backend/access/transam/readahead.c postgresql-CVS-01-21/src/backend/access/transam/readahead.c
*** postgresql-CVS-01-21.org/src/backend/access/transam/readahead.c	1970-01-01 09:00:00.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/transam/readahead.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 0 ****
--- 1,231 ----
+ /*-------------------------------------------------------------------------
+  *
+  * readahead.c
+  *	  Store information of data pages which should be read ahead.
+  *
+  * Original coding 2008, Koichi Suzuki.
+  *
+  * Portions Copyright (c) 1998-2009, PostgreSQL Global Development Group
+  *
+  *-------------------------------------------------------------------------
+  */
+ 
+ #include
+ #include
+ #include
+ #include
+ 
+ #include "postgres.h"
+ #include "access/xlog_internal.h"
+ #include "catalog/catalog.h"
+ #include "storage/relfilenode.h"
+ #include "storage/block.h"
+ #include "storage/smgr.h"
+ 
+ /*
+  * Information about the data page which will be read ahead.
+  *
+  * One entry is queued per page referenced by a WAL record.
+  */
+ struct XLogReadAhead {
+ 	/*
+ 	 * The physical location of the data page.
+ 	 */
+ 	RelFileNode node;
+ 	BlockNumber blkno;
+ 
+ 	/*
+ 	 * xrecoff is the byte offset of location in the WAL segment file as
+ 	 * defined in xlogdefs.h. The read ahead command does not deal with more
+ 	 * than one WAL segment file at once, and xlogid is not going to be changed
+ 	 * during read-ahead. This is why we need only xrecoff.
+ 	 */
+ 	uint32 xrecoff;
+ 
+ 	/*
+ 	 * has_fpw indicates whether a WAL record contains a full page write or
+ 	 * not. This is used to skip unnecessary read-aheads.
+ 	 */
+ 	bool has_fpw;
+ };
+ typedef struct XLogReadAhead XLogReadAhead;
+ 
+ /*
+  * ReadAheadQueueSize is the size, in bytes, of the XLogReadAhead queue
+  * (16MB).  The queue therefore holds
+  * ReadAheadQueueSize / sizeof(XLogReadAhead) entries; when that many entries
+  * are queued, we execute readahead.
+  */
+ #define ReadAheadQueueSize (16 * 1024 * 1024)
+ 
+ /* The queue for XLogReadAhead entries. */
+ static XLogReadAhead *ReadAheadQueue = NULL;
+ 
+ /* The number of XLogReadAhead entries currently used.
*/
+ static uint32 ReadAheadQueueUsed = 0;
+ 
+ /* prototype of local function */
+ static int	ReadAheadCompare(const void *l, const void *r);
+ 
+ /*
+  * Initialize the buffer for storing information about data pages
+  *
+  * Allocates ReadAheadQueueSize bytes for the queue and marks it empty.
+  * Allocation failure is reported with ereport(FATAL) rather than an Assert,
+  * because an Assert is compiled out of production builds and the first
+  * ReadAheadAddEntry() would then write through a NULL pointer.
+  */
+ void
+ ReadAheadInit(void)
+ {
+ 	/* Allocate a buffer for storing information about data pages */
+ 	ReadAheadQueue = (XLogReadAhead *) malloc(ReadAheadQueueSize);
+ 	if (ReadAheadQueue == NULL)
+ 		ereport(FATAL,
+ 				(errcode(ERRCODE_OUT_OF_MEMORY),
+ 				 errmsg("out of memory")));
+ 	ReadAheadQueueUsed = 0;
+ }
+ 
+ /*
+  * Append a new XLogReadAhead entry to the queue
+  *
+  * If every slot of the queue is already used, the queued pages are
+  * prefetched first (which empties the queue) and the new entry then becomes
+  * the first entry of the emptied queue.
+  */
+ void
+ ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
+ 				  bool has_fpw)
+ {
+ 	/* all entries are used, so prefetch pages and make the queue empty */
+ 	if (ReadAheadQueueUsed >= ReadAheadQueueSize / sizeof(XLogReadAhead))
+ 	{
+ 		ReadAheadExecute();
+ 	}
+ 
+ 	/* Append a new XLogReadAhead entry to the queue. */
+ 	ReadAheadQueue[ReadAheadQueueUsed].node = node;
+ 	ReadAheadQueue[ReadAheadQueueUsed].blkno = blkno;
+ 	ReadAheadQueue[ReadAheadQueueUsed].xrecoff = xrecoff;
+ 	ReadAheadQueue[ReadAheadQueueUsed].has_fpw = has_fpw;
+ 
+ 	ReadAheadQueueUsed++;
+ }
+ 
+ /*
+  * ReadAhead queue availability check
+  *
+  * Returns true if the queue has room for num more XLogReadAhead entries,
+  * false otherwise.  The queue has a fixed size; it is never enlarged here.
+  */
+ bool
+ ReadAheadHasRoom(int num)
+ {
+ 	uint32		capacity = ReadAheadQueueSize / sizeof(XLogReadAhead);
+ 
+ 	/* a negative count would wrap around in the unsigned sum below */
+ 	Assert(num >= 0);
+ 
+ 	return (ReadAheadQueueUsed + (uint32) num <= capacity);
+ }
+ 
+ /*
+  * Check whether info1 and info2 point same data page.
+  */
+ #define IS_SAME_PAGE(info1, info2) \
+ 	(RelFileNodeEquals((info1).node, (info2).node) && \
+ 	 (info1).blkno == (info2).blkno)
+ 
+ /*
+  * Execute read ahead data pages
+  *
+  * Before we actually read ahead data pages, sort the XLogReadAhead in the queue
+  * for avoiding duplicated disk access and hopefully, reducing seek time.
+  * We also skip read ahead of data pages whose entry has a full page write,
+  * and of entries that name the same page as the entry just before them in
+  * sorted order.
+  *
+  * smgropen() is called for every page that is actually prefetched.
+  * NOTE(review): whether that keeps the underlying file open between
+  * consecutive pages of one relation depends on smgropen() -- confirm.
+  *
+  * The queue is empty on return.
+  */
+ void
+ ReadAheadExecute(void)
+ {
+ 	uint32		i;
+ 	XLogReadAhead last_entry = { { 0, 0, 0, }, 0, 0, false };
+ 
+ 	ereport(DEBUG1, (errmsg("%u blocks are prefetch candidate",
+ 							ReadAheadQueueUsed)));
+ 
+ 	/* Nothing queued (or queue not allocated yet): nothing to sort or read. */
+ 	if (ReadAheadQueueUsed == 0)
+ 		return;
+ 
+ 	/* Sort the XLogReadAhead queue for effective disk access. */
+ 	qsort(ReadAheadQueue, ReadAheadQueueUsed, sizeof(XLogReadAhead),
+ 		  ReadAheadCompare);
+ 
+ 	for (i = 0; i < ReadAheadQueueUsed; i++)
+ 	{
+ 		XLogReadAhead *entry = &ReadAheadQueue[i];
+ 		bool		skip;
+ 
+ 		/* Do read ahead once per a page if it doesn't have full page write. */
+ 		skip = IS_SAME_PAGE(last_entry, *entry) || entry->has_fpw;
+ 
+ 		/* Remember this entry so duplicates that follow it are skipped. */
+ 		last_entry = *entry;
+ 
+ 		if (skip)
+ 			continue;
+ 
+ 		/* Create SMgrRelation object and read ahead with prefetch API */
+ 		smgrprefetch(smgropen(entry->node), MAIN_FORKNUM, entry->blkno);
+ 	}
+ 	ReadAheadQueueUsed = 0;
+ }
+ 
+ /*
+  * Compare two XLogReadAhead objects
+  *
+  * When l > r, then return 1, l == r, then return 0, and l < r, then return -1.
+  * The priority of comparison clauses shows below;
+  *	1. node.spcNode
+  *	2. node.dbNode
+  *	3. node.relNode
+  *	4. blkno
+  *	5.
xrecoff
+  *
+  * Used as the qsort() comparator in ReadAheadExecute(); the arguments are
+  * only read, so they are accessed through const pointers.
+  */
+ static int
+ ReadAheadCompare(const void *l, const void *r)
+ {
+ 	const XLogReadAhead *left = (const XLogReadAhead *) l;
+ 	const XLogReadAhead *right = (const XLogReadAhead *) r;
+ 
+ 	/* compare node.spcNode */
+ 	if (left->node.spcNode > right->node.spcNode)
+ 		return 1;
+ 	else if (left->node.spcNode < right->node.spcNode)
+ 		return -1;
+ 
+ 	/* compare node.dbNode */
+ 	if (left->node.dbNode > right->node.dbNode)
+ 		return 1;
+ 	else if (left->node.dbNode < right->node.dbNode)
+ 		return -1;
+ 
+ 	/* compare node.relNode */
+ 	if (left->node.relNode > right->node.relNode)
+ 		return 1;
+ 	else if (left->node.relNode < right->node.relNode)
+ 		return -1;
+ 
+ 	/* compare blkno */
+ 	if (left->blkno > right->blkno)
+ 		return 1;
+ 	else if (left->blkno < right->blkno)
+ 		return -1;
+ 
+ 	/* compare xrecoff */
+ 	if (left->xrecoff > right->xrecoff)
+ 		return 1;
+ 	else if (left->xrecoff < right->xrecoff)
+ 		return -1;
+ 
+ 	/* These two XLogReadAhead are same. */
+ 	return 0;
+ }
+ 
+ /*
+  * Release ReadAheadQueue buffer
+  *
+  * Also clears the queue pointer and the used count, so that nothing is left
+  * pointing at freed memory and a second call is harmless.
+  */
+ void
+ ReadAheadFinish(void)
+ {
+ 	free(ReadAheadQueue);		/* free(NULL) is a no-op */
+ 	ReadAheadQueue = NULL;
+ 	ReadAheadQueueUsed = 0;
+ }
diff -rcN postgresql-CVS-01-21.org/src/backend/access/transam/rmgr.c postgresql-CVS-01-21/src/backend/access/transam/rmgr.c
*** postgresql-CVS-01-21.org/src/backend/access/transam/rmgr.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/transam/rmgr.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 24,43 ****
  
  
  const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! 	{"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL},
! 	{"Transaction", xact_redo, xact_desc, NULL, NULL, NULL},
! 	{"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL},
! 	{"CLOG", clog_redo, clog_desc, NULL, NULL, NULL},
! 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL},
! 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
! 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
! 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
! 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
! 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
! 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
! 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
! 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
! 	{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint},
! 	{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint},
! 	{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}
  };
--- 24,43 ----
  
  
  const RmgrData RmgrTable[RM_MAX_ID + 1] = {
! 	{"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL, NULL},
! 	{"Transaction", xact_redo, xact_desc, NULL, NULL, NULL, NULL},
! 	{"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL, NULL},
! 	{"CLOG", clog_redo, clog_desc, NULL, NULL, NULL, NULL},
! 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL, NULL},
! 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL, NULL},
! 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL, NULL},
! 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL, NULL},
! 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL, NULL},
! 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL, heap2_readahead},
! 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL, heap_readahead},
! 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint, btree_readahead},
! 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL, NULL},
! 	{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint, gin_readahead},
! 	{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint, gist_readahead},
! 	{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL, seq_readahead}
  };
diff -rcN postgresql-CVS-01-21.org/src/backend/access/transam/xlog.c postgresql-CVS-01-21/src/backend/access/transam/xlog.c
*** postgresql-CVS-01-21.org/src/backend/access/transam/xlog.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/access/transam/xlog.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 382,387 ****
--- 382,399 ----
  static char *readRecordBuf = NULL;
  static uint32 readRecordBufSize = 0;
  
+ /*
+  * Buffer for queued WAL records (fixed size)
+  *
+  * This buffer is used for holding WAL records and their LSNs. When the all WAL
+  * records of one WAL segment file are read, redo them and make the buffer
+  * empty. Therefore, twice of XLogSegSize, determined by the total size of WAL
+  * records and LSNs, must be enough for the buffer.
+  */
+ #define RECORD_QUEUE_BUF_SIZE (XLogSegSize * 2)
+ static char *RecordQueueBuf = NULL;
+ static uint32 RecordQueueBufUsed = 0;
+ 
  /* State information for XLOG reading */
  static XLogRecPtr ReadRecPtr;	/* start of last record read */
  static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
***************
*** 442,447 ****
--- 454,462 ----
  static void rm_redo_error_callback(void *arg);
  static int	get_sync_bit(int method);
  
+ static void PushRecord(XLogRecPtr lsn, XLogRecord *record);
+ static void PushReadAhead(XLogRecPtr lsn, XLogRecord *record);
+ static void RedoRecords(void);
  
  /*
   * Insert an XLOG record having the specified RMID and info bytes,
***************
*** 2365,2370 ****
--- 2380,2387 ----
  	ListCell   *cell;
  	int			fd;
  
+ 	ereport(DEBUG1, (errmsg("XLOG switch to %X/%X", log, seg)));
+ 
  	/*
  	 * Loop looking for a suitable timeline ID: we might need to read any of
  	 * the timelines listed in expectedTLIs.
***************
*** 2386,2391 ****
--- 2403,2415 ----
  
  		if (InArchiveRecovery)
  		{
+ 			/*
+ 			 * Wait until next WAL segment file. It might takes long time.
+ 			 * Therefore, redo with stored WAL records and LSNs here.
+ 			 */
+ 			ereport(DEBUG1, (errmsg("XLOG will be switched")));
+ 			RedoRecords();
+ 
  			/* Report recovery progress in PS display */
  			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
  					 xlogfname);
***************
*** 3424,3429 ****
--- 3448,3460 ----
  	return (XLogRecord *) buffer;
  
  next_record_is_invalid:;
+ 	/*
+ 	 * Reached to unused area of current WAL segment file, redo all of WAL
+ 	 * records in the queue.
+ 	 */
+ 	ereport(DEBUG1, (errmsg("next record is invalid(maybe unused area)")));
+ 	RedoRecords();
+ 
  	if (readFile >= 0)
  	{
  		close(readFile);
***************
*** 4941,4946 ****
--- 4972,4988 ----
  	ValidateXLOGDirectoryStructure();
  
  	/*
+ 	 * To postpone the actual redo, store WAL records and EndRecPtrs.
+ 	 * Therefore, this buffer must be allocated here because the buffer
+ 	 * will be used by RedoRecords(); ReadRecord() may call RedoRecords().
+ 	 *
+ 	 * Allocation failure is reported explicitly: an Assert is compiled out
+ 	 * of production builds, and PushRecord() would then copy into NULL.
+ 	 */
+ 	RecordQueueBuf = (char *) malloc(RECORD_QUEUE_BUF_SIZE);
+ 	if (RecordQueueBuf == NULL)
+ 		ereport(FATAL,
+ 				(errcode(ERRCODE_OUT_OF_MEMORY),
+ 				 errmsg("out of memory")));
+ 
+ 	/* Allocate the buffer for storing information about data pages. */
+ 	ReadAheadInit();
+ 
+ 	/*
  	 * Initialize on the assumption we want to recover to the same timeline
  	 * that's active according to pg_control.
  	 */
***************
*** 5154,5160 ****
  		{
  			bool		recoveryContinue = true;
  			bool		recoveryApply = true;
- 			ErrorContextCallback errcontext;
  
  			InRedo = true;
  			ereport(LOG,
--- 5196,5201 ----
***************
*** 5196,5225 ****
  					break;
  				}
  
! 				/* Setup error traceback support for ereport() */
! 				errcontext.callback = rm_redo_error_callback;
! 				errcontext.arg = (void *) record;
! 				errcontext.previous = error_context_stack;
! 				error_context_stack = &errcontext;
! 
! 				/* nextXid must be beyond record's xid */
! 				if (TransactionIdFollowsOrEquals(record->xl_xid,
! 												 ShmemVariableCache->nextXid))
! 				{
! 					ShmemVariableCache->nextXid = record->xl_xid;
! 					TransactionIdAdvance(ShmemVariableCache->nextXid);
! 				}
! 
! 				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
! 
! 				/* Pop the error context stack */
! 				error_context_stack = errcontext.previous;
  
  				LastRec = ReadRecPtr;
  
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
  			/*
  			 * end of main redo apply loop
  			 */
--- 5237,5270 ----
  					break;
  				}
  
! 				/*
! 				 * Push WAL record in WAL record buffer with its LSN for
! 				 * delayed redo.
! 				 * If the WAL record queue is full, redo all WAL records in the
! 				 * queue and make the queue empty.
! 				 */
! 				ereport(DEBUG1,
! 					(errmsg("WAL record queue is used %u(%u) bytes at %X/%08X.",
! 							RecordQueueBufUsed, record->xl_tot_len,
! 							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
! 				PushRecord(EndRecPtr, record);
! 
! 				/*
! 				 * Push page information to prefetch later.
! 				 * If no more space, redo all records in queue and make the
! 				 * queue empty.
! 				 */
! 				PushReadAhead(EndRecPtr, record);
  
  				LastRec = ReadRecPtr;
  
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
+ 			/* All WAL records are read, redo all queued WAL records. */
+ 			ereport(DEBUG1, (errmsg("end of redo apply loop")));
+ 			RedoRecords();
+ 
  			/*
  			 * end of main redo apply loop
  			 */
***************
*** 5441,5446 ****
--- 5486,5587 ----
  		readRecordBuf = NULL;
  		readRecordBufSize = 0;
  	}
+ 	if (RecordQueueBuf)
+ 	{
+ 		free(RecordQueueBuf);
+ 		/* do not leave a pointer to freed memory behind */
+ 		RecordQueueBuf = NULL;
+ 		RecordQueueBufUsed = 0;
+ 		ReadAheadFinish();
+ 	}
+ }
+ 
+ /*
+  * Push the pair of WAL record and its LSN.
+  * Both WAL records and LSNs are aligned as same as WAL segment file.
+  *
+  * Each queued item is the LSN, padded to MAXALIGN(sizeof(XLogRecPtr)),
+  * followed by the record image, padded to MAXALIGN(xl_tot_len).  When the
+  * new item does not fit behind the items already queued, the queue is
+  * redone (and thereby emptied) first.  A record that cannot fit even in an
+  * empty queue is reported with ereport(FATAL) instead of being copied past
+  * the end of the buffer.
+  */
+ static void
+ PushRecord(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	Size		needed;
+ 
+ 	Assert(record);
+ 
+ 	needed = MAXALIGN(sizeof(XLogRecPtr)) + MAXALIGN(record->xl_tot_len);
+ 
+ 	if (needed > RECORD_QUEUE_BUF_SIZE)
+ 		ereport(FATAL,
+ 				(errmsg("WAL record of length %u does not fit in the WAL record queue",
+ 						record->xl_tot_len)));
+ 
+ 	if (RecordQueueBufUsed + needed > RECORD_QUEUE_BUF_SIZE)
+ 	{
+ 		ereport(DEBUG1, (errmsg("WAL record queue is full.")));
+ 		RedoRecords();
+ 	}
+ 
+ 	memcpy(RecordQueueBuf + RecordQueueBufUsed, &lsn, sizeof(XLogRecPtr));
+ 	RecordQueueBufUsed += MAXALIGN(sizeof(XLogRecPtr));
+ 	memcpy(RecordQueueBuf + RecordQueueBufUsed, record, record->xl_tot_len);
+ 	RecordQueueBufUsed += MAXALIGN(record->xl_tot_len);
+ }
+ 
+ /*
+  * Push page information to readahead module.
+  *
+  * Does nothing for resource managers without an rm_readahead callback.
+  * While the callback returns false (readahead queue full), all queued WAL
+  * records are redone, which also empties the readahead queue, and the
+  * callback is tried again.
+  */
+ static void
+ PushReadAhead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	Assert(record);
+ 
+ 	if (!RmgrTable[record->xl_rmid].rm_readahead)
+ 		return;
+ 
+ 	while (!RmgrTable[record->xl_rmid].rm_readahead(lsn, record))
+ 	{
+ 		ereport(DEBUG1, (errmsg("ReadAhead queue is full.")));
+ 		RedoRecords();
+ 	}
+ }
+ 
+ /*
+  * Redo all WAL records in the queue and make it empty.
+  *
+  * The queued data pages are prefetched first; then every (LSN, record) pair
+  * stored by PushRecord() is handed to its resource manager's rm_redo
+  * callback, in the order the pairs were queued.
+  */
+ static void
+ RedoRecords(void)
+ {
+ 	ErrorContextCallback errcontext;
+ 	uint32		off = 0;
+ 
+ 	/* Readahead data pages which will be modified during redo. */
+ 	ReadAheadExecute();
+ 
+ 	while (off < RecordQueueBufUsed)
+ 	{
+ 		XLogRecPtr	lsn;
+ 		XLogRecord *record;
+ 
+ 		/* Extract LSN and WAL record image from local buffer. */
+ 		memcpy(&lsn, RecordQueueBuf + off, sizeof(XLogRecPtr));
+ 		off += MAXALIGN(sizeof(XLogRecPtr));
+ 		record = (XLogRecord *)(RecordQueueBuf + off);
+ 
+ 		/* Setup error traceback support for ereport() */
+ 		errcontext.callback = rm_redo_error_callback;
+ 		errcontext.arg = (void *) record;
+ 		errcontext.previous = error_context_stack;
+ 		error_context_stack = &errcontext;
+ 
+ 		/* nextXid must be beyond record's xid */
+ 		if (TransactionIdFollowsOrEquals(record->xl_xid,
+ 										 ShmemVariableCache->nextXid))
+ 		{
+ 			ShmemVariableCache->nextXid = record->xl_xid;
+ 			TransactionIdAdvance(ShmemVariableCache->nextXid);
+ 		}
+ 
+ 		/* Redo with WAL record and its LSN. */
+ 		RmgrTable[record->xl_rmid].rm_redo(lsn, record);
+ 
+ 		/* Pop the error context stack */
+ 		error_context_stack = errcontext.previous;
+ 
+ 		off += MAXALIGN(record->xl_tot_len);
+ 	}
+ 
+ 	/*
+ 	 * Make RecordQueueBuf empty.  Resetting the used count is sufficient:
+ 	 * bytes beyond it are never read, so the buffer is not cleared.
+ 	 */
+ 	RecordQueueBufUsed = 0;
  }
  
  /*
diff -rcN postgresql-CVS-01-21.org/src/backend/commands/sequence.c postgresql-CVS-01-21/src/backend/commands/sequence.c
*** postgresql-CVS-01-21.org/src/backend/commands/sequence.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/commands/sequence.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 17,22 ****
--- 17,23 ----
  #include "access/heapam.h"
  #include "access/transam.h"
  #include "access/xact.h"
+ #include "access/xlog.h"
  #include "access/xlogutils.h"
  #include "catalog/dependency.h"
  #include "catalog/namespace.h"
***************
*** 1385,1387 ****
--- 1386,1418 ----
  	appendStringInfo(buf, "rel %u/%u/%u",
  			   xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
  }
+ 
+ /*
+  * seq_readahead - enqueue information about data pages
+  *
+  * The readahead module stores information about pages that are modified through
+  * redo-ing record.
+  *
+  * For XLOG_SEQ_LOG, block 0 of the sequence relation is queued.  Returns
+  * false, queueing nothing, when ReadAheadHasRoom() reports no room for it;
+  * returns true in every other case.
+  */
+ bool
+ seq_readahead(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	Assert(record);
+ 
+ 	switch (info)
+ 	{
+ 		case XLOG_SEQ_LOG:
+ 			{
+ 				xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record);
+ 
+ 				if (!ReadAheadHasRoom(1))
+ 					return false;
+ 				ReadAheadAddEntry(xlrec->node, 0, lsn.xrecoff, false);
+ 				break;
+ 			}
+ 	}
+ 
+ 	return true;
+ }
diff -rcN postgresql-CVS-01-21.org/src/backend/storage/smgr/md.c postgresql-CVS-01-21/src/backend/storage/smgr/md.c
*** postgresql-CVS-01-21.org/src/backend/storage/smgr/md.c	2009-01-21 10:39:04.000000000 +0900
--- postgresql-CVS-01-21/src/backend/storage/smgr/md.c	2009-01-21 10:35:03.000000000 +0900
***************
*** 560,566 ****
  	off_t		seekpos;
  	MdfdVec    *v;
  
! 	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
  
  	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
  	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
--- 560,568 ----
  	off_t		seekpos;
  	MdfdVec    *v;
  
! 
v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_RETURN_NULL); ! if (!v) ! return; /* _mdfd_getseg returned NULL under EXTENSION_RETURN_NULL: nothing to do for this block */ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); diff -rcN postgresql-CVS-01-21.org/src/include/access/gin.h postgresql-CVS-01-21/src/include/access/gin.h *** postgresql-CVS-01-21.org/src/include/access/gin.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/access/gin.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 256,261 **** --- 256,262 ---- /* ginxlog.c */ extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); extern void gin_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool gin_readahead(XLogRecPtr lsn, XLogRecord *record); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); extern bool gin_safe_restartpoint(void); diff -rcN postgresql-CVS-01-21.org/src/include/access/gist_private.h postgresql-CVS-01-21/src/include/access/gist_private.h *** postgresql-CVS-01-21.org/src/include/access/gist_private.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/access/gist_private.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 250,255 **** --- 250,256 ---- /* gistxlog.c */ extern void gist_redo(XLogRecPtr lsn, XLogRecord *record); extern void gist_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool gist_readahead(XLogRecPtr lsn, XLogRecord *record); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); extern bool gist_safe_restartpoint(void); diff -rcN postgresql-CVS-01-21.org/src/include/access/heapam.h postgresql-CVS-01-21/src/include/access/heapam.h *** postgresql-CVS-01-21.org/src/include/access/heapam.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/access/heapam.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 124,131 **** --- 124,133 ---- extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap_desc(StringInfo buf, uint8 
xl_info, char *rec); + extern bool heap_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool heap2_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from, diff -rcN postgresql-CVS-01-21.org/src/include/access/nbtree.h postgresql-CVS-01-21/src/include/access/nbtree.h *** postgresql-CVS-01-21.org/src/include/access/nbtree.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/access/nbtree.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 591,596 **** --- 591,597 ---- */ extern void btree_redo(XLogRecPtr lsn, XLogRecord *record); extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool btree_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern void btree_xlog_startup(void); extern void btree_xlog_cleanup(void); extern bool btree_safe_restartpoint(void); diff -rcN postgresql-CVS-01-21.org/src/include/access/xlog.h postgresql-CVS-01-21/src/include/access/xlog.h *** postgresql-CVS-01-21.org/src/include/access/xlog.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/access/xlog.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 14,20 **** --- 14,23 ---- #include "access/rmgr.h" #include "access/xlogdefs.h" #include "lib/stringinfo.h" + #include "postgres.h" /* NOTE(review): PostgreSQL headers normally leave postgres.h to the including .c file - confirm this include is needed */ #include "storage/buf.h" + #include "storage/relfilenode.h" + #include "storage/block.h" #include "utils/pg_crc.h" #include "utils/timestamp.h" *************** *** 198,203 **** --- 201,207 ---- extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool xlog_readahead(XLogRecPtr lsn, XLogRecord *rptr); extern void UpdateControlFile(void); extern Size XLOGShmemSize(void); *************** *** 212,215 **** --- 216,227 ---- extern XLogRecPtr GetInsertRecPtr(void); 
extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); + /* Implemented in readahead.c. */ + extern void ReadAheadInit(void); + extern void ReadAheadAddEntry(RelFileNode node, BlockNumber blkno, + uint32 xrecoff, bool has_fpw); + extern bool ReadAheadHasRoom(int num); /* callers check this before adding num entries */ + extern void ReadAheadExecute(void); + extern void ReadAheadFinish(void); + #endif /* XLOG_H */ diff -rcN postgresql-CVS-01-21.org/src/include/access/xlog_internal.h postgresql-CVS-01-21/src/include/access/xlog_internal.h *** postgresql-CVS-01-21.org/src/include/access/xlog_internal.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/access/xlog_internal.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 235,240 **** --- 235,241 ---- void (*rm_startup) (void); void (*rm_cleanup) (void); bool (*rm_safe_restartpoint) (void); + bool (*rm_readahead) (XLogRecPtr lsn, XLogRecord *rptr); /* may be NULL; returns false when the readahead queue has no room */ } RmgrData; extern const RmgrData RmgrTable[]; diff -rcN postgresql-CVS-01-21.org/src/include/commands/sequence.h postgresql-CVS-01-21/src/include/commands/sequence.h *** postgresql-CVS-01-21.org/src/include/commands/sequence.h 2009-01-21 10:39:04.000000000 +0900 --- postgresql-CVS-01-21/src/include/commands/sequence.h 2009-01-21 10:35:02.000000000 +0900 *************** *** 98,102 **** --- 98,103 ---- extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool seq_readahead(XLogRecPtr lsn, XLogRecord *record); #endif /* SEQUENCE_H */