diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index bbd90c9..2edc8f6 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -132,8 +132,37 @@ BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded) { uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded); + BlockNumber target_block, nblocks; - return fsm_search(rel, min_cat); + target_block = fsm_search(rel, min_cat); + + /* + * The callers are usually equipped to handle cases where FSM + * information is not accurate and the returned block contains less free + * space than the caller expected. But if FSM returns a block which is + * beyond the current size of the relation, then that's a critical + * situation. If such a situation arises because of say some corruption in + * FSM, the database may get into unrecoverable state. So instead of + * returning a wrong block, we keep trying until we find a block within a + * valid range or no block is found. To ensure, we don't create infinite + * recursion, we update FSM for each block found outside the valid range so + * that the same block is not visited twice. + * + * RelationGetNumberOfBlocks() call is not exactly cheap, but there are + * other optimizations to minimize calls to GetPageWithFreeSpace() and + * hence this should not be a big drag on performance. + * + * Keep retrying until FSM hands out a block outside the range or no + * suitable block is found. + */ + nblocks = RelationGetNumberOfBlocks(rel); + while (target_block >= nblocks && target_block != InvalidBlockNumber) + { + target_block = RecordAndGetPageWithFreeSpace(rel, target_block, 0, + spaceNeeded); + } + + return target_block; } /* @@ -154,6 +183,7 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, FSMAddress addr; uint16 slot; int search_slot; + BlockNumber target_block, nblocks; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(oldPage, &slot); @@ -165,9 +195,22 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, * Otherwise, search as usual. */ if (search_slot != -1) - return fsm_get_heap_blk(addr, search_slot); + target_block = fsm_get_heap_blk(addr, search_slot); else - return fsm_search(rel, search_cat); + target_block = fsm_search(rel, search_cat); + + /* + * See comments in GetPageWithFreeSpace about handling outside the valid + * range blocks + */ + nblocks = RelationGetNumberOfBlocks(rel); + while (target_block >= nblocks && target_block != InvalidBlockNumber) + { + target_block = RecordAndGetPageWithFreeSpace(rel, target_block, 0, + spaceNeeded); + } + + return target_block; } /* @@ -328,7 +371,32 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) return; /* nothing to do; the FSM was already smaller */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); - MarkBufferDirtyHint(buf, false); + + /* + * It's not enough to just use a hint here because + * MarkBufferDirtyHint() may not mark the buffer as DIRTY on a standby. + * Failure to write the buffer to the permanent storage may result in + * FSM corruption. Now typically FSM corruption in terms of wrong + * reporting of available free space is tolerable and callers are + * prepared to handle those cases, but if FSM returns a block outside + * the valid range of the main relation, then that's a catastrophic + * error and may result in unusable database. So we must ensure that + * FSM truncation is always guaranteed with main relation truncation. + * + * Note that there is no separate WAL logging for this operation, but + * this is covered by XLOG_SMGR_TRUNCATE and smgr_redo() will take care + * of truncating the FSM during replay, if necessary. If a CHECKPOINT + * occurs in between, marking the FSM buffer dirty will guarantee that + * the buffer is written to disk. + * + * Note: To ensure that the system can cope with FSM corruption that + * may have already happened before we fixed this bug, we also ensure + * that RecordPageWithFreeSpace and RecordAndGetPageWithFreeSpace + * checks for main relation size and never returns a block beyond the + * valid range. + */ + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;